In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# Load the further cleaned data
df = pd.read_csv('further_cleaned_data.csv')

# Replace missing tweets with an empty string FIRST, then make every entry a str.
# Order matters: casting first can turn NaN into the literal text 'nan'
# (pandas versions where astype(str) stringifies missing values), after which
# fillna('') has nothing left to fill and 'nan' would be counted as a word.
df['tweet'] = df['tweet'].fillna('').astype(str)

In [None]:
# Display basic information and first few rows.
# df.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.head())

In [None]:
# Tally how many rows carry each value of the 'class' column and show the result
label_counts = df['class'].value_counts()
print("Distribution of labels:", label_counts, sep="\n")

In [None]:
# Bar chart of the per-class counts; labels are set on the Axes that .plot returns
label_counts.plot(kind='bar').set(
    xlabel='Class',
    ylabel='Count',
    title='Distribution of Labels',
)
plt.show()

In [None]:
# Function to get the top words in each class
def get_top_words(texts, n=10):
    """Return the ``n`` most frequent whitespace-separated words in ``texts``.

    Args:
        texts: Iterable of strings (for example a pandas Series of tweets).
            Entries that are not ``str`` -- such as ``None`` or a float NaN
            left behind by a missing value -- are skipped instead of raising.
        n: Number of ``(word, count)`` pairs to return; passed straight to
            ``Counter.most_common``.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Words with
        equal counts keep the order in which they were first seen.
    """
    word_counts = Counter()
    for text in texts:
        # Skip missing / non-text entries so one bad row cannot abort the tally.
        if isinstance(text, str):
            # Counting text by text avoids building one giant joined string
            # and one giant token list for the whole corpus.
            word_counts.update(text.split())
    return word_counts.most_common(n)

# Report the most frequent words among the tweets of each distinct class label
for label in df['class'].unique():
    # Select only the tweets whose class matches the current label
    class_tweets = df.loc[df['class'] == label, 'tweet']
    frequent = get_top_words(class_tweets)
    # Header, the (word, count) list, then a blank separator line
    print(f"Top words in class {label}:", frequent, "", sep="\n")
