In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import warnings
import numpy as np

# Force display of warnings: the 'always' action reports every matching
# warning each time it is triggered instead of only once per location.
warnings.filterwarnings('always')

def clean_tokens(tokens):
    """Normalise one raw ``tokens`` cell into a space-separated string.

    Args:
        tokens: The cell value. May be a string, a float NaN, an iterable
            of tokens, or any other object.

    Returns:
        ``tokens`` unchanged if it is already a string; ``''`` if it is a
        float NaN or is not iterable; otherwise the ``str()`` of each
        element joined with single spaces, skipping elements whose string
        form is ``'nan'``.
    """
    if isinstance(tokens, str):
        return tokens
    if isinstance(tokens, float) and np.isnan(tokens):
        return ''
    try:
        return ' '.join(str(token) for token in tokens if str(token) != 'nan')
    except TypeError:
        # Non-iterable values (None, ints, non-NaN floats, ...) carry no
        # tokens.  Only TypeError is caught so that KeyboardInterrupt,
        # SystemExit and genuine bugs are no longer silently swallowed.
        return ''

def plot_word_cloud(tokens, title):
    """Render a word cloud from ``tokens`` and save it as a PNG.

    Args:
        tokens: Text passed to ``WordCloud.generate``.
        title: Figure title. The output filename is the lower-cased title
            with spaces replaced by underscores, plus ``.png``.
    """
    print(f"Generating word cloud for {title}...")

    # Build the 800x400 cloud on a white background.
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(tokens)

    # Draw it on a fresh figure with the axes hidden.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)

    output_name = f'{title.lower().replace(" ", "_")}.png'
    plt.savefig(output_name)
    plt.close()

    print(f"Word cloud for {title} saved.")

def plot_word_frequency(tokens, title, num_words=20):
    """Plot a bar chart of the most frequent words and save it as a PNG.

    Args:
        tokens: Whitespace-separated text; it is split with ``str.split()``
            and the resulting words are counted.
        title: Figure title. The output filename is the lower-cased title
            with spaces replaced by underscores, plus ``.png``.
        num_words: How many of the most common words to show.
    """
    print(f"Generating word frequency plot for {title}...")
    word_freq = Counter(tokens.split())
    common_words = word_freq.most_common(num_words)

    if common_words:
        words, counts = zip(*common_words)
    else:
        # zip(*[]) yields nothing, so unpacking it into two names raises
        # ValueError.  Fall back to an empty chart when there are no words
        # (empty/whitespace-only text, or num_words <= 0).
        words, counts = (), ()

    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.xticks(rotation=90)
    plt.title(title)
    plt.savefig(f'{title.lower().replace(" ", "_")}.png')
    plt.close()
    print(f"Word frequency plot for {title} saved.")

def main():
    """Run the exploratory analysis and save every plot as a PNG.

    Loads the tokenized CSV, normalises the ``tokens`` column, then writes
    word clouds and word-frequency charts for positive (``target == 4``)
    and negative (``target == 0``) tweets, the sentiment distribution, and
    the tweet-length histogram.
    """
    print("Starting script...")
    print("Loading data...")
    # Read the CSV file
    df = pd.read_csv('../data/processed/cleaned_tokenized_sentiment140.csv')
    print("Data loaded. Shape:", df.shape)
    print(df.head())

    # Clean tokens
    df['tokens'] = df['tokens'].apply(clean_tokens)

    # Separate positive and negative tweets
    print("Separating positive and negative tweets...")
    positive_tweets = ' '.join(df[df['target'] == 4]['tokens'].dropna())
    negative_tweets = ' '.join(df[df['target'] == 0]['tokens'].dropna())
    print("Tweets separated.")

    # Plot word cloud for positive tweets
    plot_word_cloud(positive_tweets, 'Most Common Words in Positive Tweets')

    # Plot word cloud for negative tweets
    plot_word_cloud(negative_tweets, 'Most Common Words in Negative Tweets')

    # Plot word frequency for positive tweets
    plot_word_frequency(positive_tweets, 'Top 20 Words in Positive Tweets')

    # Plot word frequency for negative tweets
    plot_word_frequency(negative_tweets, 'Top 20 Words in Negative Tweets')

    # Distribution of sentiments
    print("Plotting sentiment distribution...")
    # value_counts() sorts by descending count, so the bar order would depend
    # on which class is larger while the tick labels below are fixed.  Pin
    # the order to [negative (0), positive (4)] so the labels always match.
    sentiment_counts = df['target'].value_counts().reindex([0, 4], fill_value=0)
    sentiment_counts.plot(kind='bar', title='Distribution of Sentiments')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.xticks(ticks=[0, 1], labels=['Negative', 'Positive'], rotation=0)
    plt.savefig('sentiment_distribution.png')
    plt.close()
    print("Sentiment distribution plot saved.")

    # Length of tweets (number of whitespace-separated tokens per row)
    print("Analyzing tweet lengths...")
    df['tweet_length'] = df['tokens'].apply(lambda x: len(str(x).split()))
    df['tweet_length'].hist(bins=50)
    plt.title('Distribution of Tweet Length')
    plt.xlabel('Tweet Length')
    plt.ylabel('Frequency')
    plt.savefig('tweet_length_distribution.png')
    plt.close()
    print("Tweet length distribution plot saved.")

    print("Script completed.")

# Run the analysis only when this module is executed directly, not on import.
if __name__ == '__main__':
    main()

Starting script...
Loading data...
Data loaded. Shape: (100000, 8)
   target         ids                          date      flag  \
0       0  2200003196  Tue Jun 16 18:18:12 PDT 2009  NO_QUERY   
1       0  1467998485  Mon Apr 06 23:11:14 PDT 2009  NO_QUERY   
2       0  2300048954  Tue Jun 23 13:40:11 PDT 2009  NO_QUERY   
3       0  1993474027  Mon Jun 01 10:26:07 PDT 2009  NO_QUERY   
4       0  2256550904  Sat Jun 20 12:56:51 PDT 2009  NO_QUERY   

              user                                               text  \
0  LaLaLindsey0609             @chrishasboobs AHHH I HOPE YOUR OK!!!    
1      sexygrneyes  @misstoriblack cool , i have no tweet apps  fo...   
2       sammydearr  @TiannaChaos i know  just family drama. its la...   
3      Lamb_Leanne  School email won't open  and I have geography ...   
4      yogicerdito                             upper airways problem    

                                        cleaned_text  \
0                               ahhh i hope you