In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install wordninja
!pip install pyspellchecker

In [None]:
import re
import random
import math
from tqdm.notebook import tqdm
from collections import Counter


from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt 

import wordninja
from spellchecker import SpellChecker
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
# NLTK's English stop words plus the extra token "amp"
# (presumably residue of the HTML entity "&amp;" in tweet text - confirm against the data).
stop_words = set(stopwords.words('english')) | {"amp"}

# Preprocessing

In [None]:
# data = pd.read_csv("/kaggle/input/80000-tweets-from-us-capitol-riotsjan-6-2021/tweets.csv")
data = pd.read_csv("/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv")

# data contains one non-string entry for 'text'
# Keep only the rows whose 'text' value is a real string.
str_mask = [isinstance(x, str) for x in data.text]
# .copy() makes the filtered frame an independent object, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
data = data[str_mask].copy()

In [None]:
# standard tweet preprocessing

def clean_tweet_text(text):
    """Normalise one raw tweet into lowercase, space-separated word tokens.

    Steps, in order: lowercase, drop @handles, drop #hashtags, drop URLs,
    keep only ``\\w+`` tokens, drop isolated single letters, collapse whitespace.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned tweet as a single ``str``.
    """
    text = text.lower()
    # Remove twitter handles (raw string avoids the invalid "\s" escape warning)
    text = re.sub(r'@[^\s]+', '', text)
    # Remove hashtags
    text = re.sub(r'\B#\S+', '', text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove all the special characters
    text = ' '.join(re.findall(r'\w+', text))
    # Remove single letters that stand alone between two spaces. Replace the
    # match with a space: substituting '' would glue the neighbouring words
    # together ("is a test" -> "istest").
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Substitute multiple spaces with a single space
    return re.sub(r'\s+', ' ', text)


data['text'] = data['text'].apply(clean_tweet_text)
# Convert string to a list of words
data['words'] = data['text'].apply(lambda x: re.findall(r'\w+', x))

# VADER Sentiment Analysis 
Refer to [this notebook](https://www.kaggle.com/pawanbhandarkar/training-a-sith-lord) to understand why I selected 0.35 and -0.05 as the threshold values. Each tweet is then assigned to a class based on where its compound sentiment score lies with respect to these thresholds.

- sentiment['compound'] < -0.05 => Negative (-1)
- -0.05 <= sentiment['compound'] <= 0.35 => Neutral (0)
- sentiment['compound'] > 0.35 => Positive (1)


In [None]:
# Helper functions 
def get_sign(x, p, n):
    """Map a score to a class label using two thresholds.

    Args:
        x: The score to classify.
        p: Upper threshold; scores strictly above it are labelled 1.
        n: Lower threshold; scores strictly below it are labelled -1.

    Returns:
        1, -1 or 0. The upper threshold is checked first; a score that is
        neither above ``p`` nor below ``n`` (including a score equal to
        either threshold) gets 0.
    """
    if x > p:
        label = 1
    elif x < n:
        label = -1
    else:
        label = 0
    return label

def flatten_list(l):
    """Concatenate the items of every inner iterable of ``l`` into one flat list.

    Only one level of nesting is removed, and the original order is kept.
    """
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat

In [None]:
sia = SIA()

# Score every preprocessed tweet and keep only the 'compound' value.
sentiments = []
for tweet_text in tqdm(data['text']):
    sentiments.append(sia.polarity_scores(tweet_text)['compound'])

# Turn each compound score into a class label (-1, 0 or 1) using the
# 0.35 / -0.05 thresholds, then store the labels alongside the tweets.
classes = list(map(lambda score: get_sign(score, 0.35, -0.05), sentiments))
data['classes'] = classes

In [None]:
def is_acceptable(word: str):
    """Return True when ``word`` is not a stop word and is longer than two characters."""
    if word in stop_words:
        return False
    return len(word) > 2

In [None]:
# Build one flat word list ("document") per sentiment class, keeping only
# the words that pass is_acceptable.
neg_doc = [
    word
    for word in flatten_list(data.loc[data['classes'] == -1, 'words'])
    if is_acceptable(word)
]

pos_doc = [
    word
    for word in flatten_list(data.loc[data['classes'] == +1, 'words'])
    if is_acceptable(word)
]

neu_doc = [
    word
    for word in flatten_list(data.loc[data['classes'] == 0, 'words'])
    if is_acceptable(word)
]

In [None]:
# color coding our wordclouds 
def red_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
    """Word-cloud colour callback returning a red with a random lightness of 25-75%.

    Args:
        word, font_size, position, orientation: Supplied by the word-cloud
            library for the word being drawn; not used here.
        random_state: Optional generator exposing ``randint(a, b)``
            (e.g. ``random.Random``). When given it is used for the draw, so a
            seeded generator yields reproducible colours; otherwise the global
            ``random`` module is used.
        **kwargs: Any further keyword arguments are accepted and ignored.

    Returns:
        An ``hsl(...)`` colour string.
    """
    rng = random if random_state is None else random_state
    return f"hsl(0, 100%, {rng.randint(25, 75)}%)"

def green_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
    """Word-cloud colour callback returning a green with a random hue of 90-150.

    Args:
        word, font_size, position, orientation: Supplied by the word-cloud
            library for the word being drawn; not used here.
        random_state: Optional generator exposing ``randint(a, b)``
            (e.g. ``random.Random``). When given it is used for the draw, so a
            seeded generator yields reproducible colours; otherwise the global
            ``random`` module is used.
        **kwargs: Any further keyword arguments are accepted and ignored.

    Returns:
        An ``hsl(...)`` colour string.
    """
    rng = random if random_state is None else random_state
    return f"hsl({rng.randint(90, 150)}, 100%, 30%)"

def yellow_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
    """Word-cloud colour callback returning a yellow (hue 42) with a random lightness of 25-50%.

    Args:
        word, font_size, position, orientation: Supplied by the word-cloud
            library for the word being drawn; not used here.
        random_state: Optional generator exposing ``randint(a, b)``
            (e.g. ``random.Random``). When given it is used for the draw, so a
            seeded generator yields reproducible colours; otherwise the global
            ``random`` module is used.
        **kwargs: Any further keyword arguments are accepted and ignored.

    Returns:
        An ``hsl(...)`` colour string.
    """
    rng = random if random_state is None else random_state
    return f"hsl(42, 100%, {rng.randint(25, 50)}%)"

# Naive Word Clouds

As you can see, naively generating word clouds based on word frequencies alone does not capture any useful information.

In [None]:
# reusable function to generate word clouds 
def generate_word_clouds(neg_doc, neu_doc, pos_doc):
    """Draw three side-by-side word clouds, one per sentiment class.

    Each argument is a list of word strings; the words are joined with spaces
    and handed to ``WordCloud.generate``. The panels are drawn left to right as
    negative (red), neutral (yellow) and positive (green), and the figure is
    shown immediately. Nothing is returned.
    """
    fig, axes = plt.subplots(1,3, figsize=(20,10))

    # (word list, colour callback, panel title) for each panel, left to right
    panels = (
        (neg_doc, red_color_func, "Negative Tweets"),
        (neu_doc, yellow_color_func, "Neutral Words"),
        (pos_doc, green_color_func, "Positive Words"),
    )

    for ax, (doc, color_func, title) in zip(axes, panels):
        cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(" ".join(doc))
        ax.imshow(cloud.recolor(color_func=color_func, random_state=3), interpolation='bilinear')
        ax.set_title(title)
        ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
# Naive word clouds: drawn directly from the filtered word lists of the
# negative, neutral and positive classes built above.
generate_word_clouds(neg_doc, neu_doc, pos_doc)

In [None]:
def get_top_percent_words(doc, percent):
    """Return the most frequent distinct words of ``doc``, most frequent first.

    Args:
        doc: List of words.
        percent: Fraction (e.g. 0.1 for 10%) of the distinct words to keep;
            the resulting count is truncated to an integer.

    Returns:
        A list of words without their counts.
    """
    frequencies = Counter(doc)
    # len(frequencies) is the number of distinct words in doc
    cutoff = int(percent * len(frequencies))
    return [word for word, _count in frequencies.most_common(cutoff)]
    
def clean_document(doc):
    """Lemmatize a word list and repair or drop words the spell checker rejects.

    A word is kept as-is when the spell checker knows it or when it is among
    the 10% most frequent distinct words of the lemmatized document. Every
    other word is passed to ``wordninja.split`` and only the resulting pieces
    that the spell checker knows are kept.

    Args:
        doc: List of word strings. The words are expected to be lowercase
            already (the notebook lowercases tweets during preprocessing).

    Returns:
        A new list: the kept words in their original order, followed by the
        accepted split pieces in the order they were produced. Pieces keep
        their multiplicity, so word frequencies stay meaningful.
    """
    spell = SpellChecker()
    lemmatizer = WordNetLemmatizer()

    # lemmatize words (needed for calculating frequencies correctly)
    doc = [lemmatizer.lemmatize(x) for x in doc]

    # get the top 10% of all words. This may include "misspelled" words.
    # Stored as a set because it is probed once per word below.
    top_n_words = set(get_top_percent_words(doc, 0.1))

    # get the set of misspelled words
    misspelled = spell.unknown(doc)

    # accept the correctly spelled words and top_n words; queue the rest for splitting
    clean_words = []
    words_to_split = []
    for word in doc:
        if word in misspelled and word not in top_n_words:
            words_to_split.append(word)
        else:
            clean_words.append(word)

    # try to split the misspelled words to generate good words (ex. "lifeisstrange" -> ["life", "is", "strange"])
    split_words = flatten_list([wordninja.split(x) for x in words_to_split])

    # some splits may be nonsensical, so reject them ("llouis" -> ['ll', 'ou', "is"]).
    # spell.known() returns a set; filter the list against it instead of
    # extending with the set itself, which would count each recovered word
    # only once and add them in arbitrary order.
    known_pieces = spell.known(split_words)
    clean_words.extend(piece for piece in split_words if piece in known_pieces)

    return clean_words

# Calculate Log Likelihood

![](https://latex.codecogs.com/gif.latex?%5CLambda%20_%7Bi%7D%20%3Dlog%20%5Cleft%20%5B%5Cfrac%7BP%28w_%7Bi%7D%7C%20positive%29%7D%7BP%28w_%7Bi%7D%20%7C%20negative%29%7D%20%5Cright%20%5D)

Since we have 3 classes, we use the above formula to calculate the log likelihood for each class by treating that class as positive and the other two combined as negative.

In [None]:
def get_log_likelihood(doc1, doc2, top_fraction=0.1):
    """Score the words shared by two documents and return the top-scoring ones.

    For every word that occurs in both documents the score is
    ``log((f1 + 1) / (f2 + 1))``, where ``f1`` and ``f2`` are the word's
    relative frequencies in ``doc1`` and ``doc2``. Words that appear in only
    one of the documents are not scored.

    Args:
        doc1: List of words of the class of interest.
        doc2: List of words to compare against.
        top_fraction: Fraction of the scored words to return (default 0.1,
            i.e. the top 10%); the resulting count is truncated to an integer.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score. The
        list is empty when the documents share no words (including when
        either document is empty).
    """
    doc1_counts = Counter(doc1)
    doc2_counts = Counter(doc2)
    doc1_total = len(doc1)
    doc2_total = len(doc2)

    # Only words present in both documents are scored, so neither relative
    # frequency is ever zero. The +1 offset on both sides keeps the ratio
    # between 0.5 and 2, which keeps the scores small.
    doc_ratios = {
        word: math.log(
            (doc1_counts[word] / doc1_total + 1) / (doc2_counts[word] / doc2_total + 1)
        )
        for word in doc1_counts
        if word in doc2_counts
    }

    # Highest score first; the sort is stable, so ties keep first-seen order.
    ranked = sorted(doc_ratios.items(), key=lambda item: item[1], reverse=True)
    keep = int(top_fraction * len(ranked))
    return ranked[:keep]

In [None]:
# Run the spell-check / split / lemmatize cleaning on every class document
neg_doc_clean, neu_doc_clean, pos_doc_clean = (
    clean_document(class_doc) for class_doc in (neg_doc, neu_doc, pos_doc)
)

# Score each class against the concatenation of the other two
# (ex. "positive" vs "non-positive")
top_neg_words = get_log_likelihood(neg_doc_clean, pos_doc_clean + neu_doc_clean)
top_neu_words = get_log_likelihood(neu_doc_clean, pos_doc_clean + neg_doc_clean)
top_pos_words = get_log_likelihood(pos_doc_clean, neu_doc_clean + neg_doc_clean)

In [None]:
# Show the five highest-scoring negative-class words with their LL values
top_neg_words[:5]

In [None]:
# Show the five highest-scoring neutral-class words with their LL values
top_neu_words[:5]

In [None]:
# Show the five highest-scoring positive-class words with their LL values
top_pos_words[:5]

In [None]:
# function to generate a document based on likelihood values for words 
def get_scaled_list(log_list):
    """Expand ``(word, score)`` pairs into a shuffled list of repeated words.

    Each word is repeated ``int(score * 100000)`` times, so words whose
    scaled score is zero or negative contribute nothing. The resulting list
    is shuffled in place with the global ``random`` module before it is
    returned.
    """
    cloud = [
        entry[0]
        for entry in log_list
        for _ in range(int(entry[1]*100000))
    ]
    # shuffle to make it more "real"
    random.shuffle(cloud)
    return cloud

In [None]:
# Generate a synthetic corpus per class from the log-likelihood values:
# each top word is repeated in proportion to its score.
neg_doc_final = get_scaled_list(top_neg_words)
neu_doc_final = get_scaled_list(top_neu_words)
pos_doc_final = get_scaled_list(top_pos_words)

# Smarter Word Clouds

As we can see here, the new word clouds are much better indicators of which words are REALLY characteristic of a particular sentiment. We see words that were almost invisible before! Overall, these clouds are much more informative.

In [None]:
# Visualise the synthetic corpora: word size now reflects the log-likelihood
# score of a word rather than its raw frequency.
generate_word_clouds(neg_doc_final, neu_doc_final, pos_doc_final)

# Summary

- Use VADER to generate sentiment scores and assign to Negative, Neutral and Positive classes 
- Generate naive word clouds using simple preprocessing methods 
- Perform more thorough preprocessing such as spell checks, splitting spaceless phrases etc. 
- Calculate loglikelihood values 
- Generate a Synthetic corpus based on the LL values 
- Generate Smarter Wordclouds