In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook analyses COVID-19 vaccine tweets and examines how tweet sentiment relates to account size and engagement. Sentiment is estimated with an unsupervised clustering approach (Word2Vec embeddings clustered with K-means), based on this article:
https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483?gi=fcf9c329e93d

In [None]:
# Read the tweets, then drop rows with missing values, exact duplicate rows,
# and tweets whose text is one character or shorter.
data = (
    pd.read_csv('../input/pfizer-vaccine-tweets/vaccination_tweets.csv')
    # Alternative dataset: '../input/all-covid19-vaccines-tweets/vaccination_all_tweets.csv'
    .dropna()
    .drop_duplicates()
)
data = data.loc[data.text.str.len() > 1]
data.head()


In [None]:
# Data cleaning: normalise the tweet text before tokenisation.
# Every step is a (regex pattern, replacement) pair applied in order.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern of
# Series.str.replace as a literal string by default, which would silently turn
# every pattern below into a no-op.
cleaning_steps = [
    # Remove URLs first, while "://" is still intact, and remove only the URL
    # itself (a greedy ".*" would also delete the rest of the tweet).
    (r"https?://\S+", " "),
    # Keep letters, digits and a small set of punctuation. This also drops ':',
    # so no separate colon step is needed afterwards.
    (r"[^A-Za-z0-9^,!?.\/'+]", " "),
    (r"\+", " plus "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\?", " ? "),
    (r"'", " "),
    # Collapse the runs of whitespace left behind by the steps above. The text
    # is deliberately not stripped, so a tweet never becomes an empty string.
    (r"\s{2,}", " "),
]

cleaned_text = data['text'].str.lower()
for pattern, replacement in cleaning_steps:
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=True)
data['text'] = cleaned_text

text = data['text']
text.head()

In [None]:
from gensim.models import Word2Vec
import multiprocessing
# Define the (still untrained) Word2Vec model.
# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; pick the name the installed release understands so this cell
# runs on both API generations.
import gensim

_dimension_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
w2v_model = Word2Vec(min_count=3, window=4, **{_dimension_kwarg: 300})
print("done word2vec")

In [None]:
from gensim.models.phrases import Phrases, Phraser
# Collect the cleaned tweet texts into a plain Python list.
sent = [row for row in data.text]
# NOTE(review): `sent` holds whole tweet strings, not lists of tokens, while
# gensim documents Phrases as taking tokenised sentences — confirm that this
# step detects bigrams as intended. The next cell calls .split(' ') on each
# element of `sentences`, so they are expected to still be strings.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
# Display the second element as a spot check.
sentences[1]
#converting tweets into bigrams for word2vec

In [None]:
# Split every sentence into words to create the word2vec vocabulary, keeping
# only tokens of two or more characters.
# The filter is applied while each list is built: removing items from a list
# that is being iterated over skips the element right after every removal, so
# consecutive short tokens (e.g. the empty strings produced by repeated
# spaces) were not all removed.
sentences2 = [
    [token for token in sentence.split(' ') if len(token) >= 2]
    for sentence in sentences
]
    



In [None]:
# Build the model's vocabulary from the tokenised tweets before training.
w2v_model.build_vocab(sentences2, progress_per=50000)
print("done vocab")

In [None]:


# Train for 30 epochs on the tokenised tweets; corpus_count was set by build_vocab.
w2v_model.train(sentences2, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Done Training')

# NOTE(review): init_sims(replace=True) presumably normalises the vectors in
# place; it is marked deprecated in gensim 4 — confirm against the installed version.
w2v_model.init_sims(replace=True)



In [None]:
# Persist the trained model to the working directory so it can be reloaded below.
w2v_model.save("word2vec.model")

In [None]:


# Reload the saved model and keep only its word vectors (`.wv`).
word_vectors = Word2Vec.load("word2vec.model").wv
print('loaded vectors')



In [None]:
from sklearn.cluster import KMeans


# K-means clustering model fit to the word2vec vectors (cast to float64).
# The two clusters are later labelled as the positive and negative groups.
# random_state is given as the integer seed 0: the original boolean False was
# only accepted because bool is an int subclass, and it selected the same seed.
model = KMeans(n_clusters=2, max_iter=1000, random_state=0, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [None]:
# Show the 10 vocabulary words most similar to the centre of cluster 0.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

In [None]:
# Show the 10 vocabulary words most similar to the centre of cluster 1.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)
# This cell and the previous one list the top 10 words of each cluster so the
# positive cluster can be identified by inspection.

In [None]:


# Index of the cluster treated as "positive". It is hard-coded, so re-check it
# against the top-word listings above whenever the model is retrained.
positive_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
# With two clusters, 1 - index selects the other (negative) cluster.
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]



In [None]:
# Fit words to clusters: one row per vocabulary word with its vector and cluster.
# gensim >= 4 exposes the vocabulary as `key_to_index`; `vocab` only exists on
# older releases, so use whichever attribute is available.
vocabulary = word_vectors.key_to_index if hasattr(word_vectors, 'key_to_index') else word_vectors.vocab
words = pd.DataFrame({'words': list(vocabulary)})
words['vectors'] = words.words.apply(lambda x: word_vectors[x])
# Predict every cluster label in a single call instead of one predict() per
# word; the cast matches the float64 dtype the model was fitted on.
words['cluster'] = model.predict(np.stack(words.vectors.tolist()).astype('double'))

In [None]:


# +1 for words in the positive cluster, -1 for words in the other one.
words['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in words.cluster]
# Closeness = inverse of the smallest value returned by model.transform for the
# word's vector (per scikit-learn, transform yields distances to each centre).
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
# Signed score: closeness multiplied by the cluster sign.
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

# Each word now carries its cluster and a sentiment score (closeness score
# multiplied by +1 or -1).

In [None]:
# Preview the first 10 words with their cluster and sentiment columns.
words.head(10)

In [None]:
# Save the word -> sentiment coefficient table to the working directory.
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [None]:
# Alias for the cleaned tweet text series; this is what gets scored below.
final_file = text

In [None]:
# Preview the cleaned tweets.
final_file.head()

In [None]:
# Reload the sentiment dictionary written earlier and preview it.
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_map.head()

In [None]:
# Combine words with their sentiment scores into a plain dict for fast lookup.
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [None]:
# Build a two-column frame (tweet text as "title", constant "rate" of 1),
# write it to disk, and read it back as `file_weighting`.
file_weighting2 = final_file.copy().to_frame()
file_weighting2["weight"] = 1
file_weighting2 = file_weighting2.rename(columns={"text": "title", "weight": "rate"})
export_columns = ['title', 'rate']
file_weighting2[export_columns].to_csv("cleaned_dataset.csv", index=False)
file_weighting = pd.read_csv("cleaned_dataset.csv")

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display



# Prepare TF-IDF scores for the adjusted weighting.
# TF-IDF takes frequency into account, so the score for a common word like
# 'the' is scaled down.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.title)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.title)



In [None]:
def create_tfidf_dictionary(x, transformed_file, features):
    """Map each word of one document to its TF-IDF score.

    Method from https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483

    Args:
        x: Row of the documents frame; ``x.name`` selects the matching row of
            ``transformed_file``.
        transformed_file: Sparse document-term matrix, one row per document.
        features: pandas Series mapping a column position to its word.

    Returns:
        dict of word -> score for the stored entries of the document's row.
    """
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up without writing them back into ``vector_coo.col``: that
    # attribute holds integer column indices, and overwriting it with strings
    # leaves the matrix in an invalid state (newer SciPy releases may reject
    # the assignment outright).
    row_words = features.iloc[vector_coo.col].values
    return dict(zip(row_words, vector_coo.data))


def replace_tfidf_words(x, transformed_file, features):
    """Return the TF-IDF score of every whitespace-separated word of ``x.title``.

    Scores are returned in word order, one entry per occurrence.

    Raises:
        KeyError: if a word of the title has no entry in the document's
            TF-IDF dictionary.
    """
    dictionary = create_tfidf_dictionary(x, transformed_file, features)
    return [dictionary[word] for word in x.title.split()]

In [None]:
# For every tweet, build the list of TF-IDF scores of its words (row-wise apply).
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

In [None]:
def replace_sentiment_words(word, sentiment_dict):
    """Look up the sentiment coefficient of ``word``; unknown words score 0."""
    return sentiment_dict.get(word, 0)

In [None]:


# For every tweet, swap each whitespace-separated word for its sentiment
# coefficient (0 for words missing from the dictionary).
replaced_closeness_scores = file_weighting.title.apply(
    lambda sentence: [replace_sentiment_words(word, sentiment_dict) for word in sentence.split()]
)



In [None]:


# One row per tweet: per-word sentiment coefficients, per-word TF-IDF scores,
# and the tweet text itself.
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
# Tweet-level score: dot product of the sentiment coefficients and TF-IDF weights.
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
# Binary label: 1 when the score is strictly positive, otherwise 0.
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')




In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared VADER analyser instance, used by sentiment_analyzer_scores below.
analyser = SentimentIntensityAnalyzer()

In [None]:


def sentiment_analyzer_scores(sentence):
    """Label a sentence with VADER: 1 if its 'pos' score exceeds its 'neg' score, else 0.

    Used to compare the VADER engine with the clustering-based sentiment.
    """
    polarity = analyser.polarity_scores(sentence)
    return 1 if polarity['pos'] > polarity['neg'] else 0

In [None]:
# Score every tweet with VADER and store the 0/1 label next to the clustering prediction.
replacement_df['vader'] = replacement_df['sentence'].apply(sentiment_analyzer_scores)

In [None]:
# Display the per-tweet scores with both the clustering and the VADER labels.
replacement_df

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
# Confusion matrix and summary metrics for comparison: VADER's labels are used
# as the reference and the clustering output as the prediction.
y_test = replacement_df.vader
predicted_classes = replacement_df.prediction

conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix')
display(conf_matrix)

test_scores = (
    accuracy_score(y_test, predicted_classes),
    precision_score(y_test, predicted_classes),
    recall_score(y_test, predicted_classes),
    f1_score(y_test, predicted_classes),
)

print('\n \n Scores')
scores = pd.DataFrame(
    {'scores': list(test_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(scores)

In [None]:
# Class balance of the clustering-based predictions.
predicted_classes.value_counts()
# Sanity check: compare the number of positive and negative labels from the
# clustering algorithm with the VADER counts in the next cell.
# A large difference probably means something is wrong.

In [None]:
# Class balance of the VADER labels, for comparison with the counts above.
y_test.value_counts()

In [None]:
# NOTE(review): despite its name, `preds` holds the VADER labels, not the
# clustering `prediction` column — confirm this is the intended sentiment source.
preds = replacement_df["vader"]
# Reset the index before the assignment below, which aligns on index labels.
# Presumably replacement_df carries a default 0..n-1 index from the CSV reload — verify.
# NOTE(review): re-running this cell adds another index column each time.
data = data.reset_index()
data["sentiment"] = preds
data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Heatmap of correlations between the engagement, account and sentiment columns.
corrmatrix = data[['favorites', 'retweets','user_verified','user_followers','sentiment']].corr()
# Draw the figure directly instead of print()-ing the returned Axes, which
# only added the object's repr to the cell output.
ax = sns.heatmap(corrmatrix)
ax.set_title('Correlation between engagement, account size and sentiment')
plt.show()

In [None]:
# The near-zero sentiment correlations look a bit strange, so split the tweets
# into positive and negative groups for further analysis.
data['date'] = pd.to_datetime(data['date'])
sentiment_label = data['sentiment']
pos = data.loc[sentiment_label.eq(1)]
neg = data.loc[sentiment_label.eq(0)]
pos.head()

In [None]:
#Deeper correlation analysis: comparing tweet engagement and account size on positive vs negative tweets



In [None]:
#Comparing user followers
from scipy.stats import ttest_ind
def correlation_analysis(param):
    """Compare one column between the positive and negative tweet groups.

    Reads the module-level ``pos`` and ``neg`` frames. Prints the mean and
    standard deviation of ``param`` for each group and the p-value of an
    independent two-sample t-test, then plots both groups' values.

    Args:
        param: Name of the column to compare, e.g. "retweets".
    """
    positive_values = pos[param]
    negative_values = neg[param]

    print("positive sentiment mean",np.mean(positive_values))
    print("positive sentiment standard deviation",np.std(positive_values))
    print("negative sentiment mean",np.mean(negative_values))
    print("negative sentiment standard deviation",np.std(negative_values))

    # Only the p-value is reported; the test statistic is not used.
    _, pval = ttest_ind(positive_values, negative_values)
    print("p-value",pval)

    # Label the figure so it can be read on its own when the notebook is skimmed.
    fig, ax = plt.subplots()
    ax.plot(positive_values, label="positive")
    ax.plot(negative_values, label="negative")
    ax.set(title=f"{param}: positive vs negative tweets", xlabel="tweet index", ylabel=param)
    ax.legend()
    plt.show()

In [None]:
# Compare follower counts of accounts posting positive vs negative tweets.
correlation_analysis("user_followers")

In [None]:
# Compare the "user_friends" column between positive and negative tweets.
correlation_analysis("user_friends")

In [None]:
# Compare retweet counts between positive and negative tweets.
correlation_analysis("retweets")

In [None]:
# Compare the "favorites" column between positive and negative tweets.
correlation_analysis("favorites")
# Author's observation: this is the only comparison with a significant p-value;
# positive tweets are significantly more likely to get more likes.