In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

In [2]:
#Needed for tfidf command below
def dummy(doc):
    return doc

In [3]:
#Get the dataframe of tweets
df = pd.read_csv('data\\tweets_all.csv')

#Split into three df's based on topic
df_politics = df.loc[df['topic'].isin(["Sinn Fein", "Qanon", "Varadkar"])]
df_TV = df.loc[df['topic'].isin(["Eastenders", "Tommy Tiernan", "Eoghan McDermott"])]
df_others = df.loc[df['topic'].isin(["Pancakes", "Burren", "Daniel Kinahan", "Shamrock Rovers"])]


In [4]:
#Split into train and test data
#Use politics tweets to train the model and then use on TV tweets
#Result is only 0.348 accuracy
# 0.33 for bigram
# 0.34 for trigram
x_train = df_politics['clean_text']
x_test = df_TV['clean_text']
y_train = df_politics['sentiment']
y_test = df_TV['sentiment']

In [7]:
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None, ngram_range=(1,3))

x_train1 = tfidf.fit_transform(x_train)
x_test1 = tfidf.transform(x_test)


nb_clf =  MultinomialNB(alpha=0.005)

nb_clf.fit(x_train1, y_train)

print(nb_clf.score(x_test1, y_test))

y_pred = nb_clf.predict(x_test1)

print(classification_report(y_test, y_pred))
print(pd.crosstab(y_test, y_pred))

0.4573643410852713
              precision    recall  f1-score   support

    Negative       0.67      0.66      0.67        85
     Neutral       0.20      0.03      0.05        36
    Positive       0.05      0.25      0.08         8

    accuracy                           0.46       129
   macro avg       0.31      0.31      0.27       129
weighted avg       0.50      0.46      0.46       129

col_0      Negative  Neutral  Positive
sentiment                             
Negative         56        4        25
Neutral          21        1        14
Positive          6        0         2


In [6]:
#Split into train and test data
#Use TV tweets to train the model and then use on politics tweets
#Result is only 0.24 accuracy
# 0.325 for bigrams
# 0.43 for trigrams
x_train = df_TV['clean_text']
x_test = df_politics['clean_text']
y_train = df_TV['sentiment']
y_test = df_politics['sentiment']

In [41]:
#Split into train and test data
#Use TV tweets to train the model and then use on others tweets
#Result is 0.57 accuracy
# 0.47 for bigrams
# 0.48 for trigrams
x_train = df_TV['clean_text']
x_test = df_others['clean_text']
y_train = df_TV['sentiment']
y_test = df_others['sentiment']

In [57]:
#Split into train and test data
#Use others tweets to train the model and then use on TV tweets
#Result is 0.55 accuracy
# 0.39 for bigrams
# 0.39 for trigrams
x_train = df_others['clean_text']
x_test = df_TV['clean_text']
y_train = df_others['sentiment']
y_test = df_TV['sentiment']

In [49]:
#Split into train and test data
#Use others tweets to train the model and then use on politics tweets
#Result is only 0.326 accuracy
# 0.42 for bigrams
# 0.52 for trigrams
x_train = df_others['clean_text']
x_test = df_politics['clean_text']
y_train = df_others['sentiment']
y_test = df_politics['sentiment']

In [59]:
#Split into train and test data
#Use politics tweets to train the model and then use on others tweets
#Result is only 0.39 accuracy
# 0.39 for bigrams
# 0.38 for trigrams
x_train = df_politics['clean_text']
x_test = df_others['clean_text']
y_train = df_politics['sentiment']
y_test = df_others['sentiment']