In [88]:
#! Python3
# useful libs
import numpy as np
import operator
import re
import collections
import nltk
from difflib import ndiff
import seaborn as sns
import numpy.linalg as la
import six

# Preprocessing
from gensim.utils import lemmatize

# vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import warnings
warnings.filterwarnings('ignore')

# (a) Data Import

In [89]:
sentances, scores = [], []


'\nfor file_name in [\'yelp_labelled\', \'amazon_cells_labelled\', \'imdb_labelled\']:    \n    for line in open(\'./data/{}.txt\'.format(file_name), \'r\', encoding=\'utf-8\'):\n        st, sc = line.split("\t")\n#         sentances.append(unicode(st, "utf-8"))\n        sentances.append(st)\n#         sentances.append(str(st.encode()))\n\n        scores.append(sc[0:-1])\n\n'

In [2]:
!gdown --id 1P8jSclCPfpFtTMdXzZy3D9DPTs9yOhb5

Downloading...
From: https://drive.google.com/uc?id=1P8jSclCPfpFtTMdXzZy3D9DPTs9yOhb5
To: /content/RTOnoLocWpol3.csv
100% 41.6M/41.6M [00:01<00:00, 37.2MB/s]


In [56]:
import pandas as pd
df = pd.read_csv("RTOnoLocWpol3.csv")

In [61]:
df.head()
#df = df[:100]

Unnamed: 0.1,Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,polarity
0,0,3798,48750,@Gotham2Face,2021-12-12 23:59:39+00:00,"@Gotham2Face Talia waits for Harvey’s return, ...",positive,0.0875
1,1,3799,48751,"Genève, Suisse",2021-12-12 23:56:07+00:00,The Obamas Return to Chicago https://t.co/uqvJ...,neutral,0.0
2,2,3800,48752,STATEWIDE BABY,2021-12-12 23:52:35+00:00,@mrosen23 @BrrrBailey @Puleo_Andrew @NYSPEF An...,positive,0.022487
3,3,3801,48753,"30.399941,-97.705223",2021-12-12 23:47:33+00:00,Being a creator is high esteem and realistic. ...,positive,0.108889
4,4,3802,48754,"30.399941,-97.705223",2021-12-12 23:47:33+00:00,The biggest driving force for office returns s...,positive,0.107143


In [60]:
df.dropna(inplace=True)

In [62]:
df.count()

Unnamed: 0       78
UserName         78
ScreenName       78
Location         78
TweetAt          78
OriginalTweet    78
Sentiment        78
polarity         78
dtype: int64

In [63]:
df.Sentiment =df.Sentiment.map(dict(positive=1, neutral=0 , negative=0))

In [90]:
sentances = list(df.OriginalTweet)

In [91]:
scores = list(df.Sentiment)

In [112]:
counter = collections.Counter(scores)
print(counter)

Counter({1: 41, 0: 37})


### Explanation
Yes the labels are balanced. 
By reading each line of the training txt files, we got lists of sentences and scores. 
Using collection.Counter, we get to know the number of each label in the score lists. 


# (b) Preprocessing

In [93]:
!pip install pattern



In [116]:


def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub("@","", text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text




In [117]:
sentences_processed = []
for i in sentances :
  sentences_processed.append(clean_text(i))

In [118]:
sentences_processed

[' talia waits for harvey’s return, bruce getting up from bed and going to his office where he spent plenty of evenings holed away. it always made the manor feel larger, somehow. like she was kept out of the way one end, far away from everyone else even if it wasn’t entirely true. —',
 'the obamas return to chicago  via obamafoundation',
 " brrrbailey puleo_andrew nyspef anything less than filing suit to stop return to office with a #delta #omicron combined surge about to slam ny (generally  weeks behind uk) is a half-measure. our members are catching #covid left and right while hq appears to be doing nothing. we'd love to be proven wrong.",
 'being a creator is high esteem and realistic. % of gen z report being a freelancer, and, they are in a life place to swap our corporate life for freelance. gen z will limit the ability for execs to mandate a return to office.',
 'the biggest driving force for office returns seems to be executives within certain demographics, but gen z is differen

### Preprocessing explanations
1) We should lowercase all of the words, because capitalized letters with make same words be treated as different ones.

2) We should strip punctuations because they do not contribute to sentiment.

3) Stop words are the most commonly occuring words which are not relevant in the context of the data and do not contribute any deeper meaning to the phrase. In this case contain no sentiment.

4) We should do lemmatization.This process finds the base or dictionary form of the word known as the lemma. This is done through the use of vocabulary (dictionary importance of words) and morphological analysis (word structure and grammar relations). This normalization is similar to stemming but takes into account the context of the word.

# (c) Split Training and Testing Set

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
train_x, test_x, train_y, test_y = train_test_split(sentances, scores, test_size=0.33, random_state=42)

## (d) Bag of Words
Why should we vectorize training set first and then go through testing set?<br/>
1) Here we should vectorize the training set standalone because testing set could contain words that are not contained in training set. <br/>
2) We will vectorize testing set based on the feature vector generated by training set. 

In [97]:
train_vectorizer = CountVectorizer()
# d.1. build a dictionary of unique words for training set
train_x_bag = train_vectorizer.fit_transform(train_x).todense()
test_vectorizer = CountVectorizer(vocabulary=train_vectorizer.get_feature_names())
test_x_bag = test_vectorizer.fit_transform(test_x).todense()
# d.2. Report feature vectors of 2 reviews
print(train_x[10])
print(train_x_bag[10])
print(train_x[0])
print(train_x_bag[0])

“…He tested negative for Covid-19 upon his return to Johannesburg (from his trip to West Africa) on December 8, according to his office.” - @CNN
[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [98]:
train_x_bag

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]])

## (e) Postprocessing strategy
We choose L2 normalization as post-processing method, because:<br/>
1) L2 presents the inner product of a vector on itself, representing the length of the vector<br/>
2) The similarity between 2 vectors are calculated by their inner product, which is the format of L2<br/>
3) So L2 would be an ideal way to constrain the value range of each feature into (0, 1)

In [99]:
# post-processing
train_x_bag_normal = normalize(train_x_bag)
test_x_bag_normal = normalize(test_x_bag)

## (f) Sentiment prediction

In [100]:
def sentiment_prediction(Train_X, Train_Y, Test_X, Test_Y):
    # f.1 Logistic regression
    lr_clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(Train_X, Train_Y)
    lr_clf_score = lr_clf.score(Test_X, Test_Y)
    print("Logistic regression accuracy: {}".format(lr_clf_score))
    
    # f.2 Naive Bayes classifier
    # Gaussian
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(Train_X, Train_Y)
    gaussian_nb_score = gaussian_nb.score(Test_X, Test_Y)
    print("Accuracy of Naive Bayes Classifier with Gaussian prior: {}".format(gaussian_nb_score))

    # Bernoulli
    b_nb = BernoulliNB()
    b_nb.fit(Train_X, Train_Y)
    b_nb_score = b_nb.score(Test_X, Test_Y)
    print("Accuracy of Naive Bayes Classifier with Bernoulli prior: {}".format(b_nb_score))
    
    return lr_clf_score, gaussian_nb_score, b_nb_score

In [101]:
sentiment_prediction(train_x_bag_normal, train_y, test_x_bag_normal, test_y)

Logistic regression accuracy: 0.6538461538461539
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.6538461538461539
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5769230769230769


(0.6538461538461539, 0.6538461538461539, 0.5769230769230769)

### Comparison of classifiers:
Logistic regression model is slightly better than Naive Bayes classifiers. 

### Words playing the most important roles

In [102]:
vocabulary = train_vectorizer.vocabulary_
sorted_vocabulary = sorted(vocabulary.items(), key=operator.itemgetter(1), reverse=True)
print("The top 10 most important words: ")
for word in sorted_vocabulary[:10]:
    print("count({}) = {}".format(word[0], word[1]))

The top 10 most important words: 
count(zw) = 744
count(zvyvj6dzm5) = 743
count(zffwbblmw0) = 742
count(zero) = 741
count(zaichishka) = 740
count(yyesij0cxt) = 739
count(your) = 738
count(you) = 737
count(york) = 736
count(yet) = 735


## (g) N-gram model

In [103]:
# Vectorize with 2-gram model
train_vectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
# build a dictionary of unique words for training set
train_x_2gram = train_vectorizer_2gram.fit_transform(train_x).todense()
test_vectorizer_2gram = CountVectorizer(ngram_range=(2, 2), vocabulary=train_vectorizer_2gram.get_feature_names())
test_x_2gram = test_vectorizer_2gram.fit_transform(test_x).todense()
# Report feature vectors of 2 reviews
print(train_x[10])
print(train_x_2gram[10])
print(train_x[0])
print(train_x_2gram[0])

# post-processing
train_x_2gram_normal = normalize(train_x_2gram)
test_x_2gram_normal = normalize(test_x_2gram)

sentiment_prediction(train_x_2gram_normal, train_y, test_x_2gram_normal, test_y)
# # Logistic regression
# lr_clf_2gram = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(train_x_2gram_normal, train_y)
# lr_clf_2gram_score = lr_clf_2gram.score(test_x_2gram_normal, test_y)
# print("Logistic regression accuracy: {}".format(lr_clf_2gram_score))

# # Naive Bayes classifier
# # Gaussian
# gaussian_nb_2gram = GaussianNB()
# gaussian_nb_2gram.fit(train_x_2gram_normal, train_y)
# gaussian_nb_2gram_score = gaussian_nb_2gram.score(test_x_2gram_normal, test_y)
# print("Accuracy of Naive Bayes Classifier with Gaussian prior: {}".format(gaussian_nb_2gram_score))

# # Bernoulli
# b_nb_2gram = BernoulliNB()
# b_nb_2gram.fit(train_x_2gram_normal, train_y)
# b_nb_2gram_score = b_nb_2gram.score(test_x_2gram_normal, test_y)
# print("Accuracy of Naive Bayes Classifier with Bernoulli prior: {}".format(b_nb_2gram_score))

# Most important 2-gram words
vocabulary_2gram = train_vectorizer_2gram.vocabulary_
sorted_vocabulary_2gram = sorted(vocabulary_2gram.items(), key=operator.itemgetter(1), reverse=True)

important_words = []
for word in sorted_vocabulary_2gram[:10]:
    important_words.append(word[0])
    print("count({}) = {}".format(word[0], word[1]))

print("The top 10 most important 2-gram words: ")
print(important_words)

“…He tested negative for Covid-19 upon his return to Johannesburg (from his trip to West Africa) on December 8, according to his office.” - @CNN
[[0 0 0 ... 0 0 0]]
The End of a Return-to-Office Date https://t.co/QcckO8R0hh via @politicalwire
[[0 0 0 ... 0 0 0]]
Logistic regression accuracy: 0.6923076923076923
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.6153846153846154
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5384615384615384
count(zw but) = 1408
count(zero front) = 1407
count(zaichishka got) = 1406
count(yyesij0cxt https) = 1405
count(your upcoming) = 1404
count(your mailbox) = 1403
count(you would) = 1402
count(you with) = 1401
count(you why) = 1400
count(you walk) = 1399
The top 10 most important 2-gram words: 
['zw but', 'zero front', 'zaichishka got', 'yyesij0cxt https', 'your upcoming', 'your mailbox', 'you would', 'you with', 'you why', 'you walk']


## (h) PCA for bag of words model

In [104]:
#Use SVD to peform PCA
p,n = np.shape(train_x_bag_normal)
cov_Mat = np.dot(train_x_bag_normal.T, train_x_bag_normal)/(p-1)
u, s, vh = np.linalg.svd(cov_Mat, full_matrices=True)

In [105]:
train_x_10 = np.dot(train_x_bag_normal, u[:,:10])
test_x_10 = np.dot(test_x_bag_normal, u[:,:10])
sentiment_prediction(train_x_10, train_y, test_x_10, test_y)

Logistic regression accuracy: 0.5769230769230769
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.6153846153846154
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5


(0.5769230769230769, 0.6153846153846154, 0.5)

In [106]:
train_x_50 = np.dot(train_x_bag_normal, u[:,:50])
test_x_50 = np.dot(test_x_bag_normal, u[:,:50])
sentiment_prediction(train_x_50, train_y, test_x_50, test_y)

Logistic regression accuracy: 0.6538461538461539
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.6153846153846154
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.6153846153846154


(0.6538461538461539, 0.6153846153846154, 0.6153846153846154)

In [107]:
train_x_100 = np.dot(train_x_bag_normal, u[:,:100])
test_x_100 = np.dot(test_x_bag_normal, u[:,:100])
sentiment_prediction(train_x_100, train_y, test_x_100, test_y)

Logistic regression accuracy: 0.6538461538461539
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5769230769230769
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5384615384615384


(0.6538461538461539, 0.5769230769230769, 0.5384615384615384)

In [108]:
def meanX(dataX):
    return np.mean(dataX, axis=0)
def pca(XMat, k):
    average = meanX(XMat) 
    m, n = np.shape(XMat)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = XMat - avgs
    covX = np.cov(data_adjust.T)
    featValue, featVec=  np.linalg.eig(covX)
    index = np.argsort(-featValue)
    finalData = []
    if k > n:
        print("k must lower than feature number")
        return
    else:
        selectVec = np.matrix(featVec.T[index[:k]])
        finalData = data_adjust * selectVec.T 
        reconData = (finalData * selectVec) + average  
        finalData = finalData.astype('float64')
    return finalData, reconData

In [109]:
train_x_10, _recon_train = pca(train_x_bag_normal, 10)
test_x_10, _recon_test = pca(test_x_bag_normal, 10)
sentiment_prediction(train_x_10, train_y, test_x_10, test_y)

Logistic regression accuracy: 0.6538461538461539
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5769230769230769
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5384615384615384


(0.6538461538461539, 0.5769230769230769, 0.5384615384615384)

In [110]:
# PCA with 50 components

train_x_50, _recon_train = pca(train_x_bag_normal, 50)
test_x_50, _recon_test = pca(test_x_bag_normal, 50)
sentiment_prediction(train_x_50, train_y, test_x_50, test_y)

Logistic regression accuracy: 0.6923076923076923
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.46153846153846156


(0.6923076923076923, 0.5, 0.46153846153846156)

In [111]:
# PCA with 100 components
# pca_100 = PCA(n_components=100)

train_x_100, _recon_train = pca(train_x_bag_normal, 100)
test_x_100, _recon_test = pca(test_x_bag_normal, 100)
sentiment_prediction(train_x_100, train_y, test_x_100, test_y)

Logistic regression accuracy: 0.6923076923076923
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.5


(0.6923076923076923, 0.5, 0.5)

----------------------------