In [131]:
# Data Preparation : Combine positive and negative reviews and process the review text (remove stopwords, punctuations, URL links and stemming)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import nltk

In [2]:
X_train_pos = pd.read_csv("./all_positive_reviews_train.txt")


In [3]:
X_test_pos = pd.read_csv("./all_positive_reviews_test.txt")
X_train_neg = pd.read_csv("./all_negative_reviews_train.txt")
X_test_neg = pd.read_csv("./all_negative_reviews_test.txt")

In [4]:
X_train_pos.shape


(12499, 3)

In [5]:
X_train_pos_np = X_train_pos.to_numpy()


In [6]:
X_train_neg_np = X_train_neg.to_numpy()


In [7]:
X_train = np.concatenate((X_train_pos_np, X_train_neg_np), axis=0)

In [8]:
X_train.shape

(24998, 3)

In [9]:
np.random.shuffle(X_train)

In [10]:
y_train = X_train[:,2].astype(float)

In [11]:
y_train

array([1., 1., 1., ..., 1., 0., 1.])

In [12]:
X_train = X_train[:,:2]

In [13]:
y_train.shape

(24998,)

In [14]:
import string
import nltk
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.stem.porter import PorterStemmer
import re

stopwords = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()
tk = WhitespaceTokenizer()
def process_review(review):
    mod_review = []
    pat = re.compile(r"http\S+|\\S+|\d+|\<\S+|\/|\>|\!|\,|\.")
    review = re.sub(pat, "", review)
    review = re.sub(r' +', ' ', review)
    for word in word_tokenize(review):
        word = word.lower()
        if word not in string.punctuation and word not in stopwords:
            stem_word = stemmer.stem(word)
            mod_review.append(stem_word)
    return ' '.join(mod_review)
            

In [15]:
mod_reviews = [ process_review(review) for review in X_train[:,1] ]

In [126]:
# The above model to convert the text to arrays does not work correctly. Creates overflow issue. 
# So switching to TF-IDF vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [127]:
X_train_tfidf = tfidf.fit_transform(mod_reviews)

In [128]:
X_train_tfidf.shape

(24998, 64621)

In [129]:
tfidf.get_feature_names_out()

array(['____', '_____', '______', ..., 'østbye', 'über', 'üvegtigri'],
      dtype=object)

In [130]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2)

In [131]:
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((19998, 64621), (5000, 64621), (19998,), (5000,))

In [132]:
X_tr[0]

<1x64621 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [133]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression()

In [134]:
clf_lr.fit(X_tr, y_tr)

In [135]:
y_pred = clf_lr.predict(X_val)

In [136]:
from sklearn.metrics import classification_report, confusion_matrix

In [137]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88      2533
         1.0       0.86      0.91      0.89      2467

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



In [138]:
print(confusion_matrix(y_val, y_pred))

[[2181  352]
 [ 220 2247]]


In [152]:
type(mod_reviews)

list

In [154]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [156]:
score_dict = analyzer.polarity_scores(mod_reviews[0])

In [157]:
print(score_dict)

{'neg': 0.229, 'neu': 0.655, 'pos': 0.117, 'compound': -0.9114}


In [165]:
threshold = 1.0
print("Rating was {} % positive".format(score_dict['pos'] * 100 ))
print("Rating was {} % negative".format(score_dict['neg'] * 100))
print("Rating was {} % neutral". format(score_dict['neu'] * 100))
print("Overall score is {} ".format(score_dict['compound'] * 100))
final_sentiment = 'Positive' if score_dict['compound'] > threshold else 'Negative'
f = round(score_dict['compound'], 2)
print(f)

Rating was 11.700000000000001 % positive
Rating was 22.900000000000002 % negative
Rating was 65.5 % neutral
Overall score is -91.14 
-0.91


In [162]:
print("Final sentiment: {}".format(final_sentiment))

Final sentiment Negative


In [175]:
# using VADER lexicon for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def get_sentiment_scores(review, threshold, debug = False):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(review)
    final_sentiment = 'Positive' if score['compound'] > threshold else 'Negative'

    if debug:
        print("Rating was {} % positive".format(score['pos'] * 100 ))
        print("Rating was {} % negative".format(score['neg'] * 100))
        print("Rating was {} % neutral". format(score['neu'] * 100))
        print("Overall score is {} . Final Sentiment {}\n".format(score['compound'], final_sentiment))
    
    return final_sentiment


In [197]:
TP = 0.0
TN = 0.0
FP = 0.0
FN = 0.0
UN = 0.0
udict = {}
for review, actual_sentiment in zip(mod_reviews, y_train):
    predicted_sentiment = get_sentiment_scores(review, 0.0, False)

    actual_sentiment = 'Positive' if actual_sentiment > 0.0 else 'Negative'
    if actual_sentiment == 'Positive' and predicted_sentiment == 'Positive':
        TP += 1
    elif actual_sentiment == 'Positive' and predicted_sentiment == 'Negative':
        FN += 1
    elif actual_sentiment == 'Negative' and predicted_sentiment == 'Negative':
        TN += 1
    elif actual_sentiment == 'Negative' and predicted_sentiment == 'Positive':
        FP += 1
    else:
        UN += 1
        udict[review] = [actual_sentiment, predicted_sentiment]


In [199]:
print("True Positive : ", TP)
print("True Negative : ", TN)
print("False Positive : ", FP)
print("False Negative : ", FN)
print("Unknown : ", UN)

True Positive :  10273.0
True Negative :  6025.0
False Positive :  6474.0
False Negative :  2226.0
Unknown :  0.0


In [200]:
TP + TN + FP + FN

24998.0

In [203]:
accuracy = TP/ (TP + TN)
precision = TP / (TP + FP)
recall = TN / (TN + FN)

F1_score = 2 *(precision * recall)/ (precision + recall)

In [204]:
print("accuracy {} ,precision {}, recall {}, f1_score {}\n".format(accuracy, precision, recall, F1_score))

accuracy 0.6303227389863787 ,precision 0.6134232996954678, recall 0.7302145194521876, f1_score 0.6667430666577229

