In [1]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Sai\AppData\Roaming\nltk_data...


True

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [9]:
a = 'This is a very good movie'
b = 'Bad mood leads to depression'
sid.polarity_scores(a)
sid.polarity_scores(b)

{'neg': 0.706, 'neu': 0.294, 'pos': 0.0, 'compound': -0.802}

In [10]:
import pandas as pd
df = pd.read_csv('amazonreviews.tsv', sep = '\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [11]:
df.shape

(10000, 2)

In [12]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [14]:
sid.polarity_scores(df.loc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [15]:
df['scores'] = df['review'].apply(lambda review:sid.polarity_scores(review))

In [16]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [18]:
df['compound'] = df['scores'].apply(lambda score_dict:score_dict['compound'])

In [19]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [21]:
df['comp_score'] = df['compound'].apply(lambda c:'pos' if c>=0 else 'neg')

In [22]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
accuracy_score(df['label'],df['comp_score'])

0.7097

In [27]:
confusion_matrix(df['label'],df['comp_score'])

array([[2629, 2468],
       [ 435, 4468]], dtype=int64)

Classification model for Amazon Reviews

In [28]:
# Import Libraries
import pandas as pd
import spacy

In [29]:
# Read the dataset
df = pd.read_csv('amazonreviews.tsv', sep = '\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [40]:
# Cleaning data
def clean_text(df):
  nlp = spacy.load('en_core_web_sm')
  for i in range(df.shape[0]):
    doc = nlp(df['review'][i])
    text = [w.lemma_.lower().strip() for w in doc
            if not (w.is_stop | w.is_punct | w.is_digit)]
    text = " ".join(text)
    df['review'][i] = text

    return df

In [41]:
clean_df = clean_text(df)

In [42]:
clean_df

Unnamed: 0,label,review
0,pos,stun non gamer sound track beautiful paint sen...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
9995,pos,A revelation of life in small town America in ...
9996,pos,Great biography of a very interesting journali...
9997,neg,Interesting Subject; Poor Presentation: You'd ...
9998,neg,Don't buy: The box looked used and it is obvio...


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()

X = tf.fit_transform(clean_df['review'])
y = clean_df['label']

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 64)

In [45]:
# Naive Bayes
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()
nb.fit(X_train,y_train)

BernoulliNB()

In [47]:
y_pred = nb.predict(X_test)

In [48]:
y_pred

array(['neg', 'pos', 'neg', ..., 'neg', 'pos', 'pos'], dtype='<U3')

In [49]:
from sklearn.metrics import classification_report
print("NB Classification report: \n" ,classification_report(y_test,y_pred))

NB Classification report: 
               precision    recall  f1-score   support

         neg       0.86      0.87      0.86      1024
         pos       0.86      0.85      0.85       976

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [50]:
#Linear SVC
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
print("SVC Classification report: \n" ,classification_report(y_test,y_pred))

SVC Classification report: 
               precision    recall  f1-score   support

         neg       0.88      0.87      0.87      1024
         pos       0.86      0.87      0.87       976

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000

