<a href="https://colab.research.google.com/github/saidrishya/NLP/blob/main/moviereview_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated movie-review file into a DataFrame and preview the first rows.
data = pd.read_csv('/content/moviereviews.tsv', delimiter='\t')
data.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# Print scikit-learn's built-in English stop-word collection for reference.
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

frozenset({'less', 'interest', 'hence', 'yours', 'some', 'could', 'one', 'etc', 'no', 'whoever', 'anything', 'move', 'not', 'mostly', 'hereafter', 'hers', 'indeed', 'whereby', 'across', 'throughout', 'two', 'former', 'within', 'against', 'his', 'somehow', 'even', 'only', 'hasnt', 'de', 'none', 'couldnt', 'several', 'me', 'towards', 'thereafter', 'serious', 'ours', 'our', 'then', 'detail', 'amongst', 'ten', 'fifteen', 'wherever', 'these', 'noone', 'still', 'next', 'themselves', 'yourself', 'although', 'have', 'seemed', 'found', 'however', 'during', 'whose', 'others', 'done', 'each', 'a', 'after', 'wherein', 'much', 'un', 'until', 'her', 'show', 'becoming', 'own', 'put', 'five', 'or', 'you', 'often', 'alone', 'to', 'than', 'least', 'too', 'latterly', 'please', 'except', 'anywhere', 'do', 'cant', 'most', 'would', 'eleven', 'else', 'around', 'she', 'being', 'system', 'must', 'if', 'there', 'might', 'him', 'due', 'up', 'thence', 'yourselves', 'amount', 'when', 'bill', 'behind', 'per', 'alon

In [4]:
# Custom stop-word list, deliberately shorter than scikit-learn's built-in one:
# some of the built-in stop words can actually be useful in the context of movie reviews.
stopwords = (
    'a about an and are as at be been but by can '
    'even ever for from get had has have he her hers his '
    'how i if in into is it its just me my of on or '
    'see seen she so than that the their there they this '
    'to was we were what when which who will with you'
).split()
             

In [5]:
# Number of rows loaded, before any cleaning.
data.shape[0]

2000

In [6]:
# Print the full text of the review stored under index label 1.
print(data.loc[1, 'review'])

some talented actresses are blessed with a demonstrated wide acting range while others , almost as gifted , have more limited types of parts for which they are suitable . 
as was amply evident after basic instinct , sharon stone can play sensual roles with great abandon . 
rejecting her natural abilities , she has spent the rest of her entire career trying with little success to play against type . 
gloria is her latest disaster . 
babe ruth didn't quit baseball after one season to play football in a quixotic quest to prove his athletic dexterity , and neither should stone reject what she does best . 
janeane garofalo , for example , is no less wonderful an actress because she could have never pulled off stone's part in basic instinct ; neither is stone any less talented because she couldn't do garofalo's comedic roles . 
gloria , directed by respected director sidney lumet and adapted by steve antin from the 1980 screenplay by john cassavetes , was not screened in advance for cr

In [7]:
# Count missing values per column.
data.isna().sum()

label      0
review    35
dtype: int64

In [8]:
# Discard every row that has a missing value; rebind instead of mutating in place.
data = data.dropna()

In [9]:
# Re-check missing values per column after the rows with NaN were dropped.
data.isna().sum()

label     0
review    0
dtype: int64

In [10]:
# A review can be present (not NaN) and still contain nothing but whitespace.
# Quick demonstration of str.isspace(): it is True only for a non-empty string
# made up entirely of whitespace (note that ''.isspace() is False).
s1 = 'hi how are y9u'  # ordinary text
s2 = ' '  # whitespace only

s1.isspace()

False

In [11]:
# The whitespace-only string is reported as blank.
s2.isspace()

True

In [12]:
# Collect the index labels of reviews that are present but blank
# (empty or whitespace-only) so they can be dropped in a later step.
# - isinstance() skips any non-string cell instead of failing on it.
# - strip() is used rather than isspace() because ''.isspace() is False,
#   which would let empty-string reviews slip through.
# - Working on the 'review' column directly avoids unpacking itertuples()
#   into a fixed number of names, which breaks if the frame gains a column.
blank_mask = data['review'].map(
    lambda review: isinstance(review, str) and not review.strip()
).astype(bool)  # astype keeps the mask boolean even when the frame is empty
blanks = blank_mask[blank_mask].index.tolist()

In [13]:
# Index labels of the whitespace-only reviews found above.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [14]:
# Remove the rows whose index labels were collected in `blanks`; rebind instead of mutating in place.
data = data.drop(index=blanks)

In [15]:
# Number of rows left after dropping missing and blank reviews.
data.shape[0]

1938

In [16]:
# Class balance: number of remaining reviews per label.
data['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [17]:
# Hold out part of the reviews for evaluation.
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the labels.
X, y = data['review'], data['label']

# 33% of the rows go to the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [21]:
# Build two pipelines that each vectorize the text with TF-IDF and then fit a classifier.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


def _make_text_clf(classifier):
    """Return a TF-IDF -> classifier pipeline that uses the custom stop-word list.

    A new TfidfVectorizer is created on every call, so the returned pipelines
    never share a fitted vectorizer.
    """
    return Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                     ('clf', classifier),
    ])


# Naïve Bayes:
text_clf_nb = _make_text_clf(MultinomialNB())

# Linear SVC: its solver draws random numbers while fitting, so it is seeded with
# the same value as the train/test split to keep reruns reproducible.
text_clf_lsvc = _make_text_clf(LinearSVC(random_state=42))

In [22]:
# Train the Naïve Bayes pipeline on the training reviews, then label the held-out reviews.
# fit() returns the pipeline itself, so the two steps can be chained.
y_pred_nb = text_clf_nb.fit(X_train, y_train).predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the Naïve Bayes predictions against the held-out labels.
cm = confusion_matrix(y_test, y_pred_nb)
ac = accuracy_score(y_test, y_pred_nb)
classificationReport = classification_report(y_test, y_pred_nb)

# Confusion matrix, accuracy and per-class report, separated by blank lines.
print(cm, 'Accuracy:' + str(ac), classificationReport, sep='\n\n')

[[282  26]
 [105 227]]

Accuracy:0.7953125

              precision    recall  f1-score   support

         neg       0.73      0.92      0.81       308
         pos       0.90      0.68      0.78       332

    accuracy                           0.80       640
   macro avg       0.81      0.80      0.79       640
weighted avg       0.82      0.80      0.79       640



In [24]:
# Same procedure for the linear SVC pipeline: train, then label the held-out reviews.
# fit() returns the pipeline itself, so the two steps can be chained.
y_pred_lsvc = text_clf_lsvc.fit(X_train, y_train).predict(X_test)

In [25]:
# Score the linear SVC predictions against the held-out labels.
cm = confusion_matrix(y_test, y_pred_lsvc)
ac = accuracy_score(y_test, y_pred_lsvc)
classificationReport = classification_report(y_test, y_pred_lsvc)

# Confusion matrix, accuracy and per-class report, separated by blank lines.
print(cm, 'Accuracy:' + str(ac), classificationReport, sep='\n\n')

[[256  52]
 [ 48 284]]

Accuracy:0.84375

              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       308
         pos       0.85      0.86      0.85       332

    accuracy                           0.84       640
   macro avg       0.84      0.84      0.84       640
weighted avg       0.84      0.84      0.84       640

