# Dataset: https://www.kaggle.com/nltkdata/movie-review

### Import the library 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

### Import Data

In [2]:
data = pd.read_csv('movie_review.csv')

In [3]:
data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [4]:
X = data.iloc[:,-2]

In [5]:
y = data.iloc[:,-1]

In [6]:
X

0        films adapted from comic books have had plenty...
1        for starters , it was created by alan moore ( ...
2        to say moore and campbell thoroughly researche...
3        the book ( or " graphic novel , " if you will ...
4        in other words , don't dismiss this film becau...
                               ...                        
64715    that lack of inspiration can be traced back to...
64716    like too many of the skits on the current inca...
64717    after watching one of the " roxbury " skits on...
64718     bump unsuspecting women , and . . . that's all .
64719    after watching _a_night_at_the_roxbury_ , you'...
Name: text, Length: 64720, dtype: object

## CountVectorizer

In [7]:
# from sklearn.feature_extraction.text import CountVectorizer
# corpus = ['This is the first document.',
#             'This document is the second document.',
#             'And this is the third one.',
#             'Is this the first document?']
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names())
# print(X.toarray())

## Train test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

### TfidfVectorizer documentation 
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

### Working With Text Data tutorial 
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [9]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# corpus = ['This is the first document.',
#             'This document is the second document.',
#             'And this is the third one.',
#             'Is this the first document?']

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names())

# print(X.toarray())

# LinearSVC model

In [10]:
from sklearn.svm import LinearSVC
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_pred,y_test))


0.6983158220024722


In [11]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         neg       0.69      0.69      0.69      6336
         pos       0.70      0.71      0.71      6608

    accuracy                           0.70     12944
   macro avg       0.70      0.70      0.70     12944
weighted avg       0.70      0.70      0.70     12944



In [12]:
print(confusion_matrix(y_pred,y_test))

[[4343 1993]
 [1912 4696]]


# Naive bayes model

In [13]:
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_pred,y_test))

0.7074320148331273


In [14]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         neg       0.68      0.70      0.69      6080
         pos       0.73      0.71      0.72      6864

    accuracy                           0.71     12944
   macro avg       0.71      0.71      0.71     12944
weighted avg       0.71      0.71      0.71     12944



In [15]:
print(confusion_matrix(y_pred,y_test))

[[4274 1806]
 [1981 4883]]


## KNN Model

In [16]:
from sklearn.neighbors import KNeighborsClassifier
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_pred,y_test))

0.5089616810877626


In [17]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         neg       0.63      0.49      0.56      8035
         pos       0.39      0.53      0.45      4909

    accuracy                           0.51     12944
   macro avg       0.51      0.51      0.50     12944
weighted avg       0.54      0.51      0.52     12944



In [18]:
print(confusion_matrix(y_pred,y_test))

[[3967 4068]
 [2288 2621]]


## XGBoost

In [19]:
import xgboost as xgb
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', xgb.XGBClassifier()),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_pred,y_test))



0.6269313967861557


In [20]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         neg       0.59      0.62      0.61      6016
         pos       0.66      0.63      0.65      6928

    accuracy                           0.63     12944
   macro avg       0.63      0.63      0.63     12944
weighted avg       0.63      0.63      0.63     12944



In [21]:
print(confusion_matrix(y_pred,y_test))

[[3721 2295]
 [2534 4394]]


# Random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier()),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(accuracy_score(y_pred,y_test))

0.6355840543881335


In [23]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         neg       0.58      0.63      0.61      5774
         pos       0.68      0.64      0.66      7170

    accuracy                           0.64     12944
   macro avg       0.63      0.64      0.63     12944
weighted avg       0.64      0.64      0.64     12944



In [24]:
print(confusion_matrix(y_pred,y_test))

[[3656 2118]
 [2599 4571]]
