In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('movies_sentiment_data.csv')

In [3]:
df

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive
...,...,...
18995,- Bad Stuff: This movie is real crap. Bad stun...,negative
18996,"If you've seen the trailer for this movie, you...",positive
18997,This has to be the all time best computer anim...,positive
18998,I've seen 'NSNA' just after I've seen all Roge...,positive


In [4]:
df.sentiment.value_counts()

positive    9500
negative    9500
Name: sentiment, dtype: int64

In [5]:
df['positive'] = df.sentiment.apply(lambda x : 1 if x== 'positive' else 0)
df

Unnamed: 0,review,sentiment,positive
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1
...,...,...,...
18995,- Bad Stuff: This movie is real crap. Bad stun...,negative,0
18996,"If you've seen the trailer for this movie, you...",positive,1
18997,This has to be the all time best computer anim...,positive,1
18998,I've seen 'NSNA' just after I've seen all Roge...,positive,1


In [6]:
df.positive.value_counts()

1    9500
0    9500
Name: positive, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(df.review, df.positive, test_size=0.2)

In [9]:
X_train

18717    Well, for starters, this actually was THE most...
672      If you've read the original novel, as I did, y...
7296     A slasher flick, made in the early 80's, has a...
4932     I don't know why, but when I am asked about ba...
17335    Believe me, I like horror movies. I like scien...
                               ...                        
14033    It is difficult to imagine how the engaging Da...
649      SPOILERS<br /><br />*<br /><br />*<br /><br />...
4780     How is it possible that a movie this bad can b...
16367    This film, which is based on a true story, com...
16936    I'm no horror movie buff, but my wife's nieces...
Name: review, Length: 15200, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
from sklearn.pipeline import Pipeline

In [12]:
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [17]:
clf_rf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=50, criterion='entropy'))
])

In [18]:
clf_rf.fit(X_train, y_train)

In [16]:
from sklearn.metrics import classification_report

y_pred_rf = clf.predict(X_test)

print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1912
           1       0.83      0.81      0.82      1888

    accuracy                           0.82      3800
   macro avg       0.83      0.82      0.82      3800
weighted avg       0.83      0.82      0.82      3800



In [19]:
clf_kn = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])

clf_kn.fit(X_train, y_train)

In [21]:
y_pred_kn = clf_kn.predict(X_test)

print(classification_report(y_test, y_pred_kn))

              precision    recall  f1-score   support

           0       0.64      0.63      0.63      1912
           1       0.63      0.64      0.64      1888

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800



In [22]:
clf_mn = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

clf_mn.fit(X_train, y_train)

In [23]:
y_pred_mn = clf_mn.predict(X_test)

print(classification_report(y_test, y_pred_mn))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1912
           1       0.86      0.81      0.83      1888

    accuracy                           0.84      3800
   macro avg       0.84      0.84      0.84      3800
weighted avg       0.84      0.84      0.84      3800

