In [None]:
# Import Libraries
import pandas as pd
import spacy

In [None]:
# Read the dataset
df = pd.read_csv('imdb_labelled.txt', sep = '\t', header = None)
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
columns_name = ['Review', 'Sentiment']

df.columns = columns_name

In [None]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# Cleaning data
def clean_text(df):
  nlp = spacy.load('en')
  for i in range(df.shape[0]):
    doc = nlp(df['Review'][i])
    text = [w.lemma_.lower().strip() for w in doc
            if not (w.is_stop | w.is_punct | w.is_digit)]
    text = " ".join(text)

    if i<5:print('Sentence',i,text)
    df['Review'][i] = text

    return df

In [None]:
imdb_df = clean_text(df)

Sentence 0 slow aimless movie distress drift young man


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
imdb_df

Unnamed: 0,Review,Sentiment
0,slow aimless movie distress drift young man,0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()

X = tf.fit_transform(imdb_df['Review'])
y = imdb_df['Sentiment']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 64)

In [None]:
# Naive Bayes
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()
nb.fit(X_train,y_train)

BernoulliNB()

In [None]:
y_pred = nb.predict(X_test)

In [None]:
y_pred

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0])

In [None]:
from sklearn.metrics import classification_report
print("NB Classification report: \n" ,classification_report(y_test,y_pred))

NB Classification report: 
               precision    recall  f1-score   support

           0       0.75      0.64      0.69        70
           1       0.72      0.81      0.76        80

    accuracy                           0.73       150
   macro avg       0.74      0.73      0.73       150
weighted avg       0.74      0.73      0.73       150



In [None]:
#Linear SVC
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
print("SVC Classification report: \n" ,classification_report(y_test,y_pred))

SVC Classification report: 
               precision    recall  f1-score   support

           0       0.76      0.74      0.75        70
           1       0.78      0.80      0.79        80

    accuracy                           0.77       150
   macro avg       0.77      0.77      0.77       150
weighted avg       0.77      0.77      0.77       150

