In [1]:
import numpy as np
import pandas as pd
import re
import string

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
sw = stopwords.words('english')

In [3]:
rev = pd.read_csv('IMDB Dataset.csv')

In [4]:
rev.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
rev

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
rev['sentiment'] = rev['sentiment'].map({'positive':1, 'negative':0})

In [8]:
rev.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
rev

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [10]:
rev = rev[:10000]

In [11]:
rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [12]:
rev['review'] = rev['review'].apply(lambda x:x.lower())

In [13]:
rev['review'] = rev['review'].apply(lambda x:re.sub(r"@\S+","", x)) #handling spaces

In [14]:
#here first line,remove punctuatins, can be done thrgh regex, but trying different approach
rev['review'] = rev['review'].apply(lambda x : " ".join([word for word in x.split() if word not in string.punctuation]))
#here removng stopwords
rev['review'] = rev['review'].apply(lambda x : " ".join([word for word in x.split() if word not in sw]))

In [15]:
#here we are not converting to base words, as each sentence is less

In [16]:
x_train, x_test, y_train, y_test = train_test_split(rev.review, rev.sentiment, train_size=0.7, random_state=30)

In [17]:
#test datframe
#test_dataframe = pd.DataFrame({'review':p})

In [18]:
#Due to the problem of “Data leakage”  the best approach is to split the data into training and testing and then apply the scaling/ vectorisation method
tf = TfidfVectorizer()
tf_x_train = tf.fit_transform(x_train) #here converting text to numbers

In [19]:
tf_x_train

<7000x45207 sparse matrix of type '<class 'numpy.float64'>'
	with 726183 stored elements in Compressed Sparse Row format>

In [20]:
tf_x_test = tf.transform(x_test)

In [21]:
tf_x_test

<3000x45207 sparse matrix of type '<class 'numpy.float64'>'
	with 303533 stored elements in Compressed Sparse Row format>

# svm

In [22]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [23]:
#fitting training data into model

In [24]:
clf.fit(tf_x_train, y_train)

LinearSVC(random_state=0)

In [25]:
y_test_pred = clf.predict(tf_x_test)

In [26]:
from sklearn.metrics import classification_report
report = classification_report(y_test,y_test_pred, output_dict=True)

In [27]:
report

{'0': {'precision': 0.8705722070844687,
  'recall': 0.8741450068399452,
  'f1-score': 0.8723549488054608,
  'support': 1462},
 '1': {'precision': 0.8798955613577023,
  'recall': 0.8764629388816645,
  'f1-score': 0.8781758957654722,
  'support': 1538},
 'accuracy': 0.8753333333333333,
 'macro avg': {'precision': 0.8752338842210855,
  'recall': 0.8753039728608049,
  'f1-score': 0.8752654222854666,
  'support': 3000},
 'weighted avg': {'precision': 0.8753519800418799,
  'recall': 0.8753333333333333,
  'f1-score': 0.8753391542802933,
  'support': 3000}}

# Gaussian

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [29]:
model_nb = GaussianNB()

In [31]:
model_nb.fit(tf_x_train.toarray(), y_train)

GaussianNB()

In [33]:
y_test_pred_gausn = model_nb.predict(tf_x_test.toarray())

In [34]:
report = classification_report(y_test,y_test_pred_gausn, output_dict=True)

In [35]:
report

{'0': {'precision': 0.6045918367346939,
  'recall': 0.6484268125854993,
  'f1-score': 0.6257425742574256,
  'support': 1462},
 '1': {'precision': 0.6410614525139665,
  'recall': 0.5968790637191157,
  'f1-score': 0.6181818181818182,
  'support': 1538},
 'accuracy': 0.622,
 'macro avg': {'precision': 0.6228266446243302,
  'recall': 0.6226529381523075,
  'f1-score': 0.6219621962196219,
  'support': 3000},
 'weighted avg': {'precision': 0.6232885930908677,
  'recall': 0.622,
  'f1-score': 0.6218664266426641,
  'support': 3000}}

In [36]:
#Here we see svm gives better accuracy than Gaussian

# Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

model_lr =LogisticRegression()
model_lr.fit(tf_x_train, y_train)
y_predict_lr = model_lr.predict(tf_x_test)
report_lr =classification_report(y_test, y_predict_lr, output_dict=True)

In [38]:
report_lr

{'0': {'precision': 0.8771686328938237,
  'recall': 0.8645690834473324,
  'f1-score': 0.8708232862555976,
  'support': 1462},
 '1': {'precision': 0.8729955099422707,
  'recall': 0.8849154746423927,
  'f1-score': 0.8789150791088151,
  'support': 1538},
 'accuracy': 0.875,
 'macro avg': {'precision': 0.8750820714180472,
  'recall': 0.8747422790448626,
  'f1-score': 0.8748691826822064,
  'support': 3000},
 'weighted avg': {'precision': 0.875029211860661,
  'recall': 0.875,
  'f1-score': 0.8749716787250137,
  'support': 3000}}

In [39]:
#todo
#hyperparmeters finetuning

#grid-search cvr