In [1]:
# Importing dataset
import pandas as pd
df = pd.read_csv("IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [3]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
# Checking for reviews that still have html tags left in them
df[df['review'].str.contains("<br /><br />", case=True)]

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49994,This is your typical junk comedy.<br /><br />T...,negative
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative


In [5]:
# Removing HTML tags left behind
from bs4 import BeautifulSoup
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

df['review'] = df['review'].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'html.parser')


In [6]:
# Checking for html again
df[df['review'].str.contains("<br />", case=True)]

Unnamed: 0,review,sentiment


In [7]:
# Hence there is no html tags in the dataset, the dataset is ready for model building.

In [8]:
# Determining input(response variable) and outpu(predictor variable)
x = df['review']
y = df['sentiment']

In [9]:
# Splitting dataset into training and testing set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, random_state=69)
x_train.shape

(37500,)

In [10]:
# Tokenizing text data (creating count matrix with count of each word)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_train_counts.shape

(37500, 92447)

In [11]:
# Checking how many times a sample word occurs in the dataset
count_vect.vocabulary_.get(u'actor')

2166

In [12]:
# Normalizing the count matrix.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
x_train_tfidf = tfidf.fit_transform(x_train_counts)
x_train_tfidf.shape

(37500, 92447)

In [13]:
# Training a classifier (SVC)
from sklearn.svm import LinearSVC
lsvc = LinearSVC()
lsvc.fit(x_train_tfidf, y_train)

In [14]:
# Checking accuracy of SVC
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = tfidf.transform(x_test_counts)
lsvc.score(x_test_tfidf, y_test)*100

90.008

In [15]:
x_new = ["This movie is great"]
x_new_counts = count_vect.transform(x_new)
x_new_tfidf = tfidf.transform(x_new_counts)
lsvc.predict(x_new_tfidf)[0]

'positive'

In [16]:
x_new = ["This movie is bad"]
x_new_counts = count_vect.transform(x_new)
x_new_tfidf = tfidf.transform(x_new_counts)
lsvc.predict(x_new_tfidf)[0]

'negative'

In [17]:
predicted = lsvc.predict(x_test_tfidf)
predicted

array(['negative', 'positive', 'negative', ..., 'negative', 'negative',
       'positive'], dtype=object)

In [18]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted, target_names=df['sentiment'].unique()))

              precision    recall  f1-score   support

    positive       0.90      0.89      0.90      6182
    negative       0.90      0.91      0.90      6318

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500



In [19]:
metrics.confusion_matrix(y_test, predicted)

array([[5516,  666],
       [ 583, 5735]], dtype=int64)

In [20]:
# Model gives good accuracy, now creating a pipeline
from sklearn.pipeline import Pipeline
review_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lsvc', LinearSVC()),
])

In [21]:
review_clf.fit(x_train, y_train)

In [22]:
test = ['''
I recently watched the movie "The Shawshank Redemption" and it was truly a cinematic masterpiece. The story was gripping and emotional, the 
characters were well-developed and acted with great depth, and the direction and cinematography were superb. I was on the edge of my seat 
throughout the entire film and was moved to tears by the powerful ending. This movie is a must-see for anyone who appreciates great storytelling 
and exceptional filmmaking.''']
review_clf.predict(test)[0]


'positive'

In [27]:
# Performing gridsearch for parameters
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'lsvc__penalty': ['l1', 'l2'], 
    'lsvc__loss': ['hinge', 'squared_hinge'], 
    'lsvc__tol': [0.0001, 0.001], 
    'lsvc__C': [0.1, 1, 10]
}

In [28]:
gs_review_clf = GridSearchCV(review_clf, parameters, scoring='accuracy', cv=5, n_jobs=-1)

In [29]:
gs_review_clf.fit(x_train, y_train)
gs_review_clf.best_score_*100

240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sanya\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sanya\AppData\Roaming\Python\Python310\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\sanya\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\sanya\AppData\Roaming\Python\Python

91.41866666666667

In [31]:
for param_name in sorted(parameters.keys()):
    print("%s: %r"%(param_name, gs_review_clf.best_params_[param_name]))

lsvc__C: 10
lsvc__loss: 'hinge'
lsvc__penalty: 'l2'
lsvc__tol: 0.0001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [33]:
# We get minute accuracy boost using grid search

In [34]:
# Creating a file of the model for future use
import joblib
joblib.dump(gs_review_clf, "review_clf")

['review_clf']