In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


Stop words are words that dont give distinct meaning in any way and dont provide any meaningful and important context in any way.

In [2]:
stop_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", 
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 
    'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', 
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', 
    "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
    "weren't", 'won', "won't", 'wouldn', "wouldn't"
]

In [3]:
df=pd.read_csv("IMDB_Dataset.csv",header=0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


I am using regex for trimming the word , removing any non word character and removing numbers , removing single character as an individual word because they were probably an error, removing multiple consecutive spaces and replacing them with a single space

In [6]:
def clean_text(text):
    text=text.lower()
    text=re.sub("\W"," ",text)
    text=re.sub(r"[0-9]"," ",text)
    text=re.sub(r"\s[a-z]\s"," ",text)
    text=re.sub(r"\s[a-z]$"," ",text)
    text=re.sub(r"^[a-z]\s"," ",text)
    text=re.sub(r"[\s]+"," ",text)

    return " ".join([word for word in text.split(" ") if not(word in stop_words)])

Review is cleaned now and stored in "clean_review"

In [7]:
df["clean_review"]=df["review"].apply(clean_text)

A tfidf vector is used to represent sentences in a vector format where each value tells you how important the corresponding word is in the document in a collection of documents , it considers the frequency of the word in that specific document in a collection of documents . Here tfidf vectorizer is only told to capture 500 most important words at max, DOCUMENT here refers to a particular review

Here Tfidf vectorizer converts X data into data with unique vocabulary words each as a column

In [8]:
vectorizer=TfidfVectorizer(max_features=500,stop_words="english",min_df=1)
X=vectorizer.fit_transform(df["clean_review"])
le=LabelEncoder()
y=le.fit_transform(df["sentiment"])


In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

Logistic regression model is used for classification,here logistic regression model is trained on train dataset of  tfidf vectors of each review along with the sentiments provided , the sentiments were text earlier but I label encoded them so the data input is integer

In [10]:
model=LogisticRegression(random_state=42)
model.fit(X_train,y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


High accuracy score obtained , this model can be trusted

In [11]:
y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.8385


0->Negative
1->Positive

In [12]:
list(le.classes_)

['negative', 'positive']

This imp_df is the features extracted along with their importance score in this model for deciding sentiment, descended , positive score means that the particular word leads to 1 more than 0 , 0->Negative sentiment , 1->Positive sentiment,so positive score leads to positive review and negative score means that the particular word leads to 0 more than 1 which means negative score leads to negative reviews more

In [13]:
imp_df=pd.Series(model.coef_[0],index=vectorizer.get_feature_names_out()).sort_values(ascending=False)

In [19]:
print(imp_df.head(20))
print("The above words tend to appear in positive reviews")

excellent     6.194152
great         5.498796
amazing       4.876764
wonderful     4.802846
perfect       4.618886
brilliant     4.476077
highly        4.189385
best          4.100167
favorite      3.844210
loved         3.602534
hilarious     3.570070
enjoyed       3.480391
today         3.377526
definitely    3.316021
enjoyable     3.134903
strong        3.015187
beautiful     2.957801
simple        2.910021
fun           2.907236
love          2.800790
dtype: float64
The above words tend to appear in positive reviews


In [20]:
print(imp_df.tail(20))
print("The above words tend to appear in negative reviews")

crap            -3.264856
script          -3.438154
save            -3.481307
attempt         -3.492431
stupid          -3.555869
minutes         -3.742372
instead         -3.813590
ridiculous      -4.198734
supposed        -4.219140
unfortunately   -4.277638
annoying        -4.575503
horrible        -4.809898
worse           -4.935904
poor            -5.405125
bad             -5.726435
terrible        -6.104481
boring          -6.628400
awful           -7.581103
waste           -8.058528
worst           -9.784058
dtype: float64
The above words tend to appear in negative reviews


In [21]:
print("Total reviews: ",len(df),"Positive reviews: ",len(df[df["sentiment"]=="positive"]),"Negative reviews: ",len(df[df["sentiment"]=="negative"]))

Total reviews:  50000 Positive reviews:  25000 Negative reviews:  25000
