In [1]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

In [3]:
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('movie.csv')

In [5]:
data

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [6]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
#selecting some random data to reduce data size and save training time for practice purpose

data = data.sample(frac= 0.2, random_state=42) 

In [8]:
data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,text,label
0,The central theme in this movie seems to be co...,0
1,"An excellent example of ""cowboy noir"", as it's...",1
2,The ending made my heart jump up into my throa...,0
3,Only the chosen ones will appreciate the quali...,1
4,"This is a really funny film, especially the se...",1
...,...,...
7995,Lisa Baumer (Ida Galli) is the adulteress wife...,1
7996,I let a friend talk me into viewing this movie...,0
7997,Wow. A truly fantastic 'trip' movie that has t...,1
7998,Skip Mission: Galactica and watch the original...,0


In [9]:
data['label'].value_counts()
#i.e. data is balanced

label
1    4034
0    3966
Name: count, dtype: int64

In [10]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [11]:
import re

In [12]:
data['text'][0]

"The central theme in this movie seems to be confusion, as the relationships, setting, acting and social context all lead to the same place: confusion. Even Harvey Keitel appears to be out of his element, and lacks his usual impeccable clarity, direction and intensity. To make matters worse, his character's name is 'Che', and we are only told (directly, by the narrator) well into the film that he is not 'that' Che, just a guy named Che. The family relationships remain unclear until the end of the film, and once defined, the family is divided - the younger generation off to America. So cliché. Other reviews discuss how the movie depicts the impact of the revolution on a boy's family; however the political stance of the director is murky at best, and we are never quite sure who is responsible for what bloodshed. So they lost their property (acquired by gambling profits) - so what? Refusing to take a political stand, when making a movie about the Cuban revolution, is an odd and cowardly c

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()




In [14]:
stopwords = stopwords.words("english")

# Words to retain for sentiment analysis
# List of negative stopwords in English
negative_stopwords = [
    "no", "not", "never", "none", "nobody", "nowhere",
    "neither", "nor", "without", "against", "barely",
    "hardly", "scarcely", "cannot", "won't", "don't",
    "doesn't", "isn't", "aren't", "wasn't", "weren't",
    "haven't", "hasn't", "hadn't", "wouldn't", "shouldn't",
    "couldn't", "can't", "didn't", "mustn't", "shan't",
    "mightn't", "needn't"
]

# Customize stop words by removing sentiment-relevant words
custom_stopwords = set(stopwords) - set(negative_stopwords)

custom_stopwords


{'a',
 'about',
 'above',
 'after',
 'again',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'more',
 'most',
 'mustn',
 'my',
 'myself',
 'needn',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',


In [15]:
def preprocess_review(review):

    cleaned_sentence = re.sub('[^a-zA-Z]', ' ', review).lower()
    words = cleaned_sentence.split()
    filtered_tokens = [lemmatizer.lemmatize(word) for word in words if word not in custom_stopwords]
    cleaned_review = ' '.join(filtered_tokens)
    return cleaned_review


In [16]:
data['text'] = data['text'].fillna('').astype(str)

# Apply the function to the 'review' column
data['review'] = data['text'].apply(preprocess_review)

data

Unnamed: 0,text,label,review
0,The central theme in this movie seems to be co...,0,central theme movie seems confusion relationsh...
1,"An excellent example of ""cowboy noir"", as it's...",1,excellent example cowboy noir called unemploye...
2,The ending made my heart jump up into my throa...,0,ending made heart jump throat proceeded leave ...
3,Only the chosen ones will appreciate the quali...,1,chosen one appreciate quality story character ...
4,"This is a really funny film, especially the se...",1,really funny film especially second third four...
...,...,...,...
7995,Lisa Baumer (Ida Galli) is the adulteress wife...,1,lisa baumer ida galli adulteress wife big busi...
7996,I let a friend talk me into viewing this movie...,0,let friend talk viewing movie say want kill fr...
7997,Wow. A truly fantastic 'trip' movie that has t...,1,wow truly fantastic trip movie ton super surre...
7998,Skip Mission: Galactica and watch the original...,0,skip mission galactica watch original living l...


In [17]:
%pip install wordcloud
from wordcloud import WordCloud

Note: you may need to restart the kernel to use updated packages.


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv = CountVectorizer(max_features=2000)

In [20]:
x = cv.fit_transform(data['review']).toarray()

In [21]:
x.shape

(8000, 2000)

In [22]:
y = data['label']

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)


In [25]:
from sklearn.metrics import accuracy_score

In [26]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_test)
accuracy_score(y_test, y_pred)

0.77

In [27]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)
accuracy_score(y_test, y_pred)

0.8275

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test, y_pred)

0.814375