In [None]:
import pandas as pd
import numpy as np

# Try to read the CSV file, handling bad lines by skipping them
try:
    df = pd.read_csv("IMDB Dataset.csv", on_bad_lines='skip', engine='python')
except pd.errors.ParserError as e:
    # If ParserError is encountered, print the error message and the problematic row
    print(f"ParserError: {e}")
    # Attempt to read the problematic row to provide more context
    try:
        with open("IMDB Dataset.csv", 'r') as file:
            for i, line in enumerate(file):
                if i == 18199:  # Adjust row number based on error message
                    print(f"Problematic row ({i + 1}): {line}")
                    break
    except Exception as e:
        print(f"Error reading problematic row: {e}")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.shape

(11038, 2)

In [None]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [None]:
# text cleaning
df = df.sample(10000)

In [None]:
df.shape

(10000, 2)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 7671 to 3515
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [None]:
df['sentiment'].replace({'positive': 1,'negative':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace({'positive': 1,'negative':0},inplace=True)
  df['sentiment'].replace({'positive': 1,'negative':0},inplace=True)


In [None]:
df.head()

Unnamed: 0,review,sentiment
7671,The timing of this film being released could n...,1
7481,i say the domino principle is an enormously un...,1
495,"""American Nightmare"" is officially tied, in my...",0
1557,This movie is one of the most provocative Jesu...,1
9988,For a Norris movie this is pretty tame. For an...,0


In [None]:
import re

In [None]:
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

'"American Nightmare" is officially tied, in my opinion, with "It\'s Pat!" for the WORST MOVIE OF ALL TIME.Seven friends (oddly resembling the K-Mart version of the cast of "Friends") gather in a coffee shop to listen to American Nightmare, a pirate radio show. It\'s hosted by a guy with a beard. That\'s the most exciting aspect of his show.Chandler, Monica, Joey, and... oh wait, I mean, Wayne, Jessie, and the rest of the bad one-liner spouting gang all take turns revealing their biggest fears to the bearded DJ. Unbeknownst to them, a crazed nurse/serial killer is listening...Crazy Nurse then proceeds to torture Ross and Rachel and... wait, sorry again... by making their fears come to life. These fears include such stunners as "voodoo" and being gone down on by old ladies with dentures.No. Really.This movie was, in a word, rotten. Crazy Nurse\'s killing spree lacks motivation, there\'s nothing to make the viewer "jump," the ending blows, and--again--voodoo?If you have absolutely no reg

In [None]:
# function to clean html tag
def clean_html(text):
  clean = re.compile('<.*?>,....')
  return re.sub(clean,'',text)


In [None]:
df['review'] = df['review'].apply(clean_html)

In [None]:
# converting to lower case
def convert_lower(text):
  return text.lower()

In [None]:
df['review'] = df['review'].apply(convert_lower)

In [None]:
df['review'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [None]:
# remove special characters
def remove_special(text):
  import re
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  return text

  # x = ''

  # for i in text:
  #   if i.isalnum():
  #     x= x+i
  #   else:
  #     x = x + ' '
  # return x


In [None]:
remove_special('one of the other reviewers has mentioned that after watching just 1 oz episode you\'ll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.i would say the main appeal of the show is due to the fact that it goes where other shows wouldn\'t dare. forget pretty pictures painted for mainstream audiences, forget charm, forget romance...oz doesn\'t mess around. the first episode i ever saw struck me as so nasty it was surreal, i couldn\'t say i was ready for it, but as i watched more, i developed a taste for oz, and got accustomed to the high levels of graphic violence. not just violence, but injustice (crooked guards who\'ll be sold out for a nickel, inmates who\'ll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) watching oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.')

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

In [None]:
df['review'] = df['review'].apply(remove_special)

In [None]:
df['review'].iloc[1]

'i say the domino principle is an enormously underappreciated filmanyone who has taken the time to investigate our contemporary history of conspiraciesjfk rfk mlkgwallace and in fact numerous others can only draw the conclusion that the author of the domino principle really knew what he was talking aboutroy tucker could be lee harvey oswald or james earl ray or sirhan sirhan or arthur bremer maybe even john hinkley or timothy mcveighto mention a fewthe conspiracy scenario involving spies big business and political assassinations is not really a fiction but an ominous part of our convoluted existential historygod help usbut the domino principle is more fact than fantasyif this causes a little loss of sleep maybe it shoulddont take my word for itinvestigate for yourselves'

In [None]:
# remove stopwords

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
df

Unnamed: 0,review,sentiment
7671,the timing of this film being released could n...,1
7481,i say the domino principle is an enormously un...,1
495,american nightmare is officially tied in my op...,0
1557,this movie is one of the most provocative jesu...,1
9988,for a norris movie this is pretty tame for an ...,0
...,...,...
1828,this movie should have never been madebr br wh...,0
119,i greatly enjoyed margaret atwoods novel the r...,0
9343,i am a big fan of deepa mehtas work especially...,0
9674,whats wrong with this film many many things th...,0


In [None]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
def remove_stopwords(text):
  x=[]
  for i in text.split():
    if i not in stopwords.words('english'):
      x.append(i)

  y = x[:]
  x.clear()
  return y

In [None]:
df['review'] = df['review'].apply(remove_stopwords)

In [37]:
df

Unnamed: 0,review,sentiment
7671,"[timing, film, released, could, better, partic...",1
7481,"[say, domino, principle, enormously, underappr...",1
495,"[american, nightmare, officially, tied, opinio...",0
1557,"[movie, one, provocative, jesus, movies, ever,...",1
9988,"[norris, movie, pretty, tame, action, movie, k...",0
...,...,...
1828,"[movie, never, madebr, br, shame, budgetbr, br...",0
119,"[greatly, enjoyed, margaret, atwoods, novel, r...",0
9343,"[big, fan, deepa, mehtas, work, especially, fi...",0
9674,"[whats, wrong, film, many, many, things, editi...",0


In [38]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [39]:
y = []
def stem_words(text):
  for i in text:
    y.append(ps.stem(i))

  z = y[:]
  y.clear()
  return z


In [40]:
df['review'] = df['review'].apply(stem_words)

In [44]:
df

Unnamed: 0,review,sentiment
7671,time film releas could better particularli lig...,1
7481,say domino principl enorm underappreci filmany...,1
495,american nightmar offici tie opinion pat worst...,0
1557,movi one provoc jesu movi ever seen seek tell ...,1
9988,norri movi pretti tame action movi kind dull f...,0
...,...,...
1828,movi never madebr br shame budgetbr br pleas h...,0
119,greatli enjoy margaret atwood novel robber bri...,0
9343,big fan deepa mehta work especi fire earth 194...,0
9674,what wrong film mani mani thing edit tri hard ...,0


In [41]:
def join_back(list_input):
  return " ".join(list_input)

In [42]:
df['review'] = df['review'].apply(join_back)

In [43]:
df

Unnamed: 0,review,sentiment
7671,time film releas could better particularli lig...,1
7481,say domino principl enorm underappreci filmany...,1
495,american nightmar offici tie opinion pat worst...,0
1557,movi one provoc jesu movi ever seen seek tell ...,1
9988,norri movi pretti tame action movi kind dull f...,0
...,...,...
1828,movi never madebr br shame budgetbr br pleas h...,0
119,greatli enjoy margaret atwood novel robber bri...,0
9343,big fan deepa mehta work especi fire earth 194...,0
9674,what wrong film mani mani thing edit tri hard ...,0


In [46]:
df['review']

Unnamed: 0,review
7671,time film releas could better particularli lig...
7481,say domino principl enorm underappreci filmany...
495,american nightmar offici tie opinion pat worst...
1557,movi one provoc jesu movi ever seen seek tell ...
9988,norri movi pretti tame action movi kind dull f...
...,...
1828,movi never madebr br shame budgetbr br pleas h...
119,greatli enjoy margaret atwood novel robber bri...
9343,big fan deepa mehta work especi fire earth 194...
9674,what wrong film mani mani thing edit tri hard ...


In [47]:
X = df.iloc[:,0:1].values

In [48]:
X.shape

(10000, 1)

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000)

In [81]:
X = cv.fit_transform(df['review']).toarray()

In [82]:
X.shape

(10000, 1000)

In [83]:
X[1000].max()

np.int64(3)

In [84]:
y = df.iloc[:,-1].values

In [85]:
y

array([1, 1, 0, ..., 0, 0, 1])

In [86]:
from sklearn.model_selection import train_test_split

In [87]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [88]:
X_train.shape

(8000, 1000)

In [89]:
X_test.shape

(2000, 1000)

In [90]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [91]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [92]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [93]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [94]:
y_pred1.shape

(2000,)

In [95]:
y_pred2.shape

(2000,)

In [96]:
from sklearn.metrics import accuracy_score

In [97]:
print("Gaussian",accuracy_score(y_test,y_pred1))
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernaulli",accuracy_score(y_test,y_pred3))

Gaussian 0.7945
Multinomial 0.827
Bernaulli 0.825
