In [1]:
import pandas as pd

In [2]:
dataset=pd.read_csv(r"C:\Users\wwrao\OneDrive\Documents\IMDB Dataset.csv")
dataset=dataset.iloc[:1000,:]

In [3]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import re
dataset["review"] = dataset["review"].apply(lambda x: re.sub(r"<.*?>", "", x))

In [5]:
dataset["review"]

0      One of the other reviewers has mentioned that ...
1      A wonderful little production. The filming tec...
2      I thought this was a wonderful way to spend ti...
3      Basically there's a family where a little boy ...
4      Petter Mattei's "Love in the Time of Money" is...
                             ...                        
995    Nothing is sacred. Just ask Ernie Fosselius. T...
996    I hated it. I hate self-aware pretentious inan...
997    I usually try to be professional and construct...
998    If you like me is going to see this in a film ...
999    This is like a zoology textbook, given that it...
Name: review, Length: 1000, dtype: object

In [6]:
def remove_url(text):
    plain_text=re.sub(r'http[s]?://\S+', '', text)
    return plain_text

In [7]:
dataset["review"]=dataset["review"].apply(remove_url)

In [8]:
dataset.duplicated().sum()

0

In [9]:
dataset.drop_duplicates(inplace=True)

In [10]:
dataset["review"]=dataset["review"].str.lower()
dataset.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [11]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stopwords(text):
    words = text.split()
    x = [word for word in words if word.lower() not in stop_words]
    return " ".join(x)

In [12]:
dataset["review"]=dataset["review"].apply(remove_stopwords)

In [13]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [14]:
story=[]
for doc in dataset["review"] :
    raw_sent=sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [15]:
story

[['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked'],
 ['right',
  'exactly',
  'happened',
  'me',
  'the',
  'first',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scenes',
  'violence',
  'set',
  'right',
  'word',
  'go'],
 ['trust', 'me', 'show', 'faint', 'hearted', 'timid'],
 ['show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence'],
 ['hardcore',
  'classic',
  'use',
  'word',
  'it',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary'],
 ['focuses',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cells',
  'glass',
  'fronts',
  'face',
  'inwards',
  'privacy',
  'high',
  'agenda'],
 ['em',
  'city',
  'home',
  'many',
  'aryans',
  'muslims',
  'gangstas',
  'latinos',
  'christians',
  'italians',
  'irish',
  'more',
  'so',
  'scuffles',
  'death',
  'stares',
  'dodgy',
  'dealings',
  'shady',
  'agreements',
  'never',
  'far

In [16]:
import gensim

In [17]:
model=gensim.models.Word2Vec(window=10,min_count=2)   # min_count mean that a sentence which is consist of one word are taken otherwwise are ignored

In [18]:
model.build_vocab(story,progress_per=1000)

In [19]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(557130, 623475)

In [20]:
model.wv.most_similar("bad")

[('good', 0.9996881484985352),
 ('acting', 0.999672532081604),
 ('really', 0.9996580481529236),
 ('even', 0.9996177554130554),
 ('movie', 0.9995967149734497),
 ('plot', 0.9995822310447693),
 ('watch', 0.9995567202568054),
 ('think', 0.9995469450950623),
 ('see', 0.9995426535606384),
 ('guy', 0.9995352625846863)]

In [21]:
model.wv.similarity(w1="bad",w2="book")

0.9993508

In [22]:
model.corpus_count

10377

In [23]:
model.wv.most_similar("help")

[('scene', 0.9999008774757385),
 ('also', 0.999896228313446),
 ('young', 0.9998958706855774),
 ('in', 0.9998878836631775),
 ('love', 0.9998875260353088),
 ('up', 0.9998859763145447),
 ('comes', 0.9998844265937805),
 ('without', 0.9998839497566223),
 ('beautiful', 0.9998834133148193),
 ('war', 0.9998834133148193)]

In [24]:
model.wv.similarity(w1="news",w2="book")

0.9981014

In [25]:
import numpy as np
def document_vector(doc):
    doc=[word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)

In [26]:


len(dataset["review"].apply(document_vector))


1000

In [28]:
from tqdm import tqdm

In [29]:
X=[]
for doc in tqdm(dataset["review"].values):
    X.append(document_vector(doc))

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:18<00:00, 55.05it/s]


In [30]:
X

[array([-0.20040315,  0.34758174,  0.16814277,  0.05572372,  0.00928914,
        -0.6531723 ,  0.13053323,  0.88319176, -0.23394188, -0.29310873,
        -0.12409635, -0.55215764, -0.08042108,  0.19732451,  0.02521991,
        -0.39693832, -0.0366601 , -0.46538678,  0.07828826, -0.8745797 ,
         0.25887308,  0.17656232,  0.21029851, -0.24631436,  0.00535308,
        -0.03317526, -0.20041129, -0.19074225, -0.3378752 , -0.06302961,
         0.32907593,  0.13300629,  0.03133677, -0.09940218, -0.12479153,
         0.40121138,  0.16968808, -0.3676336 , -0.17319061, -0.74631083,
         0.10017795, -0.42497954, -0.07412157, -0.01932468,  0.36439002,
        -0.18023278, -0.23186399,  0.02048154,  0.17161883,  0.30119246,
         0.10142455, -0.3518326 , -0.16595829, -0.12538362, -0.2631969 ,
         0.17001331,  0.32036096, -0.06002278, -0.28863665,  0.18299958,
         0.11935827,  0.25645533, -0.01281935, -0.04598246, -0.47305438,
         0.33135006,  0.0517855 ,  0.25674722, -0.5

In [31]:
x=np.array(X)

In [32]:
x

array([[-0.20040315,  0.34758174,  0.16814277, ..., -0.50855297,
         0.0341068 ,  0.12665822],
       [-0.23557658,  0.41960657,  0.20513646, ..., -0.6082801 ,
         0.04107264,  0.15269583],
       [-0.24285932,  0.4249004 ,  0.20762946, ..., -0.6189963 ,
         0.04163428,  0.15515567],
       ...,
       [-0.21432313,  0.3752544 ,  0.18462129, ..., -0.5488686 ,
         0.03816192,  0.13954242],
       [-0.28490233,  0.49663728,  0.2416515 , ..., -0.728268  ,
         0.04859219,  0.18172795],
       [-0.20126794,  0.35565743,  0.17281726, ..., -0.51684815,
         0.03468817,  0.1282286 ]], dtype=float32)

In [33]:
y=dataset.sentiment

In [34]:
from sklearn.preprocessing import LabelEncoder
l=LabelEncoder()

In [35]:
y=l.fit_transform(y)

In [36]:
y

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,

In [81]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=43
                                              )

In [82]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((800, 100), (200, 100), (800,), (200,))

In [83]:
from sklearn.naive_bayes import GaussianNB

In [84]:
g=GaussianNB()

In [85]:
g.fit(x_train,y_train)

In [86]:
y_predict=g.predict(x_test)
y_predict

array([1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1])

**Random forest Classifier**

In [87]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_predict)

0.47

In [88]:
from sklearn.ensemble import RandomForestClassifier
r=RandomForestClassifier()

In [89]:
r.fit(x_train,y_train)

In [90]:
y_pred=r.predict(x_test)
y_pred

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0])

In [91]:
accuracy_score(y_test,y_pred)

0.64