In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df = temp_df.iloc[:10000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [8]:
import re
def remove_tags(raw_text):
  cleaned_text = re.sub(re.compile('<.*?>'),'', raw_text)
  return cleaned_text

In [7]:
df['review'] = df['review'].apply(remove_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


In [9]:
df['review'] = df['review'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: x.lower())


In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [12]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x: " ".join(x))


In [13]:
import gensim

In [14]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [16]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
story = []
for doc in df['review']:
  raw_sent = sent_tokenize(doc)
  for sent in raw_sent:
    story.append(simple_preprocess(sent))

In [18]:
model = gensim.models.Word2Vec(window=10, min_count=2)

In [19]:
model.build_vocab(story)

In [20]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(5875896, 6212140)

In [21]:
len(model.wv.index_to_key)   # total words in vocab

31845

In [25]:
def document_vector(doc):
  # remove out-of-vocabulary words
  doc = [word for word in doc.split() if word in model.wv.index_to_key]
  return np.mean(model.wv[doc], axis=0)

In [26]:
document_vector(df['review'].values[0])
# convert 1st review in vector of 100 dimensions

array([-0.2379893 ,  0.4338816 ,  0.22417445,  0.25619134, -0.08507776,
       -0.5815775 ,  0.1974153 ,  0.8946034 , -0.39490527, -0.22979541,
       -0.3063465 , -0.4612605 ,  0.10660863,  0.14616768,  0.18133056,
       -0.1362885 ,  0.07748238, -0.34641415, -0.03947238, -0.6500031 ,
        0.02220613,  0.2806205 ,  0.02538467, -0.34317842, -0.30119455,
        0.02498942, -0.28213495,  0.07517683, -0.34365812,  0.04027006,
        0.24571615,  0.00469132,  0.2776    , -0.20712236, -0.13530244,
        0.4915043 ,  0.20317292, -0.38500184, -0.1903562 , -0.7483354 ,
        0.13517699, -0.23213364,  0.01773529, -0.09527181,  0.4836387 ,
       -0.12152288, -0.23239633, -0.07098094,  0.05542238,  0.315101  ,
        0.05500396, -0.29372206, -0.45864797, -0.0871393 , -0.09745124,
        0.2086715 ,  0.17548922,  0.09905635, -0.31253245,  0.11516961,
        0.01348075,  0.13156597, -0.02084738, -0.02268986, -0.41403544,
        0.31254548, -0.02195752,  0.15637702, -0.32466078,  0.33

In [27]:
from tqdm import tqdm

In [28]:
X = []
for doc in tqdm(df['review'].values):
  X.append(document_vector(doc))
  # all reviews converting to vectors, calculating mean

100%|██████████| 9983/9983 [10:36<00:00, 15.69it/s]


In [29]:
X = np.array(X)
# convert X to numpy array

In [30]:
X.shape

(9983, 100)

In [31]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [32]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [35]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7686529794692039

To improve accuracy more:
* can use googles pre-trained model.
* here we did not use complete data, use of complete data can also increase accuracy.