In [27]:
import pandas as pd
import numpy as np

### 1. Loading data

In [28]:
# Raw string keeps every backslash literal. The original mixed "\\" with a bare
# "\O", which is an invalid escape sequence (it warns on current Python versions);
# the resulting path value is unchanged.
path = r"C:\Users\shubh\Desktop\Important_IPYNB_files\NLP\Other_files\IMDB_Dataset.csv"
# Read only the first 10,000 rows instead of loading the whole CSV and slicing it;
# this also yields an independent frame rather than a slice of a larger one.
df = pd.read_csv(path, nrows=10000)

### 2. Basic analysis

In [29]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [30]:
# Checking class balance
df['sentiment'].value_counts()

positive    5028
negative    4972
Name: sentiment, dtype: int64

In [31]:
# NULL values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [32]:
# Duplicate values
df.duplicated().sum()

17

In [33]:
# Reassign instead of mutating in place, so the de-duplicated frame is a new
# object and is never a modified slice of the originally loaded data.
df = df.drop_duplicates()

In [34]:
df.duplicated().sum()

0

### 3. Basic preprocessing

In [35]:
# Remove HTML tags
import re

def remove_tags(raw_text):
    """Return ``raw_text`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each match runs from a ``<`` to the nearest
    following ``>`` on the same line and is replaced with the empty string.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_text)

# Strip the HTML tags from every review, element by element
df['review'] = df['review'].map(remove_tags)

In [36]:
# Lower casing (vectorised string accessor instead of a per-row Python lambda)
df['review'] = df['review'].str.lower()

In [37]:
# Remove stopwords
from nltk.corpus import stopwords

sw_list = stopwords.words('english')
# Testing membership against a list scans it linearly for every token; a set
# gives constant-time lookups and filters out exactly the same words.
sw_set = set(sw_list)

# Drop stop words from each review and re-join the remaining tokens with spaces
df['review'] = df['review'].apply(
    lambda text: " ".join(token for token in text.split() if token not in sw_set)
)

In [38]:
df['review'][1]

'wonderful little production. filming technique unassuming- old-time-bbc fashion gives comforting, sometimes discomforting, sense realism entire piece. actors extremely well chosen- michael sheen "has got polari" voices pat too! truly see seamless editing guided references williams\' diary entries, well worth watching terrificly written performed piece. masterful production one great master\'s comedy life. realism really comes home little things: fantasy guard which, rather use traditional \'dream\' techniques remains solid disappears. plays knowledge senses, particularly scenes concerning orton halliwell sets (particularly flat halliwell\'s murals decorating every surface) terribly well done.'

In [39]:
# Separate the class variable y from the input data X
# (X is kept as a one-column DataFrame holding the first column)
y = df['sentiment']
X = df.iloc[:, :1]

In [40]:
# Converting y(class variable) into 0 & 1
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels, then replace y with its encoded form
encoder = LabelEncoder()
y = encoder.fit(y).transform(y)

In [41]:
y

array([1, 1, 1, ..., 0, 0, 1])

### 4. Train-Test split

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

In [43]:
# Report the shape of each split
for label, part in (('Train data:', X_train), ('Test data:', X_test)):
    print(label, part.shape)

Train data: (7986, 1)
Test data: (1997, 1)


### 5. Text Vectorization (Feature Engineering)

##### 5.1. BOW & Naive Bayes

In [44]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary on the training reviews only, then reuse it for the test set
cv = CountVectorizer()
train_counts = cv.fit_transform(X_train['review'])
test_counts = cv.transform(X_test['review'])

# Convert the count matrices to dense arrays for the models fitted below
X_train_bow = train_counts.toarray()
X_test_bow = test_counts.toarray()

print('Train data:', X_train_bow.shape)
print('Test data:', X_test_bow.shape)

Train data: (7986, 48282)
Test data: (1997, 48282)


In [46]:
# Applying Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix

# Fit Gaussian Naive Bayes on the BoW training matrix and predict the test set
gnb = GaussianNB().fit(X_train_bow, y_train)
y_pred = gnb.predict(X_test_bow)

# Accuracy is printed; the confusion matrix is the cell's displayed value
print(accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.6324486730095142


array([[717, 235],
       [499, 546]], dtype=int64)

##### 5.2. BOW & Random Forest

In [47]:
# Applying Random Forest
from sklearn.ensemble import RandomForestClassifier

# Random forest on the full BoW matrix. A fixed random_state makes the forest's
# internal randomness repeatable, so the printed accuracy can be reproduced.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
print(accuracy_score(y_test, y_pred))

0.8507761642463696


In [48]:
# BOW: keep only the 3000 most frequent features (words in our case)
cv = CountVectorizer(max_features=3000)

# Vocabulary is learned from the training reviews and reused for the test set
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed random_state so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8372558838257386

##### 5.3. N-grams & Random Forest

In [49]:
# Unigrams and bigrams, capped at the 5000 most frequent features
cv = CountVectorizer(ngram_range=(1, 2), max_features=5000)

# Vocabulary is learned from the training reviews and reused for the test set
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed random_state so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8462694041061593

##### 5.4. TF-IDF & Random Forest

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Keep both matrices sparse. The original densified only the training matrix,
# leaving train and test in different formats. RandomForestClassifier accepts
# sparse input, and skipping .toarray() avoids materialising a dense
# (documents x vocabulary) array.
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

# Fixed random_state so the reported accuracy is reproducible between runs
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)

0.8437656484727091

##### 5.5. Custom W2V (on our own data)

In [58]:
# Raw string keeps every backslash literal. The original mixed "\\" with a bare
# "\O", which is an invalid escape sequence (it warns on current Python versions);
# the resulting path value is unchanged.
path = r"C:\Users\shubh\Desktop\Important_IPYNB_files\NLP\Other_files\IMDB_Dataset.csv"
# Reload the complete dataset (no row limit here)
df = pd.read_csv(path)

In [60]:
# Drop duplicates (reassigned rather than mutated in place)
df = df.drop_duplicates()

In [62]:
# Remove HTML tags
import re

def remove_tags(raw_text):
    """Strip ``<...>`` spans from ``raw_text`` and return the remaining text.

    Matching is non-greedy: each match ends at the first ``>`` that follows
    the opening ``<`` on the same line.
    """
    return re.sub('<.*?>', '', raw_text)

# Strip the HTML tags from every review, element by element
df['review'] = df['review'].map(remove_tags)

In [63]:
# Lower casing (vectorised string accessor instead of a per-row Python lambda)
df['review'] = df['review'].str.lower()

In [64]:
# Remove stopwords
from nltk.corpus import stopwords

sw_list = stopwords.words('english')
# Testing membership against a list scans it linearly for every token; a set
# gives constant-time lookups and filters out exactly the same words.
sw_set = set(sw_list)

# Drop stop words from each review and re-join the remaining tokens with spaces
df['review'] = df['review'].apply(
    lambda text: " ".join(token for token in text.split() if token not in sw_set)
)

In [65]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Sentence tokenization: split each review into sentences, then turn every
# sentence into a list of tokens with simple_preprocess
story = [
    simple_preprocess(sent)
    for doc in df['review']
    for sent in sent_tokenize(doc)
]

# Initializing W2V model
model = gensim.models.Word2Vec(
    window=10,     # context window: up to 10 words on either side of the target word
    min_count=2,   # ignore words whose total frequency in the corpus is below 2
    workers=4      # number of worker threads used for training
)

# building vocabulary from the list 'story'
model.build_vocab(story)

# Training w2v on our own data
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(29540314, 30891345)

In [70]:
# Length of Vocabulary
print('Length of Vocabulary:', len(model.wv.index_to_key))

Length of Vocabulary: 61843


In [71]:
type(model.wv.index_to_key)

list

##### 5.6. Average W2V(on our own data)

In [75]:
# Converting entire document(review) into a numerical vector
def document_vector(doc, vectors=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Parameters
    ----------
    doc : str
        Document text; it is tokenised with ``str.split()``.
    vectors : optional
        Object exposing ``key_to_index``, ``vector_size`` and indexing by a
        list of words (e.g. gensim ``KeyedVectors``). Defaults to ``model.wv``
        of the notebook-level Word2Vec ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the selected word vectors, or an all-zero vector
        of length ``vectors.vector_size`` when no token is in the vocabulary.
    """
    if vectors is None:
        vectors = model.wv
    # key_to_index is a dict, so each membership test is a hash lookup; the
    # original scanned the index_to_key list once per token.
    vocab = vectors.key_to_index
    known = [word for word in doc.split() if word in vocab]
    if not known:
        # No in-vocabulary tokens: return a neutral zero vector instead of
        # averaging an empty selection.
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.mean(vectors[known], axis=0)

# This is the first review
document_vector(df['review'].values[0])

array([ 0.10501339,  0.30747497,  0.17245911, -0.13408558,  0.25844842,
       -0.02025337, -0.20083608,  0.61111915,  0.40674952, -0.15839246,
        0.1229958 , -0.3683737 ,  0.44359496,  0.06434723, -0.07597051,
        0.0394401 , -0.15392599, -0.0322172 , -0.10481744, -0.16380973,
       -0.24960642,  0.2344628 , -0.01237789,  0.12628049, -0.03454323,
       -0.0473893 ,  0.26033342,  0.12324991, -0.37490442, -0.18179458,
        0.3657113 ,  0.05142675,  0.19240575, -0.3283031 , -0.3687984 ,
        0.23345576, -0.10905088, -0.11924558,  0.07118194, -0.5039484 ,
       -0.07678928, -0.26330823, -0.2491042 , -0.32948092,  0.4081078 ,
       -0.11376476, -0.26003194, -0.05677945,  0.23286341,  0.15037318,
        0.29430714, -0.04465853, -0.21874185, -0.47279727, -0.14038189,
        0.16437168,  0.1698023 , -0.178364  , -0.11313047, -0.07902962,
        0.24557208,  0.07740561, -0.2806842 ,  0.09889531, -0.03227866,
       -0.02838442, -0.41858244, -0.33597028, -0.21707219, -0.27

In [77]:
# Performing the above transformation for entire corpus

# Keep the first 10,000 reviews
df = df[:10000]
from tqdm import tqdm

# One averaged vector per review, collected in a list (tqdm shows progress)
X = [document_vector(doc) for doc in tqdm(df['review'].values)]

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [19:27<00:00,  8.57it/s]


In [98]:
print(X)
print(type(X))
print(len(X))

[[ 0.10501339  0.30747497  0.17245911 ... -0.08127119  0.06999487
   0.2105496 ]
 [ 0.19889541  0.15217632  0.05717173 ...  0.3055233  -0.1265673
   0.13415943]
 [ 0.24245839  0.18394384 -0.1782204  ... -0.29714575  0.05394321
   0.09020282]
 ...
 [ 0.536764    0.17784327  0.32414502 ...  0.07521009  0.16796497
   0.20907971]
 [-0.07166822  0.42849728  0.00441418 ...  0.04540765  0.09674265
  -0.25315028]
 [ 0.17054656  0.24721685  0.18980955 ...  0.00898315 -0.02774275
  -0.12952904]]
<class 'numpy.ndarray'>
10000


In [82]:
# Converting X to a numpy array
X = np.array(X)

In [83]:
X.shape

(10000, 100)

### 6. Modelling

##### 6.1. Average W2V & Random Forest

In [85]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [88]:
# Converting target variable(sentiment) into binary
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Fit on the sentiment column and encode it; y is displayed as the cell's value
y = encoder.fit(df['sentiment']).transform(df['sentiment'])

y

array([1, 1, 1, ..., 0, 1, 0])

In [89]:
# Train-Test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

In [90]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest on the averaged Word2Vec features. A fixed random_state makes
# the forest's internal randomness repeatable, so the accuracy can be reproduced.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.829

##### 6.2. W2V(Google) & Random Forest

In [113]:
import gensim
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load pre-trained vectors stored in binary word2vec format from the local
# archive; the cells below report a 3,000,000-word vocabulary and 300-d vectors.
# NOTE: this rebinds `model`, which previously held the Word2Vec model trained above.
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [126]:
# Length of Vocabulary
print('Length of Vocabulary:', len(model.index_to_key))

Length of Vocabulary: 3000000


In [118]:
# Converting entire document(review) into a numerical vector
def document_vector(doc, vectors=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Parameters
    ----------
    doc : str
        Document text; it is tokenised with ``str.split()``.
    vectors : optional
        Object exposing ``key_to_index``, ``vector_size`` and indexing by a
        list of words (e.g. gensim ``KeyedVectors``). Defaults to the
        notebook-level ``model`` holding the loaded pre-trained vectors.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the selected word vectors, or an all-zero vector
        of length ``vectors.vector_size`` when no token is in the vocabulary.
    """
    if vectors is None:
        vectors = model
    # key_to_index is a dict, so each membership test is a hash lookup; the
    # original scanned the 3,000,000-entry index_to_key list once per token.
    vocab = vectors.key_to_index
    known = [word for word in doc.split() if word in vocab]
    if not known:
        # No in-vocabulary tokens: return a neutral zero vector instead of
        # averaging an empty selection.
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.mean(vectors[known], axis=0)

# This is the first review
document_vector(df['review'].values[0])

array([ 3.56353521e-02,  5.20200655e-02,  3.68175954e-02,  8.18563327e-02,
       -5.47294319e-02,  2.73479684e-03,  5.26788309e-02, -7.66736791e-02,
        7.87198469e-02,  9.91729796e-02,  6.10370189e-03, -1.19252957e-01,
       -2.28140596e-03,  3.49528007e-02, -1.02423519e-01,  7.97183663e-02,
        3.08098514e-02,  1.13643803e-01, -1.04242647e-02, -7.79134855e-02,
        2.03028740e-03,  3.89883444e-02,  3.08787189e-02,  1.45070143e-02,
        3.85482907e-02, -6.07983358e-02, -5.43122329e-02,  6.53744861e-02,
        3.75130512e-02, -1.50223710e-02, -3.11216656e-02, -6.78879675e-03,
       -3.59876677e-02,  2.15049684e-02, -1.38433194e-02, -6.58503594e-03,
        4.87125069e-02, -2.29985919e-02,  4.27762093e-03,  6.26247972e-02,
        6.69497326e-02, -6.00897335e-02,  9.51667130e-02, -6.85305707e-03,
       -4.09635231e-02, -2.52993200e-02, -5.15104011e-02,  2.57383520e-03,
        2.16587959e-03,  9.52371745e-04, -5.67417294e-02,  2.17550006e-02,
       -3.27009498e-03, -

In [119]:
# Performing the above transformation for entire corpus

# Keep the first 10,000 reviews
df = df[:10000]
from tqdm import tqdm

# One averaged vector per review, collected in a list (tqdm shows progress)
X = [document_vector(doc) for doc in tqdm(df['review'].values)]

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [3:32:04<00:00,  1.27s/it]


In [120]:
# Converting X to a numpy array
X = np.array(X)

In [121]:
X.shape

(10000, 300)

In [122]:
X

array([[ 0.03563535,  0.05202007,  0.0368176 , ..., -0.07887516,
         0.03912447, -0.01251649],
       [ 0.0802632 ,  0.04752473, -0.02344439, ..., -0.06213256,
         0.04468057,  0.0056236 ],
       [ 0.04868765,  0.07469928,  0.01612127, ..., -0.07238479,
         0.03263976, -0.0044024 ],
       ...,
       [ 0.08231123,  0.0142905 ,  0.01182684, ..., -0.04006061,
         0.02626477,  0.01509716],
       [ 0.05773104,  0.06734917, -0.00407253, ..., -0.04985777,
         0.03990832,  0.02976334],
       [ 0.06943762,  0.02008892,  0.02216622, ..., -0.06904491,
         0.03532594,  0.02165769]], dtype=float32)

In [None]:
y

In [123]:
# Train-Test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

In [124]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest on the averaged pre-trained vectors. A fixed random_state makes
# the forest's internal randomness repeatable, so the accuracy can be reproduced.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8095