In [1]:
import sys, os

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path + "\src")

In [2]:
from preprocess import normalize_text
import pandas as pd

imdb_data=pd.read_csv('../data/normalized IMDB dataset.csv')
imdb_data.head(10)

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,one review ha mention watch 1 oz episod youll ...,positive
1,1,wonder littl product film techniqu veri unassu...,positive
2,2,thought thi wa wonder way spend time hot summe...,positive
3,3,basic famili littl boy jake think zombi hi clo...,negative
4,4,petter mattei love time money visual stun film...,positive
5,5,probabl alltim favorit movi stori selfless sac...,positive
6,6,sure would like see resurrect date seahunt ser...,positive
7,7,thi show wa amaz fresh innov idea 70 first air...,negative
8,8,encourag posit comment thi film wa look forwar...,negative
9,9,like origin gut wrench laughter like thi movi ...,positive


In [3]:
#split the dataset  
#train dataset
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]
#test dataset
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


In [4]:
#import nltk
# nltk.download(stopwords)
#imdb_data.apply(normalize_text)
# Result already saved in "normalized IMDB dataset.csv"

In [5]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_test_reviews=imdb_data.review[40000:]

## Different Word Representations
- Bag of Words
- TF-IDF
- Word2Vec - CBOW
- Word2Vec - Skipgram
- Glove (TODO)
- FastText (TODO)

In [None]:
from sklearn.preprocessing import LabelBinarizer

#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]
print(train_sentiments)
print(test_sentiments)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 6209089)
BOW_cv_test: (10000, 6209089)


In [9]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6209089)
Tfidf_test: (10000, 6209089)


### Word2Vec Details

There are differet methods to get the sentence vectors :

**Doc2Vec** : you can train your dataset using Doc2Vec and then use the sentence vectors.

**Average of Word2Vec vectors** : You can just take the average of all the word vectors in a sentence. This average vector will represent your sentence vector.

**Average of Word2Vec vectors with TF-IDF** : this is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-
IDF scores. Just take the average and it will represent your sentence vector.

(reference: [link](https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence))


Here, I used Doc2Vec approach using Gensim.
In Gensim, there are two implementations for Doc2Vec:

- Paragraph Vector - Distributed Memory (PV-DM) - corresponds to CBOW

- Paragraph Vector - Distributed Bag of Words (PV-DBOW) - corresponds to SkipGram

Code adapted from [here](https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py).

In [38]:
import multiprocessing
import gensim
from gensim.models import doc2vec

def generate_corpus(df, tokens_only = False):
    for index, row in df.iteritems():
        # print(row)
        tokens = gensim.utils.simple_preprocess(row)
        if tokens_only:
            yield tokens
        else:
            yield doc2vec.TaggedDocument(words = tokens, tags = [index])
            
train_corpus = list(generate_corpus(norm_train_reviews))
test_corpus = list(generate_corpus(norm_test_reviews, tokens_only = True))


In [39]:
# dm = 0: PV-DM
# dm = 1: PV-DBOW
d2v_model_cbow = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=20, dm=1)
d2v_model_cbow.build_vocab(train_corpus)

print(f"Word 'good' appeared {d2v_model_cbow.wv.get_vecattr('good', 'count')} times in the training corpus.")

Word 'good' appeared 23340 times in the training corpus.


In [40]:
import time


start = time.time()

d2v_model_cbow.train(train_corpus, total_examples=d2v_model_cbow.corpus_count, epochs=d2v_model_cbow.epochs)

end = time.time()

print('time elapsed:', end - start)


time elapsed: 80.09964966773987


In [41]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = d2v_model_cbow.infer_vector(test_corpus[doc_id])

print(inferred_vector.shape)
print(inferred_vector)

(50,)
[-0.60858905  0.75907207  0.5305222  -0.6922508  -2.2326393  -0.18228646
  1.7934977   0.05564959 -0.42275926  1.1776123   0.33512872 -0.9082353
  1.9483105   1.0670248  -0.5156318  -0.15090711 -0.3368744  -1.1634853
 -0.9918529  -0.499774    0.9081558   0.5998203   0.48171657  0.5758911
  0.734256    2.1832852   0.7550474   0.48498774 -0.89031696 -0.01820174
  0.29942718  0.39104307 -0.06870332 -0.9074729   0.7520646  -0.86472875
  0.6876076   0.4599352   0.24738263 -0.75093395  1.03252    -1.1094545
 -0.9312001   0.8651036   0.50095767  0.8094793   0.12230206  0.7860469
  0.47571358  0.15042062]


In [48]:
import numpy as np

def word_vector(model, tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec = vec / count
    return vec

In [51]:
wordvec_arrays = np.zeros((len(train_corpus), 50))

for i in range(len(train_corpus)):
    wordvec_arrays[i,:] = word_vector(d2v_model_cbow, train_corpus[i].words, 50)

wordvec_df = pd.DataFrame(wordvec_arrays)
print(wordvec_df.shape)

(40000, 50)


### GloVe Details


## Models
- Logistic Regression
- Support Vector Machine
- Multinomial Naive Bayes
- RNN based structure
- BERT