In [1]:
import sys, os

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path + "\src")

In [2]:
from preprocess import normalize_text
import pandas as pd

imdb_data=pd.read_csv('../data/normalized IMDB dataset.csv')
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
#split the dataset  
#train dataset
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]
#test dataset
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


In [9]:
#import nltk
# nltk.download(stopwords)
#imdb_data.apply(normalize_text)
# Result already saved in "normalized IMDB dataset.csv"

In [11]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_test_reviews=imdb_data.review[40000:]

## Different Word Representations
- Bag of Words
- TF-IDF
- Word2Vec - CBOW
- Word2Vec - Skipgram
- Glove
- FastText

In [14]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]
print(train_sentiments)
print(test_sentiments)

(50000, 1)
[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 6209089)
BOW_cv_test: (10000, 6209089)


In [13]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6209089)
Tfidf_test: (10000, 6209089)


### Word2Vec Details

There are differet methods to get the sentence vectors :

**Doc2Vec** : you can train your dataset using Doc2Vec and then use the sentence vectors.
**Average of Word2Vec vectors** : You can just take the average of all the word vectors in a sentence. This average vector will represent your sentence vector.
**Average of Word2Vec vectors with TF-IDF** : this is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.

(reference: [link](https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence))


Here, I used Doc2Vec approach using Gensim.
In Gensim, there are two implementations for Doc2Vec:

- Paragraph Vector - Distributed Memory (PV-DM) - corresponds to CBOW

- Paragraph Vector - Distributed Bag of Words (PV-DBOW) - corresponds to SkipGram

In [58]:
import multiprocessing
import gensim
from gensim.models import doc2vec

def generate_corpus(df, tokens_only = False):
    for index, row in df.iteritems():
        # print(row)
        tokens = gensim.utils.simple_preprocess(row)
        if tokens_only:
            yield tokens
        else:
            yield doc2vec.TaggedDocument(tokens, [index])
            
train_corpus = list(generate_corpus(norm_train_reviews))
test_corpus = list(generate_corpus(norm_test_reviews, tokens_only = True))


In [59]:
# dm = 0: PV-DM
# dm = 1: PV-DBOW
d2v_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=20, dm=1)

In [60]:
d2v_model.build_vocab(train_corpus)

In [61]:
print(f"Word 'good' appeared {model.wv.get_vecattr('good', 'count')} times in the training corpus.")

Word 'good' appeared 23340 times in the training corpus.


In [62]:
d2v_model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [63]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])

print(inferred_vector.shape)
print(inferred_vector)

(50,)
[ 0.3115324   0.19853513  0.75726795  0.16892147 -0.38990209  0.11132143
 -0.38945508  0.13970904 -0.08977677 -0.17892691  0.2863491  -0.47374254
  0.26724955  0.5288117   0.3347833  -0.205227    1.2687936   0.9980701
 -0.6963328  -0.03720236  0.32408106 -0.13349555  0.16489911 -0.2045367
  0.96313477  0.5707606  -0.96842825 -0.44279906  0.09607672 -1.1550183
 -0.49319273 -0.509282    0.50498056  0.07007471  0.4714886   0.19841962
  0.538801   -0.09300383 -0.45173794  0.04274365  1.4255229  -0.49658895
 -0.56747997  0.0910162   0.35376912 -0.19396082 -0.50346625  0.26528752
  0.44515797  1.1060053 ]


### GloVe Details


## Models
- Logistic Regression
- Support Vector Machine
- Multinomial Naive Bayes
- RNN based structure
- BERT