In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import gensim.downloader as api

In [2]:
reviews_df=pd.read_csv('/Users/kevinkomban/Documents/Databyte Inductions/IMDB Dataset.csv/IMDB Dataset.csv')
reviews_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def preprocessed_text(text):
    text=text.lower()
    text_tokenized=word_tokenize(text)
    tokens=[word for word in text_tokenized if word.isalnum()]
    stop_words=set(stopwords.words('english'))
    tokens=[word for word in tokens if word not in stop_words]
    postag=nltk.pos_tag(tokens)
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[]
    for i in range(len(tokens)):
        if postag[i][1][0]=='J':
            POS='a'
        elif postag[i][1][0]=='R':
            POS='r'
        elif postag[i][1][0]=='V':
            POS='v'
        else:
            POS='n'
        lemmatized_tokens.append(lemmatizer.lemmatize(tokens[i],pos=POS))
    processed_text=' '.join(lemmatized_tokens)
    return processed_text

reviews_df['Processed Text']=reviews_df.review.apply(preprocessed_text)
reviews_df

Unnamed: 0,review,sentiment,Processed Text
0,One of the other reviewers has mentioned that ...,positive,one reviewer mention watch 1 oz episode hook r...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br film techniq...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,think movie right good job creative original f...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad act idiotic direct a...
49997,I am a Catholic taught in parochial elementary...,negative,catholic teach parochial elementary school nun...
49998,I'm going to have to disagree with the previou...,negative,go disagree previous comment side maltin one s...


In [4]:
model=gensim.models.Word2Vec(window=10, min_count=2, workers=6)

In [6]:
vocab=reviews_df['Processed Text'].apply(word_tokenize)
model.build_vocab(vocab, progress_per=1000)

In [7]:
model.epochs

5

In [8]:
model.train(vocab, total_examples=model.corpus_count, epochs=model.epochs)

(26745061, 29588505)

In [9]:
model.wv.most_similar("good")

[('decent', 0.7438089847564697),
 ('great', 0.6393566727638245),
 ('okay', 0.627350389957428),
 ('alright', 0.6259850859642029),
 ('nice', 0.6176878809928894),
 ('well', 0.6053310036659241),
 ('bad', 0.598610520362854),
 ('liked', 0.5972123146057129),
 ('ok', 0.5852205753326416),
 ('cool', 0.5850392580032349)]

In [10]:
def avg_vector(tokens,model):
    vectors=[model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
reviews_df['Vector']=vocab.apply(lambda x: avg_vector(x, model))

In [11]:
def sentiment_encoding(sentiment):
    if sentiment=='positive':
        return 1
    elif sentiment=='negative':
        return 0

reviews_df.sentiment=reviews_df.sentiment.apply(sentiment_encoding)
reviews_df

Unnamed: 0,review,sentiment,Processed Text,Vector
0,One of the other reviewers has mentioned that ...,1,one reviewer mention watch 1 oz episode hook r...,"[0.13700365, 0.513862, 0.020551864, 0.14302915..."
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br film techniq...,"[-0.18619852, 0.387883, -0.2629296, -0.1021071..."
2,I thought this was a wonderful way to spend ti...,1,think wonderful way spend time hot summer week...,"[0.14523587, 0.4257014, 0.06414919, -0.0352010..."
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...,"[0.07970727, 0.35621127, 0.02770987, -0.012006..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...,"[-0.023460563, 0.21145345, 0.35270998, -0.3727..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,1,think movie right good job creative original f...,"[0.2740907, 0.32166064, -0.044608966, -0.02645..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad act idiotic direct a...,"[-0.2908178, 0.44513807, 0.07619937, -0.148340..."
49997,I am a Catholic taught in parochial elementary...,0,catholic teach parochial elementary school nun...,"[0.0150441965, 0.2752515, 0.18205117, 0.293922..."
49998,I'm going to have to disagree with the previou...,0,go disagree previous comment side maltin one s...,"[-0.24904491, 0.32029867, 0.10082516, 0.151004..."


In [13]:
X = np.stack(reviews_df.Vector.values)
Y=reviews_df.sentiment
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
model=LogisticRegression()
model.fit(X_train,Y_train)
Y_pred=model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_test,Y_pred)}')
print(f'F1 Score: {f1_score(Y_test,Y_pred)}')
print(f'confusion Matrix:\n{confusion_matrix(Y_test,Y_pred)}')

Accuracy: 0.8632
F1 Score: 0.8639076800636689
confusion Matrix:
[[4290  698]
 [ 670 4342]]
