In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Labelled reviews (the `sentiment` column is the target used further down).
df_train = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    sep='\t',
)
# Reviews that are scored for the submission.
df_test = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip',
    sep='\t',
)
# Extra review text that is pooled with df_train for the embedding training.
# NOTE(review): this reads the same file as df_test; if a separate corpus of
# additional training text was intended, confirm the path.
add_train = pd.read_csv(
    '/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip',
    sep='\t',
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df_train.info()
print(df_train.shape)
print(df_train)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df_test.info()
print(df_test.shape)
print(df_test)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
add_train.info()
print(add_train.shape)
print(add_train)

In [None]:
# Pool the review text of both frames into one Series with a fresh 0..n-1 index.
all_train = pd.concat([df_train['review'], add_train['review']], axis=0, ignore_index=True)
print(all_train.isnull().sum())
print(all_train.shape)
# Spot-check a handful of raw reviews, each one preceded by blank lines.
for sample_idx in (49000, 1000, 6679, 18000):
    print('\n')
    print(all_train[sample_idx])

**Pre-process and train a Word2Vec embedding layer**

In [None]:
#cleaning data
from bs4 import BeautifulSoup
import re
def clean_up(review):
    """Normalise one raw review into lower-case, space-separated words.

    Steps: strip HTML markup, replace every run of characters other than
    ASCII letters and apostrophes with a single space, then lower-case.

    Args:
        review: Raw review text, possibly containing HTML tags.

    Returns:
        The cleaned text as a ``str``.
    """
    # Join text nodes with a space: with the default empty separator, markup
    # such as "great<br />movie" collapses into the single token "greatmovie".
    text = BeautifulSoup(review, 'html.parser').get_text(separator=' ')

    # Anything that is not a letter or an apostrophe (punctuation, digits,
    # whitespace runs, including the separator added above) becomes one space.
    words_only = re.sub(r'[^A-Za-z\']+', ' ', text)

    return words_only.lower()

In [None]:
# Clean every pooled review; the name is rebound, so the raw text is no longer
# reachable through `all_train` after this cell.
all_train = all_train.apply(clean_up)

In [None]:
# Show two cleaned reviews with blank lines between them (one print call; the
# newline separator reproduces the spacing of three consecutive prints).
print(all_train[18000], '\n', all_train[6679], sep='\n')

In [None]:
from gensim.models import Phrases
# Split every cleaned review on whitespace, then fit a Phrases model on the
# resulting token lists.
sent = list(map(str.split, all_train))
bigram = Phrases(sent)

In [None]:
# Sanity check: run one hand-written, whitespace-split sentence through the
# fitted Phrases model and print what it returns; phrases should be detected.
print(bigram['the main character is facing a drug addiction'.split()])

In [None]:
from gensim.models import Word2Vec
# Width of the word vectors; the embedding matrix built later must use the same value.
embedding_dim = 256
# Train on the phrase-transformed token lists, keeping every token (min_count=1).
w2v_model = Word2Vec(
    sentences=bigram[sent],
    vector_size=embedding_dim,
    window=2,
    min_count=1,
)

In [None]:
# w2v_model.save('word2v_model')

In [None]:
# Qualitative probes of the trained vectors: an analogy query, an odd-one-out
# query, a pairwise similarity and two nearest-neighbour lookups.
word_vectors = w2v_model.wv
print(word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
print(word_vectors.doesnt_match('breakfast cereal dinner lunch'.split()))
print(word_vectors.similarity('woman', 'man'))
for probe in ('galaxy', 'action'):
    print(word_vectors.most_similar(probe))

In [None]:
# import gensim
# w2v_model = gensim.models.Word2Vec.load('../input/word2v-model/word2v_model')

In [None]:
# print(w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# print(w2v_model.wv.doesnt_match('breakfast cereal dinner lunch'.split()))
# print(w2v_model.wv.similarity('woman','man'))
# print(w2v_model.wv.most_similar('galaxy'))
# print(w2v_model.wv.most_similar('action'))

In [None]:
# Number of entries in the trained model's vocabulary list.
len(w2v_model.wv.index_to_key)

In [None]:
# Display the learned vector of one example word.
w2v_model.wv['lizard']

**Tokenization of training set, train test split, building embedding matrix**

In [None]:
# Display the labelled training frame.
df_train

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
ax = sns.countplot(x='sentiment', data=df_train, palette='Set3')
# Label each bar with its own height (which is the count it represents).
# Reading the count from the bar avoids recomputing value_counts() per bar and
# does not depend on the bar order matching the label values.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 100, int(height), ha='center')

The provided training data is perfectly balanced between positive and negative reviews.

In [None]:
# Clean the labelled reviews with the same clean_up() used for the embedding
# corpus. The `review` column is overwritten in place, so the raw text is gone
# from df_train after this cell.
df_train['review'] = df_train['review'].apply(clean_up)

In [None]:
# Pull features (cleaned review text) and labels out as plain arrays.
x, y = df_train['review'].values, df_train['sentiment'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled reviews for validation; the fixed seed keeps the
# split the same across runs.
x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D, Embedding, LSTM
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training split only, then map both splits to
# integer sequences with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
seq_train, seq_val = (tokenizer.texts_to_sequences(texts) for texts in (x_train, x_val))

In [None]:
# word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index
# Vocabulary size; the embedding matrix built later allocates V+1 rows.
V = len(word2idx)
print('Found %s unique tokens' % V)

In [None]:
import statistics
# Count whitespace-separated words per review. Each element of x_train is the
# review string itself, so len() on it would count characters, not words, and
# the histogram would be in the wrong unit for choosing a token-level maxlen.
words_per_review = [len(review.split()) for review in x_train]
plt.figure(figsize=(15,8))
ax = sns.histplot(words_per_review)
# Mark the median length as a dashed reference line.
ax.axvline(x=statistics.median(words_per_review), linestyle='--', c='orange', label='median')
_ = ax.legend()

In [None]:
# Candidate sequence length, and the share of training reviews strictly shorter than it.
# NOTE(review): maxlen is also the pad_sequences length for token sequences
# elsewhere in this notebook, so words_per_review must be measured in tokens
# for this share to be meaningful - confirm the units match.
maxlen = 4000
tot_reviews = len(x_train)
reviewbelow = sum(1 for length in words_per_review if length < maxlen)
reviewbelow / tot_reviews

In [None]:
# Bring both splits to a common length of maxlen; the names are rebound to the
# padded arrays.
seq_train, seq_val = (pad_sequences(seqs, maxlen=maxlen) for seqs in (seq_train, seq_val))

In [None]:
# Length of the first padded training sequence (expected to equal maxlen).
len(seq_train[0])

In [None]:
# Length of the first padded validation sequence (expected to equal maxlen).
len(seq_val[0])

In [None]:
# First ten (word, index) pairs of the tokenizer vocabulary.
list(word2idx.items())[:10]

In [None]:
# Display the Word2Vec vector of one example word.
w2v_model.wv['backdrop']

In [None]:
# Take the width from the trained model itself so the matrix can never
# disagree with the vectors copied into it (256 for the model trained here).
embedding_dim = w2v_model.wv.vector_size
hits = 0
misses = 0

# Row i holds the Word2Vec vector of the word whose tokenizer index is i.
# Rows that are never filled stay all-zero: words missing from the Word2Vec
# vocabulary, and row 0, which no entry of word2idx maps to (presumably the
# padding index - confirm against pad_sequences' pad value).
embedding_matrix = np.zeros((V+1, embedding_dim))
known_words = w2v_model.wv.key_to_index
for word, i in word2idx.items():
    # Explicit membership test instead of a bare `except:`, so that only a
    # genuinely unknown word counts as a miss and real errors still surface.
    if word in known_words:
        embedding_matrix[i] = w2v_model.wv[word]
        hits += 1
    else:
        misses += 1

print('Converted %d words (%d misses)' % (hits, misses))
    

In [None]:
# Display the embedding width used for the matrix.
embedding_dim

**Load the trained Word2Vec embedding matrix into an Embedding layer for Keras**

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from the prepared matrix and frozen
# (trainable=False), so training does not update the pre-trained vectors.
embedding_layer = Embedding(
    input_dim=V + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

In [None]:
#setting callbacks/regularization
from tensorflow.keras import callbacks
# Early stopping on validation loss (patience 5), restoring the best weights.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-7, patience=5, restore_best_weights=True)

# Multiply the learning rate by 0.2 when validation loss plateaus (patience 2).
plateau = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=2, min_delta=1e-7, cooldown=0, verbose=1)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout

# Number of LSTM units.
M = 128
# Frozen embeddings -> LSTM emitting one output per timestep -> max over time
# -> single sigmoid unit for the binary sentiment label.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=M, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dense(1, activation='sigmoid'))
# `metrics` is passed as a list: that is the documented form, and a bare
# string is rejected by newer Keras releases.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for at most 20 epochs; the callbacks may stop earlier or lower the
# learning rate based on the validation split. `r` keeps the training history.
r = model.fit(
    seq_train,
    y_train,
    epochs=20,
    validation_data=(seq_val, y_val),
    callbacks=[early_stop, plateau],
)

In [None]:
# Training vs. validation loss per epoch; each curve is labelled with its history key.
for history_key in ('loss', 'val_loss'):
    plt.plot(r.history[history_key], label=history_key)
_ = plt.legend()

In [None]:
# Training vs. validation accuracy per epoch, with short legend labels.
for history_key, curve_label in (('accuracy', 'acc'), ('val_accuracy', 'val_acc')):
    plt.plot(r.history[history_key], label=curve_label)
_=plt.legend()

In [None]:
# model.save('movie_review.h5')
# model = tf.keras.models.load_model('../input/load-review/movie_review.h5')
# model.summary()

In [None]:
# Run the test reviews through the same pipeline as the training text:
# clean -> index with the already-fitted tokenizer -> pad to maxlen.
cleaned_test_reviews = df_test['review'].apply(clean_up)
df_test['review'] = cleaned_test_reviews
x_pred = df_test['review'].values
seq_xpred = tokenizer.texts_to_sequences(x_pred)
pad_xpred = pad_sequences(seq_xpred, maxlen=maxlen)

In [None]:
# Score the padded test reviews with the trained model.
y_pred = model.predict(pad_xpred)

In [None]:
# Round the model outputs to the nearest integer to get hard labels.
# The name is rebound, so the raw scores are no longer available afterwards.
y_pred = np.round(y_pred)

In [None]:
# Print one randomly chosen test review next to its predicted label.
n_predictions = len(y_pred)
i = np.random.choice(range(n_predictions))
print('%s vs [%s]' % (y_pred[i], x_pred[i]))

In [None]:
# Print another randomly chosen test review next to its predicted label.
n_predictions = len(y_pred)
i = np.random.choice(range(n_predictions))
print('%s vs [%s]' % (y_pred[i], x_pred[i]))

In [None]:
# Hand-copied IMDB review used as a manual sanity check of the model.
# The dict key (1) is presumably the reviewer's star rating - confirm.
d = {1:'''I don't know that I've ever seen a movie more oppressively depressing and embarrassingly overwrought than "Detachment". And I've seen myself a few of 'em.

A stunning collection of acclaimed acting talent including Adrien Brody, Marcia Gay Harden, James Caan, Blythe Danner and William Peterson are uniformly wasted in this miserable mess. And that's damn hard to accomplish. It's as if the whole lot of 'em were somehow convinced to participate by Director Tony Kaye with a patronizing pitch along the lines of, "Hey. Let's all make an overly dramatic statement movie that sledgehammers the audience repeatedly over the skull about how the American public education system is egregiously failing our kids. And then, for good measure, we'll toss in some shots at the rotten state of Long Term Elderly Care Facilities while we're at it. Come on. It'll be awesome."

The resultant refuse is about as "detached" from "awesome" as one can possibly conjure.

There is certainly a high-caliber film to be made that draws desperately needed attention to the authentic and alarming issues raised in "Detachment". And despite apparent earnest efforts, this dispiriting debacle leaves us still wanting.'''}

# Same clean -> tokenise -> pad pipeline as for the test set. The padded array
# gets its own name: assigning it to `pad_xpred` would overwrite the padded
# test reviews, so re-running the test-set prediction cell afterwards would
# silently score only this one sample.
pred = np.array(pd.Series(data=d).apply(clean_up))
seq_pred = tokenizer.texts_to_sequences(pred)
pad_sample = pad_sequences(seq_pred, maxlen=maxlen)
model.predict(pad_sample)

In [None]:
# Reference submission file, shown to check the expected layout.
# Read via the absolute input path, like every other file in this notebook, so
# the cell does not depend on the current working directory.
sample = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv')
sample

In [None]:
# Fresh copy of the test file; submit() adds the predictions to this frame and
# writes its `id` and `sentiment` columns out.
submission = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip', sep='\t')
submission

In [None]:
def submit(y_pred, path='submission.csv'):
    """Write predictions to a submission CSV with the columns id, sentiment.

    Side effect: adds (or overwrites) the ``sentiment`` column of the
    module-level ``submission`` frame.

    Args:
        y_pred: One prediction per row of ``submission``. Any array-like whose
            flattened length equals ``len(submission)``, e.g. the (n, 1)
            array returned by ``model.predict``.
        path: Destination CSV file; defaults to 'submission.csv'.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``submission``.
    """
    # Flatten so a column-shaped (n, 1) model output is stored as a plain 1-D column.
    sentiment = np.asarray(y_pred).ravel()
    if len(sentiment) != len(submission):
        raise ValueError(
            'Expected %d predictions, got %d' % (len(submission), len(sentiment))
        )
    submission['sentiment'] = sentiment
    submission.to_csv(path, index=False, columns=['id','sentiment'])

# Write the rounded test-set predictions to submission.csv.
submit(y_pred)