In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Extract the zipped competition TSV files into the working directory.
import zipfile, os
input_dir = '/kaggle/input/word2vec-nlp-tutorial/'
work_dir = '/kaggle/working/'
zip_files = ['labeledTrainData.tsv.zip', 'testData.tsv.zip']
for zf in zip_files:
    # The context manager closes the archive handle as soon as extraction is done.
    with zipfile.ZipFile(os.path.join(input_dir, zf), 'r') as archive:
        # Extract into work_dir explicitly (not './'): the following cells read the
        # TSV files from work_dir, so this must not depend on the current directory.
        archive.extractall(work_dir)

# Show what is now available in the working directory.
os.listdir(work_dir)

In [None]:
# Load the labelled training reviews (tab-separated) and preview the first rows.
train_tsv_path = os.path.join(work_dir, 'labeledTrainData.tsv')
train_df = pd.read_csv(train_tsv_path, delimiter='\t')
train_df.head()

In [None]:
# Load the unlabelled test reviews (tab-separated) and preview the first rows.
test_tsv_path = os.path.join(work_dir, 'testData.tsv')
test_df = pd.read_csv(test_tsv_path, delimiter='\t')
test_df.head()

In [None]:
# Report (rows, columns) of both DataFrames.
print(f"Train dataset Shape: {train_df.shape}")
print(f"Test dataset Shape: {test_df.shape}")

In [None]:
# Summary of the training DataFrame (columns, non-null counts, dtypes)
train_df.info()

In [None]:
# Summary of the test DataFrame (columns, non-null counts, dtypes)
test_df.info()

In [None]:
# Number of training reviews per sentiment class (quick class-balance check).
sentiment_counts = train_df['sentiment'].value_counts()
print(sentiment_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# words distribution over sentences
def eda_words(sentences):
    """Plot a histogram of the number of words per sentence.

    Parameters
    ----------
    sentences : iterable of str
        The texts to measure; a pandas Series or a plain list both work.

    Returns
    -------
    None
        The histogram is drawn on a new matplotlib figure.
    """
    sentences_df = pd.DataFrame({'review': sentences})
    # split() without an argument splits on runs of whitespace, so repeated,
    # leading or trailing spaces are not counted as extra (empty) words.
    sentences_df['word_n'] = sentences_df['review'].apply(lambda x: len(x.split()))
    # One axes on a normally sized figure; the previous 50-inch wide figure with a
    # 1x2 grid left half of the canvas empty.
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.histplot(data=sentences_df['word_n'], color='blue', ax=ax)
    ax.set_title('Words Number Distribution')
    ax.set_xlabel('Words per sentence')
    ax.set_ylabel('Number of sentences')
# Word-count distribution of the raw (uncleaned) training reviews
eda_words(train_df['review'])

In [None]:
# Hold out the last 20% of the labelled rows for validation (80:20, original row order).
split_perc = 0.8
split_at = int(len(train_df) * split_perc)

# Texts
train_sentences = train_df['review'].iloc[:split_at]
validation_sentences = train_df['review'].iloc[split_at:]

# Matching labels
train_labels = train_df['sentiment'].iloc[:split_at]
validation_labels = train_df['sentiment'].iloc[split_at:]

# The test set has no labels, only the review texts.
test_sentences = test_df['review']


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
english_stopwords = stopwords.words("english")

# cleaning sentences
def clean_sentences(sentences):
    """Normalise raw review texts before tokenisation.

    Every sentence is lower-cased, stripped of HTML markup and URLs, reduced to
    ASCII letters and finally has the English stop words removed.

    Parameters
    ----------
    sentences : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order, with the remaining
        words separated by single spaces.
    """
    # A set gives O(1) membership tests instead of one string scan per stop word.
    stop_words = set(english_stopwords)
    out = []
    for sentence in sentences:
        # Lowering
        sentence = sentence.lower()
        # Removing html; the parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed.
        sentence = BeautifulSoup(sentence, "html.parser").get_text()
        # Removing Urls (raw string: no invalid escape sequences)
        sentence = re.sub(r"https?://[\w+./]+", " ", sentence)
        # Remove non-letters
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        # Removing stop words token by token: this also catches stop words at the
        # very start or end of the text and immediately repeated ones, which a
        # " word " substring replacement misses. Joining with a single space
        # collapses all leftover whitespace.
        words = [word for word in sentence.split() if word not in stop_words]
        out.append(" ".join(words))
    return out

# Apply the same cleaning to the three splits; the cleaned lists replace the raw texts.
train_sentences, validation_sentences, test_sentences = (
    clean_sentences(raw_split)
    for raw_split in (train_sentences, validation_sentences, test_sentences)
)

In [None]:
# Show the first cleaned training sentence as a sanity check
print(train_sentences[:1])

In [None]:
# Word-count distribution of the training sentences after cleaning
eda_words(train_sentences)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit an uncapped tokenizer on the training sentences only, to see how large the
# full vocabulary is; unknown words are represented by the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
print(f"TOTAL WORDS: {len(word_index)}")

In [None]:
# Plot word frequencies in descending order (x = rank, y = number of occurrences).
from collections import OrderedDict
words_count = tokenizer.word_counts
ranked_counts = sorted(words_count.items(), key=lambda item: item[1], reverse=True)
ordered_words_count = OrderedDict(ranked_counts)
frequencies = list(ordered_words_count.values())
plt.plot(range(len(frequencies)), frequencies)
#plt.axis([0,10000,0,2000])
plt.show()



In [None]:
# Vocabulary cap handed to the Tokenizer (num_words): words ranked beyond ~5000
# in the frequency plot have only few occurrences.
vocab_size = 5000
# Length every review is padded/truncated to (maxlen): only few sentences are
# longer than ~500 words according to the word-count histogram.
sequence_length = 500

In [None]:
# Re-create the tokenizer, now capped with num_words=vocab_size, and fit it again
# on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)
# NOTE(review): word_index presumably still lists every fitted word, not just the
# capped vocabulary - confirm against the Tokenizer documentation.
word_index = tokenizer.word_index

In [None]:
# Sequencing: turn every sentence of each split into a list of word ids.
train_sequences, validation_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_sentences, validation_sentences, test_sentences)
)

# Padding: bring every sequence to sequence_length, adding the padding at the end.
# NOTE(review): `truncating` is left at its default while padding is 'post' -
# confirm which end of over-long reviews is meant to be kept.
pad_options = dict(padding='post', maxlen=sequence_length)
train_padded = pad_sequences(train_sequences, **pad_options)
validation_padded = pad_sequences(validation_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)
print(sequence_length)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub 

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Wrap (padded sequences, labels) pairs as cached, prefetched TensorFlow Datasets.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_padded, train_labels.to_numpy()))
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
validation_dataset = (
    tf.data.Dataset
    .from_tensor_slices((validation_padded, validation_labels.to_numpy()))
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)


In [None]:
# Inverse lookup (word id -> word) used to decode an encoded sequence of numbers.
reversed_word_index = dict((index, word) for word, index in word_index.items())
def decode_review(sequence):
    """Turn a sequence of word ids back into a space-separated string.

    Id 0 is skipped; ids that are missing from reversed_word_index are
    rendered as '?'.
    """
    words = []
    for token_id in sequence:
        if token_id == 0:
            continue
        words.append(reversed_word_index.get(token_id, '?'))
    return ' '.join(words)

# Round-trip check on the first training example: text -> ids -> text.
sample_sentence = train_sentences[0]
sample_encoded = train_padded[0]
print("Original Sentence: ", sample_sentence)
print("Encoded Sequence of numbers: ", sample_encoded)
print("Decoded Sequence: ", decode_review(sample_encoded))

In [None]:
# GloVe has been trained from Billions words and has several space dimensions
glove_wikipedia = False
if glove_wikipedia:
    # download and load embedding weight from GloVe! https://nlp.stanford.edu/projects/glove/ 
    # Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download):
    !wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip -O /tmp/glove.6B.100d.txt.zip
    !unzip /tmp/glove.6B.100d.txt.zip -d /tmp
    glove_filename = 'glove.6B.100d.txt'
    embedding_dim = 100
else:
    # Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download)
    !wget --no-check-certificate https://nlp.stanford.edu/data/glove.twitter.27B.zip  -O /tmp/glove.twitter.27B.zip
    !unzip /tmp/glove.twitter.27B.zip -d /tmp
    glove_filename = 'glove.twitter.27B.25d.txt'
    embedding_dim = 25

In [None]:
# loading original embedding matrix: word -> vector, read from the GloVe text file
embeddings_index = {}
# The encoding is given explicitly so reading does not depend on the platform default.
with open(f"/tmp/{glove_filename}", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # A well-formed line is the word followed by embedding_dim numbers. Skip
        # anything else (e.g. a token that is itself whitespace) instead of
        # storing a vector of the wrong size under a bogus key.
        if len(values) != embedding_dim + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# transforming the original embedding weights through our vocabulary:
# row i holds the GloVe vector of the word with index i; rows of words that
# GloVe does not know stay all-zero.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and i < vocab_size+1:
        embeddings_matrix[i] = embedding_vector

In [None]:
# Dimension of the selected GloVe vectors
print(embedding_dim)

In [None]:
# Our model definition
model = tf.keras.Sequential([
    # Embedding initialised from the GloVe matrix and fine-tuned during training.
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, weights=[embeddings_matrix], trainable=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dropout(.2),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single unit without activation: the model outputs a raw logit.
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(),
              # The model outputs logits, so the decision boundary is 0.0 (a probability
              # of 0.5). The plain 'accuracy' string thresholds the raw output at 0.5 and
              # therefore under-counts positive predictions. The metric keeps the name
              # 'accuracy' so the 'accuracy' / 'val_accuracy' history keys are unchanged.
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Training callbacks
# Stop when val_loss has not improved for 5 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')
# Divide the learning rate by 10 when val_loss stalls. The patience must be shorter
# than the early-stopping patience: with both set to 5, training stopped at the very
# epoch the learning rate would have been lowered, so the reduction never had an effect.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                        patience=2, mode='min',
                        verbose=1)
# Keep only the model with the best validation accuracy seen so far.
checkpoint_filepath = './model-best.h5'
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                      monitor='val_accuracy',
                                                      mode='max',
                                                      save_best_only=True)



In [None]:
# training
epochs_num=50
# The shuffle buffer spans the whole training set: a buffer smaller than the dataset
# only mixes examples within a sliding window, so the shuffle is not uniform.
history = model.fit(train_dataset.shuffle(len(train_padded)).batch(512),
                    epochs=epochs_num,
                    validation_data=validation_dataset.batch(512),
                    callbacks=[early_stopping, reduce_lr, model_checkpoint]
                   )

In [None]:
# show loss and accuracy
def show_loss_accuracy(history):
    """Plot training vs. validation accuracy and loss side by side.

    Parameters
    ----------
    history : object with a `history` mapping
        Must provide the per-epoch lists 'accuracy', 'val_accuracy', 'loss'
        and 'val_loss'.
    """
    # Read all four curves up front so a missing key fails before anything is drawn.
    curves = {
        key: history.history[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    epochs_range = range(len(curves['accuracy']))

    # (history key, label used in legend/title, legend position) per panel
    panels = (
        ('accuracy', 'Accuracy', 'lower right'),
        ('loss', 'Loss', 'upper right'),
    )

    plt.figure(figsize=(20, 5))
    for position, (key, label, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, curves[key], label=f'Training {label}')
        plt.plot(epochs_range, curves[f'val_{key}'], label=f'Validation {label}')
        plt.legend(loc=legend_loc)
        plt.title(f'Training and Validation {label}')
    plt.show()
# Plot the learning curves of the training run above
show_loss_accuracy(history)

In [None]:
# Restore the checkpoint written during training (best validation accuracy).
model.load_weights(checkpoint_filepath)

# The model returns logits for the test reviews ...
test_logits = model.predict(test_padded)
# ... so squash them through a sigmoid to obtain probabilities ...
test_probabilities = tf.nn.sigmoid(test_logits)
# ... and turn the probabilities into hard 0/1 labels at 0.5.
predictions = tf.where(test_probabilities < 0.5, 0, 1)

# Store the predicted label next to each test review.
test_df['sentiment'] = predictions.numpy()

In [None]:
# Preview the test DataFrame, which now carries the predicted 'sentiment' column
test_df.head()

In [None]:
# Build the submission file: everything from test_df except the review text.
# drop() returns a new DataFrame, so test_df itself is left untouched.
submission_df = test_df.drop(columns=['review'])
submission_df.to_csv('submission.csv', index=False)