In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# extract zip files
import zipfile, os
input_dir = '/kaggle/input/word2vec-nlp-tutorial/'
work_dir = '/kaggle/working/'
zip_files = ['labeledTrainData.tsv.zip', 'testData.tsv.zip']
for zf in zip_files:
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(os.path.join(input_dir, zf), 'r') as archive:
        # Extract into work_dir explicitly (rather than the current directory)
        # because the following cells read the .tsv files from work_dir.
        archive.extractall(work_dir)

# Show what is now available in the working directory.
os.listdir(work_dir)

In [None]:
# Load the labelled training data (tab-separated) and preview the first rows.
train_tsv_path = os.path.join(work_dir, 'labeledTrainData.tsv')
train_df = pd.read_csv(train_tsv_path, sep='\t')
train_df.head()

In [None]:
# Load the test data (tab-separated) and preview the first rows.
test_tsv_path = os.path.join(work_dir, 'testData.tsv')
test_df = pd.read_csv(test_tsv_path, sep='\t')
test_df.head()

In [None]:
# Report the (rows, columns) shape of both frames.
for caption, frame in (
    ("Train dataset Shape:", train_df),
    ("Test dataset Shape:", test_df),
):
    print(caption, frame.shape)

In [None]:
# train dataframe info: prints a summary of the frame
# (columns, non-null counts, dtypes, memory usage)
train_df.info()

In [None]:
# test dataframe info: prints a summary of the frame
# (columns, non-null counts, dtypes, memory usage)
test_df.info()

In [None]:
# Sentiment class statistics: number of training rows per label value.
sentiment_counts = train_df['sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Sequential 80:20 train/validation split: the first 80% of the rows are used
# for training, the remaining rows for validation (no shuffling happens here).
split_perc = 0.8
split_at = int(len(train_df) * split_perc)

train_part = train_df.iloc[:split_at]
validation_part = train_df.iloc[split_at:]

train_sentences, train_labels = train_part['review'], train_part['sentiment']
validation_sentences, validation_labels = validation_part['review'], validation_part['sentiment']

# The test frame is used as-is for its review texts.
test_sentences = test_df['review']


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
# Load NLTK's English stop-word list. NLTK raises LookupError when the
# "stopwords" corpus is not installed, so fetch it once and retry; if the
# download is not possible the retry raises the same LookupError as before.
try:
    english_stopwords = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    english_stopwords = stopwords.words("english")

# cleaning sentences
def clean_sentences(sentences):
    """Normalise raw review texts for the text model.

    Each sentence is lower-cased, stripped of HTML markup and URLs, reduced to
    ASCII letters only, split on whitespace and filtered against
    ``english_stopwords``. The surviving words are re-joined with single
    spaces.

    Args:
        sentences: iterable of ``str`` (e.g. a pandas Series of reviews).

    Returns:
        list of ``str``: one cleaned text per input, in the input order.
    """
    # Set membership is O(1) per word; built once for the whole batch.
    stopword_set = set(english_stopwords)
    # Raw strings avoid invalid-escape warnings; compiled once, reused per row.
    url_pattern = re.compile(r"https?://[\w+./]+")
    non_letter_pattern = re.compile(r"[^a-zA-Z]")

    out = []
    for sentence in sentences:
        # Lowering
        sentence = sentence.lower()
        # Removing html; the parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed.
        sentence = BeautifulSoup(sentence, "html.parser").get_text()
        # Removing Urls
        sentence = url_pattern.sub(" ", sentence)
        # Remove non-letters
        sentence = non_letter_pattern.sub(" ", sentence)
        # Removing stop words token by token. Substring replacement of
        # " word " misses stop words at the very start/end of the text and
        # adjacent stop words that share a single separating space.
        kept_words = [word for word in sentence.split() if word not in stopword_set]
        out.append(" ".join(kept_words))
    return out

# Clean the three text collections in order (train, validation, test) and
# rebind each name to its cleaned list.
train_sentences, validation_sentences, test_sentences = (
    clean_sentences(raw_texts)
    for raw_texts in (train_sentences, validation_sentences, test_sentences)
)

In [None]:
# Sanity check: print the first cleaned training review (as a one-element list).
print(train_sentences[:1])

In [None]:
import tensorflow as tf
import tensorflow_hub as hub 

In [None]:
# for performances: let tf.data choose the prefetch buffer size
AUTOTUNE = tf.data.AUTOTUNE


def _to_cached_dataset(texts, labels):
    """Build a cached, prefetching tf.data.Dataset of (text, label) pairs."""
    pairs = tf.data.Dataset.from_tensor_slices((texts, labels.to_numpy()))
    return pairs.cache().prefetch(buffer_size=AUTOTUNE)


# converting to TensorFlow Dataset
train_dataset = _to_cached_dataset(train_sentences, train_labels)
validation_dataset = _to_cached_dataset(validation_sentences, validation_labels)


In [None]:
# Transfer learning: a text-embedding layer loaded from TF Hub. It consumes
# raw strings (scalar string input) and is fine-tuned together with the
# classifier head because trainable=True.
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2",
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Our model definition: embedding -> small L2-regularised hidden layer ->
# dropout -> a single output unit producing a logit.
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(
    tf.keras.layers.Dense(
        16,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
)
model.add(tf.keras.layers.Dropout(.2))
model.add(tf.keras.layers.Dense(1))

# from_logits=True matches the final Dense(1), which has no activation.
binary_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
adam = tf.keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(loss=binary_loss, optimizer=adam, metrics=['accuracy'])


In [None]:
# Training callbacks.
#
# Stop training once val_loss has not improved for 5 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Cut the learning rate by 10x when val_loss plateaus. Its patience must be
# shorter than early_stopping's: with equal patience both callbacks normally
# trigger on the same epoch, so training stops before the reduced learning
# rate is ever used.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                                 patience=2, mode='min',
                                                 verbose=1)

# Keep only the checkpoint with the highest validation accuracy seen so far;
# a later cell reloads it from checkpoint_filepath.
checkpoint_filepath = './model-best.h5'
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                      monitor='val_accuracy',
                                                      mode='max',
                                                      save_best_only=True)


In [None]:
# training
# epochs_num is an upper bound: the early-stopping callback may end the run sooner.
epochs_num = 100
batch_size = 512
# A shuffle buffer as large as the training set shuffles it uniformly on every
# epoch; a smaller fixed buffer only mixes examples that are close together.
shuffle_buffer_size = len(train_sentences)

history = model.fit(
    train_dataset.shuffle(shuffle_buffer_size).batch(batch_size),
    epochs=epochs_num,
    validation_data=validation_dataset.batch(batch_size),
    callbacks=[early_stopping, reduce_lr, model_checkpoint],
)

In [None]:
# show loss and accuracy
import matplotlib.pyplot as plt
def show_loss_accuracy(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            per-epoch sequences 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (for example the value returned by ``model.fit``).
    """
    record = history.history
    train_acc, valid_acc = record['accuracy'], record['val_accuracy']
    train_loss, valid_loss = record['loss'], record['val_loss']

    epochs_range = range(len(train_acc))

    # One entry per panel: (position, curves, legend location, title).
    panels = (
        (
            1,
            ((train_acc, 'Training Accuracy'), (valid_acc, 'Validation Accuracy')),
            'lower right',
            'Training and Validation Accuracy',
        ),
        (
            2,
            ((train_loss, 'Training Loss'), (valid_loss, 'Validation Loss')),
            'upper right',
            'Training and Validation Loss',
        ),
    )

    plt.figure(figsize=(20, 5))
    for position, curves, legend_loc, title in panels:
        plt.subplot(1, 2, position)
        for values, label in curves:
            plt.plot(epochs_range, values, label=label)
        plt.legend(loc=legend_loc)
        plt.title(title)
    plt.show()
# Plot the accuracy and loss curves recorded during model.fit.
show_loss_accuracy(history)

In [None]:
# Restore the weights of the best checkpoint saved during training.
model.load_weights(checkpoint_filepath)

# Raw model outputs (logits) for the cleaned test reviews.
test_logits = model.predict(test_sentences)
# The model returns logits, so squash them into probabilities first.
test_probabilities = tf.nn.sigmoid(test_logits)
# Threshold at 0.5: below -> class 0, otherwise -> class 1.
predictions = tf.where(test_probabilities < 0.5, 0, 1)

# Attach the predicted class to the test frame.
test_df['sentiment'] = predictions.numpy()

In [None]:
# Preview the first rows of test_df, which now carries the predicted 'sentiment' column.
test_df.head()

In [None]:
# Submission csv creation: every column of test_df except the review text,
# written without the index column.
submission_df = test_df.drop(columns=['review'])
submission_df.to_csv('submission.csv', index=False)