In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, entry) for entry in files_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import callbacks, models, layers
import matplotlib.pyplot as plt

# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# bag of words


In [None]:
# Directory that the TSV archives are read from in the cells below.
BASE = "/kaggle/input/word2vec-nlp-tutorial"

# Passed to the Keras Tokenizer as its `num_words` argument.
MAX_WORDS = 25000

In [None]:
# Download the NLTK 'stopwords' corpus, then import the corpus reader that
# `clean` uses to build its English stopword list.
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Read the labelled training reviews from the tab-separated archive.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
train_path = os.path.join(BASE, 'labeledTrainData.tsv.zip')
train = pd.read_csv(train_path, sep="\t", header=0, quoting=3)
train.shape

In [None]:
# Read the test reviews with the same parsing options as the training file.
test_path = os.path.join(BASE, 'testData.tsv.zip')
test = pd.read_csv(test_path, sep="\t", header=0, quoting=3)
test.shape

In [None]:
# Show the label, then the raw text, of the row whose index label is 0.
first_label = train.loc[0, "sentiment"]
print(first_label)
train.loc[0, "review"]

In [None]:
# English stopword list from NLTK, kept under its original name for any other
# cell that reads it. The frozenset copy gives O(1) membership tests inside
# `clean`, instead of scanning the list once per token.
stop_words = stopwords.words("english")
_stop_word_set = frozenset(stop_words)


def clean(review):
    """Normalise one raw review into a space-separated string of kept words.

    Steps, in order: extract the text with BeautifulSoup, replace every
    character that is not an ASCII letter with a space, lowercase, split on
    whitespace, and drop words found in the English stopword list.

    Args:
        review: Raw review text (may contain markup).

    Returns:
        The remaining lowercase words joined by single spaces.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # left unchanged to avoid altering the extracted text.
    clean_html = BeautifulSoup(review).get_text()
    clean_non_letters = re.sub("[^a-zA-Z]", " ", clean_html)
    words = clean_non_letters.lower().split()
    cleaned_words = [w for w in words if w not in _stop_word_set]
    return " ".join(cleaned_words)

# Store the cleaned text of every training review in a new column, then
# display the frame.
train["cleaned_review"] = train["review"].map(clean)
train

# Tokenization approach

In [None]:
# Fit a Keras Tokenizer on the cleaned training reviews.
# With no constraints there are 74_066 words in the training set.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train["cleaned_review"])

# NOTE(review): the +1 presumably accounts for index 0, which Keras does not
# assign to any word. `word_index` appears to hold every word seen rather than
# only the top MAX_WORDS - confirm before using this as the embedding size.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
total_words

In [None]:
# Convert each cleaned review to a list of word indices.
sequences = tokenizer.texts_to_sequences(train["cleaned_review"])

# Left-pad every sequence to the length of the longest training review.
max_sequence_len = max(map(len, sequences))
padded = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')
padded_sequences = np.array(padded)
labels = np.array(train["sentiment"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(padded_sequences, labels, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = split_parts
print(*(part.shape for part in (X_train, y_train, X_val, y_val)))

In [None]:
def to_dataset(data, labels):
    """Wrap feature and label arrays in a tf.data pipeline.

    The pipeline caches the elements, shuffles them, groups them into batches
    of 32 and prefetches batches.

    Args:
        data: Array-like of features; its first dimension is the sample count.
        labels: Array-like of labels aligned with `data`.

    Returns:
        A `tf.data.Dataset` yielding `(data_batch, label_batch)` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    # Size the shuffle buffer from the data actually passed in, not from the
    # notebook-global X_train, so the function works for any dataset and the
    # buffer always covers every element.
    shuffle_buffer = len(data) + 1
    dataset = (
        dataset.cache()
        .shuffle(shuffle_buffer)
        .batch(32)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )
    return dataset
# Build the input pipelines for the training and validation splits.
train_ds = to_dataset(data=X_train, labels=y_train)
val_ds = to_dataset(data=X_val, labels=y_val)

In [None]:
# Number of units in each LSTM layer; also embedded in the saved model names.
LSTM_SIZE = 8


def bi_lstm_model():
    """Build and compile the bidirectional-LSTM binary classifier.

    Architecture: Embedding(total_words, 16) -> Bidirectional(LSTM(LSTM_SIZE))
    -> Dense(1, sigmoid), compiled with binary cross-entropy, Adam and the
    `binary_accuracy` metric.

    Returns:
        A `(model, name)` tuple, where `name` is `bidirectional_lstm_<LSTM_SIZE>`.
    """
    model = models.Sequential()
    # Inputs are padded to exactly max_sequence_len tokens, so the declared
    # input length must match it (the previous `- 1` was off by one and did
    # not match the shape of the data fed to the model).
    model.add(layers.Embedding(total_words, 16, input_length=max_sequence_len))
    model.add(layers.Bidirectional(layers.LSTM(LSTM_SIZE)))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model, f'bidirectional_lstm_{LSTM_SIZE}'

def lstm_model():
    """Build and compile the single-direction LSTM binary classifier.

    Architecture: Embedding(total_words, 4) -> LSTM(LSTM_SIZE) ->
    Dense(1, sigmoid), compiled with binary cross-entropy, Adam and the
    `binary_accuracy` metric.

    Returns:
        A `(model, name)` tuple, where `name` is `lstm_<LSTM_SIZE>`.
    """
    model = models.Sequential()
    # Inputs are padded to exactly max_sequence_len tokens, so the declared
    # input length must match it (the previous `- 1` was off by one and did
    # not match the shape of the data fed to the model).
    model.add(layers.Embedding(total_words, 4, input_length=max_sequence_len))
    model.add(layers.LSTM(LSTM_SIZE))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model, f'lstm_{LSTM_SIZE}'

In [None]:
def tokenizer_train(model, name):
    """Fit `model` on the notebook's `train_ds`, validate on `val_ds`, and save it.

    Training runs for up to 100 epochs. The learning rate is halved after
    3 epochs without `val_loss` improvement, and training stops after 6 such
    epochs with the best weights restored.

    Args:
        model: A compiled Keras model.
        name: Directory name used under `/kaggle/working/` when saving.

    Returns:
        The `History` object returned by `model.fit`.
    """
    # `monitor` was previously misspelled as `monior`; the callback accepted
    # the unknown keyword silently, so the intended metric was never set
    # explicitly.
    reducer = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, mode='min', cooldown=1)
    stopper = callbacks.EarlyStopping(monitor='val_loss', patience=6, mode='min', restore_best_weights=True)
    hist = model.fit(train_ds,
              epochs=100,
              verbose=1,
              callbacks=[stopper, reducer],
              validation_data=val_ds)
    # Evaluate after fitting, i.e. with whatever weights EarlyStopping left in place.
    results = model.evaluate(val_ds)
    model.save(f'/kaggle/working/{name}')
    print(f"results: {results}, type: {type(results)}")
    return hist

In [None]:
# Build the single-direction LSTM variant and print its layer summary.
built = lstm_model()
model = built[0]
name = built[1]
model.summary()

In [None]:
# Train the selected model and keep the fit history for the plots below.
hist = tokenizer_train(model=model, name=name)

In [None]:
# Three stacked panels: loss, accuracy and learning rate per epoch.
fig, axs = plt.subplots(3, 1, figsize=(8, 8), tight_layout=True)

# Panel 1: training vs. validation loss.
axs[0].plot(hist.history['loss'])
axs[0].plot(hist.history['val_loss'])
axs[0].set_title('binary_crossentropy Loss')
axs[0].set_ylabel('Loss')
axs[0].set_xlabel('Epoch')
axs[0].legend(['train', 'val'], loc='upper right')

# Panel 2: training vs. validation accuracy. The y-axis shows accuracy, so it
# is labelled as such (it was previously mislabelled 'Error').
axs[1].plot(hist.history['binary_accuracy'])
axs[1].plot(hist.history['val_binary_accuracy'])
axs[1].set_title('binary_accuracy Metric')
axs[1].set_ylabel('Accuracy')
axs[1].set_xlabel('Epoch')
axs[1].legend(['train', 'val'], loc='upper left')

# Panel 3: learning rate as recorded by ReduceLROnPlateau. Older tf.keras
# releases log it under 'lr' while Keras 3 uses 'learning_rate'; accept either
# so the cell does not fail with a KeyError.
lr_history = hist.history.get('lr', hist.history.get('learning_rate', []))
axs[2].plot(lr_history)
axs[2].set_title('Learning Rate')
axs[2].set_ylabel('LR')
axs[2].set_xlabel('Epoch')

plt.savefig(f'/kaggle/working/{name}_graphs.png')
plt.show()

In [None]:
# Clean the test reviews with the same function used for the training data,
# then display the frame.
test["cleaned_review"] = test["review"].map(clean)
test

In [None]:
# Encode the test reviews with the tokenizer fitted on the training data and
# left-pad them to the training sequence length.
sequences = tokenizer.texts_to_sequences(test["cleaned_review"])
padded_test = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')
test_sequences = np.array(padded_test)
print(test_sequences.shape)

In [None]:
# Run the model on the padded test sequences and flatten the output to 1-D.
raw_predictions = model.predict(test_sequences)
predictions = raw_predictions.flatten()

In [None]:
# Display the shape of the flattened prediction array.
np.shape(predictions)

In [None]:
# Pair each test id with its predicted value and write the result as CSV.
# quoting=3 is csv.QUOTE_NONE, so fields are written without added quotes.
submission_columns = {"id": test["id"], "sentiment": predictions}
output = pd.DataFrame(submission_columns)
output.to_csv("word_tokenization_model.csv", quoting=3, index=False)