In [None]:
import pandas as pd
import numpy as np

# Deep Learning Imports
import tensorflow as tf
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

# Imports for Pre-processing of Data
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [None]:
# Competition data: both CSVs are read with their 'review_id' column as the index,
# and the row count of each frame is printed right after it is loaded.
train_csv = '/kaggle/input/student-shopee-code-league-sentiment-analysis/train.csv'
df_train = pd.read_csv(train_csv, index_col='review_id')
print(len(df_train))
test_csv = '/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv'
df_test = pd.read_csv(test_csv, index_col='review_id')
print(len(df_test))

# Additional Data (thanks to Liuhh)

In [None]:
# Extra labelled reviews, indexed by 'review_id' like the competition files.
test_labelled = pd.read_csv(
    '/kaggle/input/test-labelled/test_labelled.csv',
    index_col='review_id',
)
test_labelled.shape[0]

In [None]:
# Find the labelled reviews whose text does not occur in the competition test
# set: after an outer merge with indicator=True those rows are tagged
# 'right_only'. The merge key is spelled out and only df_test's 'review'
# column takes part, so the result stays the same even if df_test later gains
# a 'rating' column.
# NOTE(review): assumes 'review' is the only column the two frames are meant
# to be matched on -- confirm against the CSV schemas.
comparison_df = df_test[['review']].merge(test_labelled, on='review',
                                          how='outer', indicator=True)
add_data = comparison_df.loc[comparison_df['_merge'] == 'right_only',
                             ['review', 'rating']]

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index=True discards the existing index, exactly as the append call did.
# Re-running this cell adds the extra rows again, so run it once per kernel.
df_train = pd.concat([df_train, add_data], ignore_index=True)
len(df_train)

# Pre-processing of reviews

In [None]:
# Shuffle the training set with a fixed seed. The train/validation split made
# further down is purely positional, so an unseeded shuffle would put
# different rows in each split on every run.
df_train = df_train.sample(frac=1, random_state=42)
sentences = df_train['review'].tolist()
labels = df_train['rating'].tolist()
df_train.head()

In [None]:
def process_sentences(sentences):
    """Tokenize, filter and stem a collection of raw text strings.

    Each sentence is tokenized with NLTK's ``TweetTokenizer`` (configured with
    ``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``).
    Tokens that are English stopwords or punctuation are dropped, and the
    remaining tokens are reduced to their Porter stem.

    Args:
        sentences: Iterable of strings to clean.

    Returns:
        A list with one entry per input sentence, in input order; each entry
        is the list of stemmed tokens kept for that sentence (possibly empty).
    """
    # These helpers do not depend on the sentence, so build them once rather
    # than once per sentence. The stopword list becomes a frozenset so each
    # membership test is a hash lookup instead of a list scan.
    stemmer = PorterStemmer()
    stopwords_english = frozenset(stopwords.words('english'))
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    sentences_clean = []
    for sentence in sentences:
        sentence_clean = [
            stemmer.stem(word)
            for word in tokenizer.tokenize(sentence)
            # `string.punctuation` is a str, so this is a substring test: it
            # also drops multi-character tokens such as "()" that appear
            # contiguously in it. Kept as-is so the output is unchanged.
            if word not in stopwords_english and word not in string.punctuation
        ]
        sentences_clean.append(sentence_clean)

    return sentences_clean

In [None]:
# Clean, tokenize and stem every training review (one token list per review).
processed_sentences = process_sentences(sentences)

In [None]:
# Sanity check: one token list is produced per input review.
len(processed_sentences)

In [None]:
# Peek at the token list of the second review to eyeball the cleaning result.
processed_sentences[1]

In [None]:
# Length (in tokens) of the longest cleaned review.
# max() over lists compares them lexicographically, so len(max(...)) would
# report the length of the alphabetically greatest token list instead.
max((len(tokens) for tokens in processed_sentences), default=0)

In [None]:
# Settings shared by the tokenization, embedding and model cells below.
embedding_dim = 300  # embedding vector size; the GloVe file loaded below is the 300-dimensional one
max_length = 13  # every token sequence is padded / truncated to this many tokens
oov_tok = "<OOV>"  # out-of-vocabulary placeholder; NOTE(review): confirm it is actually passed to the Tokenizer
training_size = len(sentences)  # number of (shuffled) training reviews
training_portion = 0.75  # fraction of the rows used for training; the remainder is the validation split

In [None]:
# Build the vocabulary from the cleaned training token lists.
# oov_token reserves an index for words that were not seen while fitting, so
# unseen words in the test reviews keep their position in the sequence
# instead of being silently dropped. `oov_tok` was defined in the config cell
# but never passed before.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(processed_sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

# Map tokens to integer ids, then pad / truncate (both at the end) so every
# review is exactly max_length ids long.
sequences = tokenizer.texts_to_sequences(processed_sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding='post',
                       truncating='post')

# Positional split: the first `training_portion` of the (already shuffled)
# rows is used for training, the rest for validation.
split_train = int(training_portion * training_size)
training_sequences, val_sequences = padded[:split_train], padded[split_train:]
training_labels = np.array(labels[:split_train])
val_labels = np.array(labels[split_train:])

In [None]:
# Row counts of the training and validation splits, in that order.
for split in (training_sequences, val_sequences):
    print(len(split))

## Pre-Trained Embeddings
- Uses the pre-trained GloVe word embeddings from Stanford (glove.6B, 300-dimensional vectors)

In [None]:
# Parse the GloVe text file: each line holds a token followed by the
# components of its vector, separated by whitespace.
embeddings_index = {}
with open('/kaggle/input/glove6b300dtxt/glove.6B.300d.txt', encoding='utf8') as glove_file:
    for raw_line in glove_file:
        parts = raw_line.split()
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

# Row i of the matrix holds the vector of the token with index i; tokens that
# have no GloVe entry (and the unused row 0) stay all-zero.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embeddings_matrix[row] = vector

## Simple Model Building + Training

In [None]:
# Frozen pre-trained embeddings -> 1D convolution -> two stacked bidirectional
# LSTMs -> softmax classifier.
model = tf.keras.Sequential([
    # Initialised from embeddings_matrix and kept fixed (trainable=False).
    tf.keras.layers.Embedding(vocab_size + 1, embedding_dim,
                              input_length=max_length,
                              weights=[embeddings_matrix],
                              trainable=False),
    tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    # return_sequences=True so the second recurrent layer receives the whole
    # sequence rather than only the final state.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, dropout=0.1, recurrent_dropout=0.2,
                             return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # NOTE(review): 6 output units presumably let ratings 1-5 be used directly
    # as class indices (class 0 unused) -- confirm against the label values.
    tf.keras.layers.Dense(6, activation='softmax'),
])

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-3)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Show the layer-by-layer architecture of the model.
model.summary()

In [None]:
#----- to save your weights if you choose to ----#
# NOTE: before uncommenting, add `import os` (it is not among the notebook's
# visible imports) and pass `callbacks=[cp_callback]` to `model.fit`.
#checkpoint_path = '/kaggle/working/sentiment_weights.ckpt'
#checkpoint_dir = os.path.dirname(checkpoint_path)
#cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
#                                                  save_weights_only=True,
#                                                  verbose=1)

In [None]:
# load any existing weights (fill in the path to the saved weights before uncommenting)
#model.load_weights('')

In [None]:
# Train for one epoch, evaluating on the validation split after the epoch.
# The closing parenthesis used to sit inside the trailing comment, which left
# the call unterminated (SyntaxError); the optional callback now has its own
# commented-out line.
history = model.fit(
    training_sequences, training_labels,
    epochs=1,
    validation_data=(val_sequences, val_labels),
    verbose=1,
    # callbacks=[cp_callback],  # enable together with the checkpoint cell above
)

# Predicting Test Set + Submission

#### Pre-processing the Test Set

In [None]:
# Run the test reviews through the same cleaning pipeline as the training data.
test_sentiments = list(df_test.review)
processed_test = process_sentences(test_sentiments)

# Longest cleaned test review, in tokens. The previous len(max(...)) measured
# the lexicographically greatest token list, and its value was discarded
# because it was not the last expression of the cell; print it instead.
print(max((len(tokens) for tokens in processed_test), default=0))

# Encode with the tokenizer fitted on the training data and pad / truncate to
# the same fixed length the model was trained on.
sequences_test = tokenizer.texts_to_sequences(processed_test)
padded_test = pad_sequences(sequences_test, maxlen=max_length, padding='post',
                            truncating='post')
padded_test

#### Predicting Test Set

In [None]:
# One row of class probabilities per test review; the predicted class is the
# index of the largest probability in each row.
predictions = model.predict(padded_test)
preds = np.argmax(predictions, axis=-1)
preds

In [None]:
# Build the submission from a copy instead of writing a 'rating' column into
# df_test. The earlier outer merge of df_test with test_labelled joins on all
# shared columns, so mutating df_test here would change what that cell
# produces if it is ever re-run.
submission = df_test.assign(rating=preds)[['rating']].reset_index()
submission