In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; DATA_DIR (set in
# the configuration cell) points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as kr
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')


# --- Run configuration -------------------------------------------------------
NUM_CLASSES = 5   # width of the final softmax layer (Sentiment takes 5 values)
EPOCHS = 50       # upper bound on training epochs passed to model.fit
BATCHS = 128      # batch_size passed to model.fit
PATIENCE = 15     # EarlyStopping patience, in epochs
SEED = 57         # shared by tf.random.set_seed and train_test_split


# Seed TensorFlow's global random generator.
tf.random.set_seed(SEED)
# Directory on the mounted Drive that holds train.tsv / test.tsv.
DATA_DIR = '/content/drive/MyDrive/datasets/sentiment-analysis-on-movie-reviews'

In [3]:
! ls /content/drive/MyDrive/datasets/sentiment-analysis-on-movie-reviews

sampleSubmission.csv  test.tsv	train.tsv


In [4]:
# Load the labelled (train + validation) and unlabelled (test) phrase tables,
# both tab-separated and indexed by PhraseId.
df_tv = pd.read_csv(
    f'{DATA_DIR}/train.tsv',
    index_col='PhraseId',
    sep='\t',
)
df_tt = pd.read_csv(
    f'{DATA_DIR}/test.tsv',
    index_col='PhraseId',
    sep='\t',
)

In [5]:
# Preview the first rows of the labelled frame.
df_tv.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [6]:
# Distinct label values present in the Sentiment column.
df_tv.Sentiment.unique()

array([1, 2, 3, 4, 0])

In [7]:
# SentenceId is not used as a feature, so drop it from both frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
df_tv = df_tv.drop(columns=['SentenceId'], errors='ignore')
df_tt = df_tt.drop(columns=['SentenceId'], errors='ignore')
df_tv.columns

Index(['Phrase', 'Sentiment'], dtype='object')

In [8]:
def clean_text(s):
  """Normalize a phrase to lowercase, space-separated alphabetic words.

  The text is lowercased, apostrophes are removed (so "it's" becomes "its"),
  and it is split on whitespace. Only tokens made up entirely of letters are
  kept; punctuation, numbers and mixed tokens such as "well-made" are dropped.

  Args:
    s: The raw phrase. Non-string values (e.g. the float NaN pandas uses for
      a missing cell, or None) are treated as empty text.

  Returns:
    The cleaned phrase as a single string with tokens joined by one space;
    an empty string when nothing survives the filter.
  """
  if not isinstance(s, str):
    # Missing phrases arrive as NaN/None; calling .lower() on them would raise.
    return ''
  s = s.lower().replace("'", "")
  return ' '.join(w for w in s.split() if w.isalpha())

In [9]:
# Replace every phrase with its cleaned form in both frames.
df_tv['Phrase'] = df_tv['Phrase'].apply(clean_text)
df_tt['Phrase'] = df_tt['Phrase'].apply(clean_text)

In [10]:
# Hold out 20% of the labelled data for validation, stratified on the label
# and seeded so the split is repeatable.
df_tr, df_vl = train_test_split(
    df_tv,
    test_size=0.2,
    random_state=SEED,
    stratify=df_tv['Sentiment'],
)
df_tr.shape, df_vl.shape

((124848, 2), (31212, 2))

In [11]:
# Build the word index from the training phrases only; the validation split
# is not seen when the vocabulary is fitted.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_tr.Phrase)

In [12]:
# Encode each phrase as a list of integer word indices using the fitted
# tokenizer.
xtr = tokenizer.texts_to_sequences(df_tr['Phrase'])
xvl = tokenizer.texts_to_sequences(df_vl['Phrase'])

In [13]:
# Number of distinct words the tokenizer counted in the training phrases.
len(tokenizer.word_counts)

14657

In [14]:
# Length of the longest encoded training sequence.
max(map(len, xtr))

48

In [15]:
# Model input hyperparameters.
input_length = 64        # every sequence is padded/truncated to this length
embedding_length = 128   # dimensionality of each word embedding
vocab_size = 15000       # number of rows in the embedding table

# The tokenizer numbers words from 1, so the largest index it can emit equals
# len(word_index). The embedding table must have a row for that index; fail
# here with a clear message instead of inside the model if it does not.
assert len(tokenizer.word_index) < vocab_size, (
    f'vocab_size={vocab_size} is too small for '
    f'{len(tokenizer.word_index)} tokenizer words'
)

In [16]:
# Bring both splits to fixed-width integer matrices of `input_length` columns
# (pad_sequences defaults are used for the padding/truncation side).
xtr, xvl = (
    pad_sequences(sequences, maxlen=input_length)
    for sequences in (xtr, xvl)
)
xtr.shape, xvl.shape

((124848, 64), (31212, 64))

In [17]:
# Classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
model = kr.models.Sequential(
    layers=[
        kr.layers.Input(shape=(input_length,)),
        # Map each word index to a dense vector of size `embedding_length`.
        kr.layers.Embedding(vocab_size, embedding_length),
        # First recurrent layer emits the full sequence for the next LSTM.
        kr.layers.Bidirectional(
            layer=kr.layers.LSTM(units=64, return_sequences=True)
        ),
        # Second recurrent layer keeps only its final output per direction.
        kr.layers.Bidirectional(layer=kr.layers.LSTM(units=64)),
        kr.layers.Dense(units=512, activation='relu'),
        kr.layers.Dropout(rate=0.2),
        # One probability per sentiment class.
        kr.layers.Dense(units=NUM_CLASSES, activation='softmax'),
    ]
)

In [18]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 128)           1920000   
                                                                 
 bidirectional (Bidirectiona  (None, 64, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 512)               66048     
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 2

In [19]:
# Integer class labels are fed directly, hence the sparse cross-entropy loss.
model.compile(
    loss=kr.losses.SparseCategoricalCrossentropy(),
    optimizer=kr.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

In [20]:
# Stop once validation accuracy has not improved for PATIENCE epochs and roll
# the model back to the best weights seen.
early_stopping = kr.callbacks.EarlyStopping(
    monitor='val_accuracy',
    restore_best_weights=True,
    patience=PATIENCE,
)

In [21]:
# Train on the padded training sequences, validating on the held-out split
# after every epoch; early stopping may end the run before EPOCHS is reached.
model.fit(
    xtr,
    df_tr.Sentiment,
    validation_data=(xvl, df_vl.Sentiment),
    epochs=EPOCHS,
    batch_size=BATCHS,
    shuffle=True,
    verbose=True,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.callbacks.History at 0x7f438a442150>

In [22]:
# Score the trained model on the validation split; returns [loss, accuracy]
# in the order given at compile time.
model.evaluate(xvl, df_vl.Sentiment)



[0.8361499309539795, 0.6692618131637573]