In [1]:
import numpy as np
import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
nltk.download('wordnet')
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import contractions
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing import text, sequence
from keras.models import Sequential 
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from keras.optimizers import Adam

lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read train data
train = pd.read_csv("/content/drive/MyDrive/data/imdb_train.csv")

# Read test data
test = pd.read_csv("/content/drive/MyDrive/data/imdb_test.csv")

In [3]:
def clean_input_data(txt):
        TAG_RE = re.compile(r'<[^>]+>')                                  # html codes
        txt = TAG_RE.sub('', txt.lower())

        txt=txt.encode("ascii","ignore")                                 # emojis
        txt=txt.decode()

        txt=''.join(i for i in txt if not i.isdigit())                   # numbers
        txt = re.sub(r'[^\w\s]', ' ', txt)                               # punctuations

        txt = ' '.join([i for i in txt.split() if not i in STOPWORDS])   # stopwords

        txt=' '.join([i for i in txt.split() if len(i)>2])

        txt=contractions.fix(txt)

        txt=lemmatizer.lemmatize(txt)                                    # lematization
        return txt

In [4]:
train['text']=train['text'].apply(clean_input_data)
test['text']=test['text'].apply(clean_input_data)

In [5]:
# shuffle the train and test data
from sklearn.utils import shuffle

shuffle(train)
shuffle(test)

Unnamed: 0,text,label
286,first movie ever saw life back years old time ...,1
21974,tagline lucky ones died watching never watched...,0
4144,first entire script mostly improv adding fanta...,1
20883,saw cinema initial release ask world gone mad ...,0
4221,watched gundam time much better gundam wing wa...,1
...,...,...
1407,pleasure seeing saltimbanco live seeing video ...,1
8522,bizarre experiment astronaut abandoned moon al...,1
22428,film unbelievable level fails action film one ...,0
7614,camerawork certainly funky perhaps one dutch r...,1


In [6]:
#prepare the data for modelling

x_train = train.text
y_train = train.label
x_test = test.text
y_test = test.label

In [7]:
from keras.preprocessing import text, sequence
max_features = 10000
max_len = 128

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

In [8]:
from keras.utils import pad_sequences

tokenized_train = tokenizer.texts_to_sequences(x_train)
X_train = pad_sequences(tokenized_train, maxlen=max_len)

tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test = pad_sequences(tokenized_test, maxlen=max_len) 

In [9]:
# Set the path to the GloVe embeddings file
GLOVE_EMB_DIR = '/content/drive/MyDrive/data/glove.6B.50d.txt'

In [10]:
def get_coeffs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_dict = dict(get_coeffs(*o.rstrip().rsplit(' ')) for o in open(GLOVE_EMB_DIR))

In [11]:
all_embs = np.stack(embeddings_dict.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index))

embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, embed_size))

for word, i in word_index.items():
    
    if i >= num_words: continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if (await self.run_code(code, result,  async_=asy)):


In [12]:
max_features = 10000
max_len = 128
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=2, verbose=1, factor=0.5, min_lr=0.00001)

In [13]:
model = Sequential()

model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Bidirectional(LSTM(units=128)))
model.add(Dropout(rate=0.8))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer=Adam(lr=0.002), loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(name, **kwargs)


In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 50)           500000    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              183296    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 16)                4112      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 687,425
Trainable params: 187,425
Non-trainable params: 500,000
____________________________________________

In [15]:
batch_size = 256
epochs=10
embed_size=50

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), callbacks=[learning_rate_reduction])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred = (y_pred > 0.6)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85     12500
           1       0.89      0.78      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.85      0.84      0.84     25000
weighted avg       0.85      0.84      0.84     25000

