In [51]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import re
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense
from keras.models import Sequential,load_model

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stop_words = set(stopwords.words('english'))

In [50]:
PATH = '/content/drive/MyDrive/NLP/NLP_projects/Classification/'

In [6]:
def load_data(path):
    df = pd.read_csv(path)
    X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], stratify=df['sentiment'],
                                                    test_size=0.33,
                                                    shuffle=True, 
                                                    random_state=42)
    train = pd.concat([X_train,y_train],axis=1)
    test = pd.concat([X_test,y_test],axis=1)
    return train,test

In [7]:
train,test=load_data(PATH)

In [8]:
train.head()

Unnamed: 0,review,sentiment
44272,"Sure, it had some of the makings of a good fil...",negative
33427,This mini-series is iconic of the Australian s...,positive
36331,When you see the cover of the DVD you're convi...,positive
25718,"When I saw this ""documentary"", I was disappoin...",negative
40140,Abysmal Indonesian action film from legendary ...,negative


In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<br />',' ',text)
    text = re.sub(r'[^A-Za-z0-9]',' ',text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [10]:
train['cleaned_review']=train.review.apply(lambda s:clean_text(s))
test['cleaned_review']=test.review.apply(lambda s:clean_text(s))

In [11]:
lb=LabelBinarizer()
train['sentiment']=lb.fit_transform(train['sentiment'])
test['sentiment']=lb.transform(test['sentiment'])

In [12]:
X_train, X_val, y_train, y_val = train_test_split(train['cleaned_review'], train['sentiment'],
                                                  stratify=train['sentiment'],
                                                  test_size=0.2,
                                                  random_state=333)

## <b> Keras Preprocessing and Tokenization

In [13]:
max_len = train['cleaned_review'].apply(lambda s: len(str(s).split())).max()

In [14]:
tokenizer=Tokenizer() #num_words=None)

In [15]:
tokenizer.fit_on_texts(list(X_train)+list(X_val))

In [57]:
tokenizer.fit_on_texts(list(test['cleaned_review']))

In [16]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [58]:
X_test_seq = tokenizer.texts_to_sequences(test['cleaned_review'])

In [17]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

In [59]:
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [18]:
word_idx = tokenizer.word_index

In [19]:
len(word_idx)

85253

## <b>Simple RNN 

In [20]:
# model_rnn=Sequential()

In [21]:
# model_rnn.add(Embedding(input_dim=len(word_idx)+1,
#                         output_dim=300,
#                         input_length=max_len
#                         ))
# model_rnn.add(SimpleRNN(100))
# model_rnn.add(Dense(1,activation='sigmoid'))

# model_rnn.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])

In [22]:
# model_rnn.fit(X_train_pad,y_train,epochs=5,batch_size=64)

In [23]:
# model_rnn.save('/content/drive/MyDrive/NLP/NLP_projects/Classification/model_rnn')

In [25]:
model_rnn=load_model('/content/drive/MyDrive/NLP/NLP_projects/Classification/model_rnn')

In [27]:
pred_rnn = model_rnn.predict(X_val_pad)

In [42]:
print(f"The ROC_AUC score is {roc_auc_score(y_val,pred_rnn)}")

The ROC_AUC score is 0.9034353308086435


In [49]:
scores_rnn = model_rnn.evaluate(X_val_pad,y_val)
print(f"The accuracy score is {scores_rnn[1]*100}")

The accuracy score is 83.05970430374146


## <b>LSTM

In [43]:
model=Sequential()

In [44]:
model.add(Embedding(input_dim=len(word_idx)+1,
                        output_dim=300,
                        input_length=max_len
                        ))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1435, 300)         25576200  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 25,736,701
Trainable params: 25,736,701
Non-trainable params: 0
_________________________________________________________________


In [45]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [46]:
model.fit(X_train_pad,y_train,epochs=5,batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f31040c8590>

In [47]:
pred_lstm = model.predict(X_val_pad)
print(f"The ROC_AUC score is {roc_auc_score(y_val,pred_lstm)}")

The ROC_AUC score is 0.939735442192025


In [48]:
scores_lstm = model.evaluate(X_val_pad,y_val)
print(f"The accuracy score is {scores_lstm[1]*100}")

The accuracy score is 87.14925646781921


## <b>Models with pretrained Glove Embedding

We will use a pretrained embedding from Glove of 50 dimensions for ease of loading

In [52]:
embeddings_index = {}
f = open(os.path.join(PATH, 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [53]:
embedding_matrix = np.zeros((len(word_idx) + 1, 50))
for word, i in word_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [54]:
model=Sequential()

model.add(Embedding(input_dim=len(word_idx)+1,
                    output_dim = 50,
                    input_length=max_len,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
              )

In [55]:
model.fit(X_train_pad,
          y_train,
          epochs=5,
          batch_size=128,
          validation_data=(X_val_pad,y_val)
          )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3048c70a50>

In [61]:
pred_emb_lstm = model.predict(X_test_pad)
print(f"The ROC_AUC score is {roc_auc_score(test['sentiment'],pred_emb_lstm)}")

The ROC_AUC score is 0.6141522497704315


In [62]:
scores_emb_lstm = model.evaluate(X_test_pad,test['sentiment'],verbose=0)
print(f"The accuracy score is {scores_emb_lstm[1]*100}")

The accuracy score is 57.52727389335632


Clear signs of overfitting, hence we train the model again with dropout

## <b> LSTM Model with dropout on Glove Embedding

In [67]:
model=Sequential()

model.add(Embedding(input_dim=len(word_idx)+1,
                    output_dim = 50,
                    input_length=max_len,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(LSTM(100,dropout=0.3))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
              )

In [68]:
model.fit(X_train_pad,
          y_train,
          epochs=10,
          batch_size=128,
          validation_data=(X_val_pad,y_val)
          )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f304826c310>

In [69]:
pred_emb_drp = model.predict(X_test_pad)
print(f"The ROC_AUC score is {roc_auc_score(test['sentiment'],pred_emb_drp)}")

The ROC_AUC score is 0.6226493296602388


In [70]:
scores_emb_drp = model.evaluate(X_test_pad,test['sentiment'],verbose=0)
print(f"The accuracy score is {scores_emb_drp[1]*100}")

The accuracy score is 58.454543352127075
