In [2]:
import os
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
import random
# from tensorflow import set_random_seed
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Show the installed TensorFlow version so the run environment is recorded with the notebook.
print(tf.__version__)

2.0.0-alpha0


In [4]:
# Read the train and test CSV files into DataFrames (paths are relative to the notebook).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
)

In [5]:
# Preview the first ten training rows.
train_data.head(10)

Unnamed: 0,Id,Review,Score
0,1,\n So this is what it takes bands nowadays to ...,0.0
1,2,\nTotal Strife Forever is an eclectic electron...,3.0
2,3,\n Sonically this is closest to Isolation Dril...,2.0
3,4,\n Informative and representative of the time....,4.0
4,5,"\n Things like ""streamlined"" and ""accessible"" ...",2.0
5,6,\n Vince Gill is an amazing artist - magnifice...,4.0
6,7,\n theres nuthin really much to argue about bj...,4.0
7,8,\nAOTY... Production is insane. Cudi is back t...,4.0
8,9,\n This band is on the rise. From the minute y...,4.0
9,10,\nA solid piece of work that features Lorde at...,2.0


In [6]:
# (rows, columns) of the training frame.
train_data.shape

(35833, 3)

In [7]:
# (rows, columns) of the test frame.
test_data.shape

(17650, 2)

In [8]:
# Text-cleaning dependencies: regex, HTML stripping, tokenisation and lemmatisation.
import re

import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Single shared lemmatizer instance reused by the cleaning code.
lemmatizer = WordNetLemmatizer()


In [9]:
def clean_sentences(df):
    """Turn the ``'Review'`` column of ``df`` into lists of lemmatized tokens.

    Each review is stripped of HTML markup, every non-alphabetic character is
    replaced by a space, the text is lower-cased and tokenized, and each token
    is lemmatized.

    Parameters
    ----------
    df : object indexable by ``'Review'``
        Container whose ``'Review'`` entry is an iterable of review texts.

    Returns
    -------
    list of list of str
        One token list per review, in the same order as ``df['Review']``.
        A missing review (None/NaN) yields an empty token list so the output
        stays aligned with the input rows.
    """
    # Compile once instead of re-resolving the pattern on every review.
    non_alpha = re.compile("[^a-zA-Z]")
    reviews = []

    for sent in tqdm(df['Review']):
        # Missing values would otherwise crash BeautifulSoup; other non-string
        # values are cleaned via their string form.
        if not isinstance(sent, str):
            sent = "" if pd.isna(sent) else str(sent)

        # Remove HTML content. The parser is named explicitly so the result
        # does not depend on which optional parsers are installed.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace non-alphabetic characters with spaces.
        review_text = non_alpha.sub(" ", review_text)

        # Tokenize the lower-cased text.
        words = word_tokenize(review_text.lower())

        # Lemmatize each token.
        reviews.append([lemmatizer.lemmatize(word) for word in words])

    return reviews

In [10]:
# Run the cleaning step over the train set first, then the test set,
# and report how many token lists each produced.
train_sentences, test_sentences = (
    clean_sentences(frame) for frame in (train_data, test_data)
)
print(len(train_sentences), len(test_sentences), sep="\n")

100%|██████████| 35833/35833 [00:32<00:00, 1091.79it/s]
100%|██████████| 17650/17650 [00:15<00:00, 1128.92it/s]

35833
17650





In [11]:
# One-hot encode the review scores; the width of the encoded matrix is the class count.
target = train_data["Score"].values
y_target = to_categorical(target)
num_classes = y_target.shape[-1]

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training reviews for validation, stratified on the
# one-hot labels. random_state is fixed so the split, and everything derived
# from it in later cells, is identical on every Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
    random_state=42,
)

In [23]:
#  X_train = [str (item) for item in X_train]
# X_val = [str (item) for item in X_val]

In [24]:

# Scan the training token lists once, collecting the distinct tokens and
# the length of the longest review.
unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    unique_words.update(sent)
    len_max = max(len_max, len(sent))

# Number of distinct tokens, followed by the longest review length.
print(len(unique_words))
print(len_max)

100%|██████████| 28666/28666 [00:00<00:00, 131222.78it/s]

42726
1999





In [25]:
# Fit the tokenizer on the training reviews only, with num_words set to the
# number of distinct training tokens.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(list(X_train))

# For each split: map tokens to integer ids (texts_to_sequences returns one
# sequence per input text), then pad to len_max so every review has the same
# length, as the LSTM layers require. pad_sequences is called with its default
# padding mode, which adds the zeros at the front of shorter reviews.
X_train, X_val, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=len_max)
    for split in (X_train, X_val, test_sentences)
)

print(X_train.shape, X_val.shape, X_test.shape)

(28666, 1999) (7167, 1999) (17650, 1999)


In [26]:
# Stop training when validation accuracy has not improved by at least 0.001
# for two consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=2,
)
callback = [early_stopping]

In [29]:

# Embedding -> two stacked LSTMs -> dense classifier over num_classes scores.
# NOTE(review): the Embedding layer leaves mask_zero at its default, so the
# padded timesteps are fed through the LSTMs too; consider mask_zero=True.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(unique_words), 16, input_length=len_max),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True),
    tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.005),
    metrics=['accuracy'],
)
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1999, 16)          683616    
_________________________________________________________________
unified_lstm_2 (UnifiedLSTM) (None, 1999, 128)         74240     
_________________________________________________________________
unified_lstm_3 (UnifiedLSTM) (None, 64)                49408     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 505       
Total params: 814,269
Trainable params: 814,269
Non-trainable params: 0
________________________________________________

In [None]:
# Train for at most 6 epochs, validating on the held-out split; the callbacks
# in `callback` are invoked during training.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    callbacks=callback,
    validation_data=(X_val, y_val),
)

Train on 28666 samples, validate on 7167 samples
Epoch 1/6


In [None]:


import matplotlib.pyplot as plt

# Per-epoch losses recorded by model.fit.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers start at 1 for the x-axis.
epoch_count = range(1, len(train_loss) + 1)

# Learning curve: training loss (red dashed) against validation loss (blue solid).
# Author's note: the curve is not ideal; it should decrease more smoothly.
# Tuning hyperparameters, especially the learning rate, may improve both
# accuracy and the shape of the curve.
plt.plot(epoch_count, train_loss, 'r--', label='Training Loss')
plt.plot(epoch_count, val_loss, 'b-', label='Validation Loss')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

