In [1]:
import pandas as pd 
from collections import Counter
import string
import scipy as np
from tqdm import tqdm
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from random import randint
from datetime import datetime
import re
import time
import warnings 
warnings.filterwarnings('ignore')

import tensorflow as tf

Using TensorFlow backend.


In [2]:
data = pd.read_csv('./Reviews.csv').dropna()
print(data.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [3]:
data = data[['Text', 'Summary', 'Score']]
data.drop_duplicates(inplace=True)
data = data[data['Score'] != 3] 
data['Score'] = [1 if item>3 else 0 for item in data['Score'].values]

In [4]:
data['Score'].unique()

array([1, 0])

In [5]:
def preprocess(x):
    # x = BeautifulSoup(x, 'lxml').get_text()
    x = re.sub('<[^>]*>', '', x)
    for punc in string.punctuation:
        if punc != "\'":
            x = x.replace(punc, f' {punc} ')
    return ' '.join(x.split()).lower()

data['Text'] = [preprocess(item) for item in data['Text'].values]
data['Summary'] = [preprocess(item) for item in data['Summary'].values]

In [6]:
X_data = [i+' '+j for i,j in zip(list(data['Summary'].values), list(data['Text'].values))]
Y_data = list(data['Score'].values)

In [7]:
corpus = dict(Counter(' '.join(X_data).split()))
print('Number of unique tokens:', len(corpus))

Number of unique tokens: 146896


In [8]:
min_word_count = np.percentile(list(corpus.values()), 90)
print('Minimum frequency of words:', min_word_count)

Minimum frequency of words: 34.0


In [9]:
words = list(corpus.keys())
for w in words:
    if corpus[w] < min_word_count:
        del corpus[w]

print('Number of unique tokens after deleting less frequent tokens:', len(corpus))

Number of unique tokens after deleting less frequent tokens: 14702


In [10]:
seq_len = [len(item.split()) for item in X_data]

suitable_seq_len = int(np.percentile(seq_len, 90))
print('Suitable sequence length:', suitable_seq_len)

Suitable sequence length: 186


In [11]:
# Creating the word ids
word_ids = {
    item: index+2 for index, item in enumerate(corpus.keys())
}

In [12]:
X_data_int = []; Y_data_new = []
for item, y in zip(X_data, Y_data):
    temp = [word_ids.get(word, 1) for word in item.split()]
    if temp:
        X_data_int.append(temp)
        Y_data_new.append(y)

In [13]:
X_data_int = sequence.pad_sequences(X_data_int, maxlen=suitable_seq_len)
print('Sample X_data with word ids:', X_data_int[0])
print('Sample X_data with proper words:', X_data[0])

Sample X_data with word ids: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  2  3  4  5  6  7  8  9 10 11 12 13  4
  5 14 15  7 16 17 18 19 20 10  2  3 21 11 22 23 24 25 26 27 28 26 29 30
 15 31 32 33 21 34 35 36 37 15 38 39 40 22 33 28 41 21]
Sample X_data with proper words: good quality dog food i have bought several of the vitality canned dog food products and have found them all to be of good quality . the product looks more like a stew than a processed meat and it smells better . my labrador is finicky and she appreciates this product better than most .


In [14]:
def one_hot_maker(x):
    with tf.Session() as sess:
         return sess.run(tf.one_hot(x, depth=len(np.unique(x))))
    
Y_data_new = one_hot_maker(Y_data_new)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_data_int, Y_data_new, test_size=0.027, random_state=101)
print(len(X_train), len(X_test))

355290 9860


## CONSTRUCTION AND TRAINING

In [16]:
# defining the configuration

config = {
    'rnn_size': 5,
    'rnn_layer': 2,
    'sequence_length': suitable_seq_len,
    'word_embedding_size': 300,
    'vocab_size': len(corpus)+2,
    'learning_rate': 3e-4,
    'batch_size': 1000,
    'epoch': 250,
    'num_classes': len(y_train[0]),
    'dropout_lstm': .5,
    'dropout_dense': .5,
    'dense_unit_size': 20,
}

data_x = tf.placeholder(name='data_x', dtype=tf.int64, shape=[None, config['sequence_length']])
target = tf.placeholder(name='target', dtype=tf.float32, shape=[None, config['num_classes']])

In [17]:
config

{'rnn_size': 2,
 'rnn_layer': 2,
 'sequence_length': 186,
 'word_embedding_size': 300,
 'vocab_size': 14704,
 'learning_rate': 0.0003,
 'batch_size': 1000,
 'epoch': 250,
 'num_classes': 2,
 'dropout_lstm': 0.5,
 'dropout_dense': 0.5,
 'dense_unit_size': 20}

In [18]:
def get_embedding():
    with tf.variable_scope('get_embedding', reuse=tf.AUTO_REUSE):
        word_embedding = tf.get_variable('word_embedding', [config['vocab_size'], config['word_embedding_size']])
        return word_embedding

In [19]:
def get_lstm_cell(): 
    lstm_single_layer = tf.contrib.rnn.BasicLSTMCell(config['rnn_size'], state_is_tuple=True)
    dropout = tf.contrib.rnn.DropoutWrapper(lstm_single_layer, output_keep_prob=config['dropout_lstm'])
    return dropout

In [20]:
def create_rnn():
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell() for _ in range(config['rnn_layer'])], state_is_tuple=True)
    return cell

In [21]:
# defining model 

def model():
    word_embedding = get_embedding()
    embedded_words = tf.nn.embedding_lookup(word_embedding, data_x)
    CELL = create_rnn()
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        OUTPUT, FINAL_STATE = tf.nn.dynamic_rnn(CELL, embedded_words, dtype=tf.float32)
    OUTPUT = tf.transpose(OUTPUT, [1,0,2])
    OUTPUT = tf.gather(OUTPUT, OUTPUT.get_shape()[0]-1)
#     weights = tf.truncated_normal_initializer(mean=0., stddev=.1)
    weights = tf.keras.initializers.he_normal(seed=None)
    biases = tf.zeros_initializer()
    dense = tf.contrib.layers.fully_connected(OUTPUT, num_outputs=config['dense_unit_size'],
                                             activation_fn=tf.nn.leaky_relu, weights_initializer=weights, 
                                              biases_initializer=biases)
    dense = tf.contrib.layers.dropout(dense, keep_prob=config['dropout_dense'])
    predictions = tf.contrib.layers.fully_connected(dense, num_outputs=config['num_classes'],
                                                   activation_fn=tf.sigmoid, 
                                                    weights_initializer=tf.truncated_normal_initializer(mean=0., stddev=.1), 
                                                  biases_initializer=biases)
    return predictions

In [22]:
pred = model()
print(pred)
print(pred.get_shape())

Tensor("fully_connected_1/Sigmoid:0", shape=(?, 2), dtype=float32)
(?, 2)


In [23]:
with tf.variable_scope('setting_training', reuse=tf.AUTO_REUSE):
    y_hats = model()
    
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=y_hats))
    
    optimizer = tf.train.AdamOptimizer(learning_rate=config['learning_rate'])
    
    train = optimizer.minimize(cost)
    
    mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(y_hats, 1))
    error = tf.reduce_mean(tf.cast(mistakes, tf.float32))

In [24]:
sess = tf.Session()

sess.run(tf.global_variables_initializer())
no_of_batches = int(len(X_train)/config['batch_size'])
avg_tr_loss_per_epoch = []
avg_ts_loss = []
test_error = []
train_error = []
for i in range(config['epoch']):
    print('epoch:',i+1)
    time.sleep(1)
    ptr = 0
    cc = 0
    start_time = datetime.now()
    for j in tqdm(range(no_of_batches), total=no_of_batches):
        inp = X_train[j*config['batch_size']:(j+1)*config['batch_size']]
        out = y_train[j*config['batch_size']:(j+1)*config['batch_size']]
        _, c = sess.run([train, cost], {data_x: inp, target: out})
        cc += c
    avg_tr_loss_per_epoch.append(cc/no_of_batches)
    batch_number = randint(0, no_of_batches)
#     train_error.append(
#         sess.run(error, {
#             data_x: X_train[batch_number*config['batch_size'] : (batch_number+1)*config['batch_size']],
#             target: y_train[batch_number*config['batch_size'] : (batch_number+1)*config['batch_size']]
#         })
#     )
#     test_error.append(sess.run(error, {data_x: X_test, target: y_test}))
    avg_ts_loss.append(
        sess.run(cost, {data_x: X_test, target: y_test})
    )
    end_time = datetime.now()
    print('Train error:',avg_tr_loss_per_epoch[-1])
    print('Test error:',avg_ts_loss[-1])
    print('Time Taken:',str(end_time-start_time))

epoch: 1


  2%|▏         | 7/355 [00:18<15:00,  2.59s/it]

KeyboardInterrupt: 

In [None]:
import pickle
with open('stats.pkl', 'wb') as f:
    pickle.dump((avg_tr_loss_per_epoch, avg_ts_loss, config), f)

In [None]:
import pickle
with open('stats.pkl', 'rb') as f:
    avg_tr_loss, avg_ts_loss, config = pickle.load(f)

In [None]:
import matplotlib.pyplot as plt

def plt_dynamic(x, vy, ty, size, colors=['b']):
    plt.figure(figsize=size)
    plt.plot(x, vy, 'b', label="Validation Loss")
    plt.plot(x, ty, 'r', label="Train Loss")
    plt.legend()
    plt.grid()
    fig.canvas.draw()
    plt.savefig('plot.png')

In [None]:
plt_dynamic(list(range(250)), avg_ts_loss, avg_tr_loss, (10,7))
print(config)