In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
%matplotlib inline

In [3]:
sentiment_data = pd.read_csv('againdata.csv')
sentiment_data.columns =['Class', 'Data']

In [4]:
sentiment_data.head()

Unnamed: 0,Class,Data
0,0,খুব খারাপ লাগলো
1,0,বর্তমান পৃথিবীর সব থেকে বড় বর্বর জাতি মিয়ানমার
2,0,ভালো মানুদের এভাবে মরতে হয়না
3,0,মন টা খারাপ হয়ে গেলো
4,0,আমাৱ মতে এখন ওদেৱ উপৱ হামলা কৱা হুক


In [5]:
from sklearn.utils import shuffle
sentiment_data = shuffle(sentiment_data)

In [6]:
sentiment_data.head()

Unnamed: 0,Class,Data
1118,0,বাংলাদেশের সাংবাদিকেরা ভারতের দালালভারতের ষড়যন...
824,0,ঈদের যে কি খুশি তোরা বুঝতি যদি ধর্ম ভীরু হতি ...
8050,1,"কি আর বলব,কোন ভাষা খুজে পাচ্ছি না। তবে দোয়া কর..."
6532,1,সত্যিই অসাধারন.﻿
2903,0,অতিরিক্ত আবেদন আর আম্পায়ারর সিদ্ধান্তের বিরোধি...


In [7]:
labels = sentiment_data.iloc[:, 0].values
reviews = sentiment_data.iloc[:, 1].values

In [8]:
print(labels)

[0 0 1 ..., 0 1 0]


In [10]:
reviews_processed = []
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)

In [11]:
word_reviews = []

all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())


    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

In [12]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

In [13]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])

In [14]:
reviews_lens = Counter([len(x) for x in reviews_to_ints])
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens)))

Zero-length 0
Max review length 447


In [15]:
sentiment_data.isnull().values.any()

False

In [16]:

seq_len = 150

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]

In [17]:
X_train = features[:5400]
y_train = labels[:5400]

X_test = features[5400:]
y_test = labels[5400:]



print('X_trian shape {}'.format(X_train.shape))


X_trian shape (5400, 150)


In [20]:
hidden_layer_size = 200 # how many nodes LSTM cells will have
number_of_layers = 2 # how many RNN layers the network will use
batch_size = 100 # how many reviews we feed at onces
learning_rate = 0.001 # learning rate
number_of_words = len(vocab_to_int) + 1 #how many unique words do we have in vocab (+1  is used for 0 - padding)
dropout_rate = 0.8 
embed_size = 300 #how long our word embedings will be
epochs = 10 # how many epochs do we use for training

tf.reset_default_graph() #Clean the graph

In [21]:

inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')

In [22]:
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)


hidden_layer1 = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer1 = tf.contrib.rnn.DropoutWrapper(hidden_layer1, dropout_rate)

hidden_layer2 = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer2 = tf.contrib.rnn.DropoutWrapper(hidden_layer2, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
init_state = cell.zero_state(batch_size, tf.float32)

outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)

prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
cost = tf.losses.mean_squared_error(targets, prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

session = tf.Session()

#writer = tf.train.SummaryWriter("logs/", sess.graph)

session.run(tf.global_variables_initializer())

NameError: name 'hidden_layer' is not defined

In [106]:
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 0/10  | Current loss: 0.2015477418899536  | Training accuracy: 68.2857
Epoch: 1/10  | Current loss: 0.13367517292499542  | Training accuracy: 81.1857
Epoch: 2/10  | Current loss: 0.09314392507076263  | Training accuracy: 87.7143
Epoch: 3/10  | Current loss: 0.06746184080839157  | Training accuracy: 91.1857
Epoch: 4/10  | Current loss: 0.05973125994205475  | Training accuracy: 92.7286
Epoch: 5/10  | Current loss: 0.035663556307554245  | Training accuracy: 95.9286
Epoch: 6/10  | Current loss: 0.02475590445101261  | Training accuracy: 97.0571
Epoch: 7/10  | Current loss: 0.017523370683193207  | Training accuracy: 98.1429
Epoch: 8/10  | Current loss: 0.012213785201311111  | Training accuracy: 98.6571
Epoch: 9/10  | Current loss: 0.010493193753063679  | Training accuracy: 98.9000


In [96]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size

In [97]:
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))

Test accuracy is 77.9048%
