In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
%matplotlib inline

In [65]:
sentiment_data = pd.read_csv('againdata.csv')
sentiment_data.columns =['Class', 'Data']

In [66]:
sentiment_data.head()

Unnamed: 0,Class,Data
0,0,খুব খারাপ লাগলো
1,0,বর্তমান পৃথিবীর সব থেকে বড় বর্বর জাতি মিয়ানমার
2,0,ভালো মানুদের এভাবে মরতে হয়না
3,0,মন টা খারাপ হয়ে গেলো
4,0,আমাৱ মতে এখন ওদেৱ উপৱ হামলা কৱা হুক


In [67]:
from sklearn.utils import shuffle
sentiment_data = shuffle(sentiment_data)

In [68]:
sentiment_data.head()

Unnamed: 0,Class,Data
2520,0,না না এটা ভুয়া খবর
1842,0,শুরুতে ঝামেলা হলে শেষ পর্যন্ত তো তা এড়ানো মুশকিল
7577,1,শুভকামনা
5413,1,সত্যিকারের গনমানুষের নেতা এর্দোগান।
6718,1,অভিজ্ঞতা ই জ্ঞান এর উৎস! প্রত্যেক মূহুর্ত ই শি...


In [69]:
labels = sentiment_data.iloc[:, 0].values
reviews = sentiment_data.iloc[:, 1].values

In [70]:
reviews_processed = []
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)

In [71]:
word_reviews = []

all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())


    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

In [72]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

In [73]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])

In [74]:
reviews_lens = Counter([len(x) for x in reviews_to_ints])
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens)))

Zero-length 0
Max review length 447


In [75]:
sentiment_data.isnull().values.any()

False

In [76]:

seq_len = 250

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]

In [77]:
X_train = features[:6400]
y_train = labels[:6400]

X_test = features[6400:]
y_test = labels[6400:]



print('X_trian shape {}'.format(X_train.shape))


X_trian shape (6400, 250)


In [78]:
hidden_layer_size = 512 # how many nodes LSTM cells will have
number_of_layers = 1 # how many RNN layers the network will use
batch_size = 100 # how many reviews we feed at onces
learning_rate = 0.001 # learning rate
number_of_words = len(vocab_to_int) + 1 #how many unique words do we have in vocab (+1  is used for 0 - padding)
dropout_rate = 0.8 
embed_size = 300 #how long our word embedings will be
epochs = 6 # how many epochs do we use for training

In [79]:
tf.reset_default_graph() #Clean the graph

In [80]:
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')

In [81]:
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)

In [82]:
hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
init_state = cell.zero_state(batch_size, tf.float32)


In [83]:
outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)

In [84]:
prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
cost = tf.losses.mean_squared_error(targets, prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [85]:
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

In [86]:
session = tf.Session()

In [87]:
session.run(tf.global_variables_initializer())

In [88]:
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 0/6  | Current loss: 0.2080979198217392  | Training accuracy: 66.9375
Epoch: 1/6  | Current loss: 0.1417398452758789  | Training accuracy: 79.9844
Epoch: 2/6  | Current loss: 0.10530947148799896  | Training accuracy: 86.5781
Epoch: 3/6  | Current loss: 0.07047310471534729  | Training accuracy: 91.0937
Epoch: 4/6  | Current loss: 0.049688003957271576  | Training accuracy: 94.0156
Epoch: 5/6  | Current loss: 0.03608681261539459  | Training accuracy: 95.7656


In [89]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size

In [90]:
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))

Test accuracy is 78.9524%
