In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
%matplotlib inline

In [None]:
# Load the labelled corpus (tab-separated, latin-1 encoded) and give its two
# columns readable names.
# NOTE(review): read_csv's default header handling consumes the first line of
# the file as a header row -- confirm the file actually has one.
sentiment_path = 'mydatasets/kaggle_sentiment1.csv'
sentiment_data = pd.read_csv(sentiment_path, encoding='latin-1', sep='\t')
sentiment_data.columns = ['Class', 'Data']

In [None]:
# Load the unlabeled reviews (tab-separated, latin-1 encoded) into a
# single-column frame named 'Data'.
# NOTE(review): read_csv's default header handling consumes the first line of
# the file as a header row -- confirm the file actually has one.
unlabeled_path = 'mydatasets/unlabeld_data.txt'
unlabeld_data = pd.read_csv(unlabeled_path, encoding='latin-1', sep='\t')
unlabeld_data.columns = ['Data']

In [None]:
# Show the labelled frame using the notebook's rich DataFrame display.
sentiment_data

In [None]:
# Show the unlabeled frame using the notebook's rich DataFrame display.
unlabeld_data

In [None]:
from sklearn.utils import shuffle

# Shuffle both frames with a fixed seed so the positional train/test split made
# later, and every number derived from it, is reproducible after a
# "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
sentiment_data = shuffle(sentiment_data, random_state=SHUFFLE_SEED)
unlabeld_data = shuffle(unlabeld_data, random_state=SHUFFLE_SEED)

In [None]:
# Peek at the first rows of the shuffled labelled frame.
sentiment_data.head()

In [None]:
# Extract the raw columns as NumPy arrays: class labels and review text for the
# labelled set, review text only for the unlabeled set.
labels = sentiment_data['Class'].values
reviews = sentiment_data['Data'].values
unlabeled_reviews = unlabeld_data['Data'].values

In [None]:
def _strip_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed."""
    return ''.join(ch for ch in text if ch not in punctuation)


# Punctuation-free copies of the labelled and unlabeled reviews, in input order.
reviews_processed = [_strip_punctuation(text) for text in reviews]
unlabeled_processed = [_strip_punctuation(text) for text in unlabeled_reviews]

In [None]:
# Lower-case and whitespace-tokenise every review.
word_reviews = [text.lower().split() for text in reviews_processed]
word_unlabeled = [text.lower().split() for text in unlabeled_processed]

# Flat token stream: all labelled reviews first, then all unlabeled ones.
all_words = [token for tokens in word_reviews + word_unlabeled for token in tokens]

# Vocabulary ordered from most to least frequent word.
counter = Counter(all_words)
vocab = [word for word, _ in counter.most_common()]

In [None]:
# Report the vocabulary size and only its most frequent entries; printing the
# whole list floods the notebook output.
print('Vocabulary size: {}'.format(len(vocab)))
print(vocab[:20])

In [None]:
# Map each word to its 1-based frequency rank; id 0 is never assigned to a word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [None]:
# Preview the first few word -> id pairs instead of dumping the whole mapping.
print(dict(list(vocab_to_int.items())[:20]))

In [None]:
# Encode every labelled review as the list of its word ids.
reviews_to_ints = [
    [vocab_to_int[token] for token in tokens]
    for tokens in word_reviews
]

In [None]:
# Preview a few encoded reviews instead of dumping the whole corpus.
print(reviews_to_ints[:3])

In [None]:
# Encode every unlabeled review as the list of its word ids.
unlabeled_to_ints = [
    list(map(vocab_to_int.__getitem__, tokens))
    for tokens in word_unlabeled
]

In [None]:
# Histogram of encoded review lengths: report how many reviews are empty and
# the longest length seen.
lengths = [len(encoded) for encoded in reviews_to_ints]
reviews_lens = Counter(lengths)
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens.keys())))

In [None]:
seq_len = 250


def _left_pad(sequences, seq_len):
    """Return a (len(sequences), seq_len) int array of left-zero-padded sequences.

    Each sequence is truncated to its first ``seq_len`` ids and right-aligned in
    its row; unused leading positions stay 0. An empty sequence yields an
    all-zero row (a slice of ``-0:`` would otherwise select the whole row and
    the assignment of an empty array would raise a broadcast error).
    """
    padded = np.zeros((len(sequences), seq_len), dtype=int)
    for row, sequence in enumerate(sequences):
        tokens = np.asarray(sequence, dtype=int)[:seq_len]
        if tokens.size:
            padded[row, -tokens.size:] = tokens
    return padded


features = _left_pad(reviews_to_ints, seq_len)
features_test = _left_pad(unlabeled_to_ints, seq_len)

In [None]:
# Inspect the padded labelled feature matrix.
print(features)

In [None]:
# Inspect the padded unlabeled feature matrix.
print(features_test)

In [None]:
# Number of (already shuffled) labelled rows used for training; the remaining
# rows form the held-out test set.
train_size = 6400

X_train, X_test = features[:train_size], features[train_size:]
y_train, y_test = labels[:train_size], labels[train_size:]

X_unlabeled = features_test

print('X_train shape {}'.format(X_train.shape))
print('y_train shape {}'.format(y_train.shape))
print('X_test shape {}'.format(X_test.shape))
print('y_test shape {}'.format(y_test.shape))
print('X_unlabeled shape {}'.format(X_unlabeled.shape))

In [None]:
# Model and training hyperparameters used by the graph-building and training cells.
hidden_layer_size = 512 # number of units in each LSTM cell
number_of_layers = 1 # number of stacked RNN layers
batch_size = 100 # number of reviews fed per step; also fixes the RNN initial-state batch dimension
learning_rate = 0.01 # learning rate passed to the Adam optimizer
number_of_words = len(vocab_to_int) + 1 # embedding rows: one per vocabulary word, +1 for id 0 (padding)
# NOTE(review): despite its name this value is passed to DropoutWrapper as its
# second positional argument, which per the TF 1.x docs is the input *keep*
# probability -- i.e. 0.8 keeps 80% of inputs. Confirm the intent.
dropout_rate = 0.8 
embed_size = 300 # length of each word-embedding vector
epochs = 6 # number of passes over the training set

In [None]:
# Start from an empty default graph so re-running the graph-building cells
# below does not accumulate duplicate ops and variables.
tf.reset_default_graph() #Clean the graph

In [None]:
# Graph inputs: integer word ids and integer class targets. Both are declared
# with two unconstrained dimensions.
inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='inputs')
targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='targets')

In [None]:
# Trainable embedding table, initialised uniformly in [-1, 1), with one row per
# word id; `embed` holds the embedding vectors looked up for the input ids.
embedding_init = tf.random_uniform(
    shape=(number_of_words, embed_size), minval=-1, maxval=1)
word_embedings = tf.Variable(embedding_init)
embed = tf.nn.embedding_lookup(params=word_embedings, ids=inputs)

In [None]:
def _make_lstm_layer():
    """Build one LSTM layer wrapped with input dropout (a fresh cell object per call)."""
    lstm = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
    return tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=dropout_rate)


# Each stacked layer needs its own cell instance: repeating one object with
# `[cell] * n` makes every layer refer to the same cell, which breaks as soon
# as number_of_layers > 1.
hidden_layers = [_make_lstm_layer() for _ in range(number_of_layers)]
cell = tf.contrib.rnn.MultiRNNCell(hidden_layers)

# The zero state is built for exactly `batch_size` rows, so every batch fed to
# the network must contain exactly `batch_size` reviews.
init_state = cell.zero_state(batch_size, tf.float32)

In [None]:
# Unroll the stacked cell over the embedded sequence, starting from the zero state.
outputs, states = tf.nn.dynamic_rnn(
    cell=cell,
    inputs=embed,
    initial_state=init_state,
)

In [None]:
# Score each review from the RNN output at the last time step: a single sigmoid
# unit, trained with mean squared error against the targets using Adam.
last_output = outputs[:, -1]
prediction = tf.layers.dense(inputs=last_output, units=1, activation=tf.sigmoid)
cost = tf.losses.mean_squared_error(labels=targets, predictions=prediction)

adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

In [None]:
# Accuracy: round the sigmoid score to an integer class, compare it with the
# targets, and average the matches.
rounded_pred = tf.cast(tf.round(prediction), tf.int32)
currect_pred = tf.equal(rounded_pred, targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

In [None]:
# Open the session used by all training, evaluation and prediction cells below.
# It is not closed anywhere in this notebook.
session = tf.Session()

In [None]:
# Initialise every graph variable before the first training step.
session.run(tf.global_variables_initializer())

In [None]:
# Train for `epochs` passes over the training set in full batches of
# `batch_size` rows (a trailing partial batch is skipped because the RNN
# initial state is built for exactly `batch_size` rows).
for i in range(epochs):
    training_accurcy = []
    epoch_loss = []
    for start in range(0, len(X_train) - batch_size + 1, batch_size):
        X_batch = X_train[start:start + batch_size]
        y_batch = y_train[start:start + batch_size].reshape(-1, 1)
        # Fetching `optimizer` is what applies the gradient step; fetching only
        # `accuracy` evaluates the batch without ever updating the weights.
        _, batch_loss, batch_acc = session.run(
            [optimizer, cost, accuracy],
            feed_dict={inputs: X_batch, targets: y_batch})
        epoch_loss.append(batch_loss)
        training_accurcy.append(batch_acc)
    print('Epoch: {}/{}'.format(i + 1, epochs),
          ' | Training loss: {:.4f}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

In [None]:
# Evaluate once on the held-out set in full batches of `batch_size` rows (a
# trailing partial batch is skipped because the RNN initial state is built for
# exactly `batch_size` rows). Repeating the pass `epochs` times adds nothing:
# no weights change between passes.
# NOTE(review): the dropout wrapper uses a constant keep probability, so
# dropout is still active during this evaluation -- consider feeding the keep
# probability through a placeholder and using 1.0 here.
test_accuracy = []
for start in range(0, len(X_test) - batch_size + 1, batch_size):
    X_batch = X_test[start:start + batch_size]
    y_batch = y_test[start:start + batch_size].reshape(-1, 1)
    batch_acc = session.run(accuracy, feed_dict={inputs: X_batch, targets: y_batch})
    test_accuracy.append(batch_acc)
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))

In [None]:
# Score the unlabeled reviews in full batches of `batch_size` rows. A trailing
# partial batch is skipped: the RNN initial state is built for exactly
# `batch_size` rows, so a smaller batch cannot be fed. `batch_size` itself is
# left untouched.
# Only `inputs` is fed -- `prediction` does not depend on `targets`.
# Each appended item is the one-element list returned by session.run([prediction]),
# i.e. predictions_unlabeled[b][0] is the score array of batch b.
predictions_unlabeled = []
for start in range(0, len(X_unlabeled) - batch_size + 1, batch_size):
    X_batch = X_unlabeled[start:start + batch_size]
    pred = session.run([prediction], feed_dict={inputs: X_batch})
    predictions_unlabeled.append(pred)

In [None]:
# This is the average sigmoid score over the unlabeled reviews, expressed as a
# percentage. It is not an accuracy: there are no labels to compare against.
print("Mean predicted score on unlabeled data is {:.4f}%".format(np.mean(predictions_unlabeled)*100))

In [None]:
# Turn each sigmoid score into a hard class: 1 when the score is at least 0.5,
# otherwise 0. Batches are flattened in order.
pred_real = [
    1 if row[0] >= 0.5 else 0
    for batch in predictions_unlabeled
    for row in batch[0]
]

In [None]:
# Write one predicted class per line as a plain integer; savetxt's default
# format would write them as floats in scientific notation.
np.savetxt('predictions.txt', pred_real, fmt='%d')

In [None]:
# Keep only the rows that received a prediction. Take an explicit copy so that
# adding columns to `new_dataframe` modifies an independent frame rather than a
# slice of `unlabeld_data` (which triggers pandas' SettingWithCopyWarning).
new_dataframe = unlabeld_data.iloc[:len(pred_real)].copy()

In [None]:
# Attach the predicted classes as a new column; pred_real must hold exactly one
# entry per row of new_dataframe.
new_dataframe['Classes'] = pred_real

In [None]:
# Show the unlabeled reviews together with their predicted classes.
new_dataframe