In [6]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter

In [2]:

sentiment_data = pd.read_csv('training.csv', sep=',')

In [11]:
sentiment_data.columns =['Class', 'Data']


In [12]:
sentiment_data.head()


Unnamed: 0,Class,Data
0,1,this was the first clive cussler i've ever rea...
1,1,i liked the Da Vinci Code a lot.
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...


In [21]:
unlabeld_data = pd.read_csv('testdata.csv', sep=',')
unlabeld_data.columns = ['Data']


In [22]:
unlabeld_data.head()


Unnamed: 0,Data
0,"harvard is dumb, i mean they really have to be..."
1,I'm loving Shanghai > > > ^ _ ^.
2,harvard is for dumb people.
3,"As i stepped out of my beautiful Toyota, i hea..."
4,"Bodies being dismembered, blown apart, and mut..."


In [23]:
from sklearn.utils import shuffle
sentiment_data = shuffle(sentiment_data)
unlabeld_data = shuffle(unlabeld_data)

In [24]:
sentiment_data.head()


Unnamed: 0,Class,Data
1481,1,Mission Impossible 3 was excellent.
705,1,I love The Da Vinci Code...
6217,0,Brokeback Mountain is fucking horrible..
3572,1,I love Brokeback Mountain.
3578,1,dudeee i LOVED brokeback mountain!!!!


In [25]:
labels = sentiment_data.iloc[:, 0].values
reviews = sentiment_data.iloc[:, 1].values
unlabeled_reviews = unlabeld_data.iloc[:,0].values

In [28]:
print(unlabeled_reviews)

['I absolutely love my MacBook Pro.'
 "And as stupid as San Francisco's road system is, we weren't able to turn back because of how all the roads are one-way streets."
 'TAKE THAT STUPID UCLA!!!!!!!..' ...
 'It was really ironic that he spent the first part of class talking about his own professor at Harvard who was a pompous arrogant ass.'
 'San Francisco was brilliant, Sausalito...'
 "We talked about Purdue's awful music / culture scene, the Flaming Lips ripping off Cat Stevens, Lollapalooza ( he was there too ), stuff like that."]


In [29]:
reviews_processed = []
unlabeled_processed = [] 
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)
    
for review in unlabeled_reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    unlabeled_processed.append(review_cool_one)

In [30]:
word_reviews = []
word_unlabeled = []
all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())

for review in unlabeled_processed:
    word_unlabeled.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())
    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

In [31]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}


In [32]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])

In [33]:
unlabeled_to_ints = []

for review in word_unlabeled:
    unlabeled_to_ints.append([vocab_to_int[word] for word in review])

In [34]:
reviews_lens = Counter([len(x) for x in reviews_to_ints])
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens)))

Zero-length 0
Max review length 931


In [35]:
seq_len = 250

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]
    
features_test = np.zeros((len(unlabeled_to_ints), seq_len), dtype=int)
for i, review in enumerate(unlabeled_to_ints):
    features_test[i, -len(review):] = np.array(review)[:seq_len]

In [36]:
X_train = features[:6400]
y_train = labels[:6400]

X_test = features[6400:]
y_test = labels[6400:]

X_unlabeled = features_test

print('X_trian shape {}'.format(X_train.shape))
print('X_unlabeled shape {}'.format(X_unlabeled.shape))

X_trian shape (6400, 250)
X_unlabeled shape (28936, 250)


In [40]:
print(X_train)

[[  0   0   0 ... 141  14 254]
 [  0   0   0 ...  40  41  45]
 [  0   0   0 ...   5 153 237]
 ...
 [  0   0   0 ...   1  49 223]
 [  0   0   0 ...  22   6 433]
 [  0   0   0 ...  44  14  31]]


In [89]:
hidden_layer_size = 512 # how many nodes LSTM cells will have
number_of_layers = 1 # how many RNN layers the network will use
batch_size = 100 # how many reviews we feed at onces
learning_rate = 0.002 # learning rate
number_of_words = len(vocab_to_int) + 1 #how many unique words do we have in vocab (+1  is used for 0 - padding)
dropout_rate = 0.8 
embed_size = 200 #how long our word embedings will be
epochs = 4 # how many epochs do we use for training

In [90]:
tf.reset_default_graph() #Clean the graph


In [91]:
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')

In [92]:
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)

In [93]:
hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
init_state = cell.zero_state(batch_size, tf.float32)

In [94]:
outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)


In [95]:
prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
cost = tf.losses.mean_squared_error(targets, prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [96]:
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

In [97]:
session = tf.Session()


In [98]:
session.run(tf.global_variables_initializer())


In [99]:
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 0/4  | Current loss: 0.048528365790843964  | Training accuracy: 93.2188
Epoch: 1/4  | Current loss: 0.009912846609950066  | Training accuracy: 98.6719
Epoch: 2/4  | Current loss: 0.003241236787289381  | Training accuracy: 99.6250
Epoch: 3/4  | Current loss: 0.0035140272229909897  | Training accuracy: 99.5312


In [100]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size

In [101]:
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))


Test accuracy is 98.6000%


In [102]:
predictions_unlabeled = []
ii = 0
while ii + batch_size <= len(X_unlabeled):
    if ii + batch_size > len(X_unlabeled):
        batch_size = len(X_unlabeled) - ii
    X_batch = X_unlabeled[ii:ii+batch_size]
    y_batch = X_unlabeled[ii:ii+batch_size].reshape(-1, 1)

    pred = session.run([prediction], feed_dict={inputs:X_batch, targets:y_batch})
    
    predictions_unlabeled.append(pred)
    ii += batch_size

In [103]:
pred_real = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [104]:
np.savetxt('predictions.txt', pred_real)


In [105]:
new_dataframe = unlabeld_data[:len(pred_real)]


In [106]:
new_dataframe['Classes'] = pred_real


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [107]:
new_dataframe


Unnamed: 0,Data,Classes
16955,I absolutely love my MacBook Pro.,1
17408,And as stupid as San Francisco's road system i...,0
16562,TAKE THAT STUPID UCLA!!!!!!!..,0
8628,The piccs liked our dances = ) I ’ m a little ...,1
27905,we need at least ONE beautiful girl at ucla.,1
6941,and honda elements are assholes...,0
14831,I hate San Francisco for that.,0
17083,I loved the Lakers episode where he told that ...,1
1045,I love Tom Cruise!,1
25674,"damn, seattle sucks.",0
