In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
import re
import string
from nltk.corpus import stopwords
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# data source: (https://www.kaggle.com/c/si650winter11)
train_data = pd.read_csv('/home/ramscrux7757/SPARK/SENT_ANALS/UMICH/Train.txt', sep='\t')

In [3]:
train_data.head()

Unnamed: 0,1,The Da Vinci Code book is just awesome.
0,1,this was the first clive cussler i've ever rea...
1,1,i liked the Da Vinci Code a lot.
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...


In [4]:
train_data.columns = ['Class','Review']
train_data.head()

Unnamed: 0,Class,Review
0,1,this was the first clive cussler i've ever rea...
1,1,i liked the Da Vinci Code a lot.
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...


In [5]:
test_data = pd.read_csv('/home/ramscrux7757/SPARK/SENT_ANALS/UMICH/Test.txt', sep='\t')
test_data.columns = ['Review']
test_data.head()

Unnamed: 0,Review
0,"harvard is dumb, i mean they really have to be..."
1,I'm loving Shanghai > > > ^ _ ^.
2,harvard is for dumb people.
3,"As i stepped out of my beautiful Toyota, i hea..."
4,"Bodies being dismembered, blown apart, and mut..."


In [6]:
# Shuffling the data as currently its being arranged by class
from sklearn.utils import shuffle
train_data = shuffle(train_data)
test_data = shuffle(test_data)

In [7]:
# separating the lables and Review data
train_labels = train_data['Class'].values
train_rev_data = train_data['Review'].values
test_rev_data = test_data['Review'].values

In [8]:
# concatenating train and test reviews to generate Vocabulary
print(train_data['Review'].shape)
print(test_data['Review'].shape)
train_rev = train_data['Review']
test_rev = test_data['Review']
train_test_vocab = pd.concat([train_rev, test_rev], axis=0)
print(train_test_vocab.shape)

(6917,)
(28936,)
(35853,)


# pre-process the data (could be cleaned in a better way, but the following is just for the demo)

In [9]:
# The following creates the word_to_index mappings
def tokenizer(data_doc):
    t = Tokenizer()
    t.fit_on_texts(data_doc)
    # creates the word_to_index dictionary 
    #stop_words = set(stopwords.words('english'))
    #t.word_index = {k:v for k, v in t.word_index.items() if k not in stop_words}
    #t.word_index = {k:v for k, v in t.word_index.items() if len(k) > 2}
    #print(t.word_index)

    vocab_size = len(t.word_index) + 1
    print('vocab_size: {}'.format(vocab_size))
    #return vocab_size

    # integer encode the documents
    # creates the integer vectors based on the words in the sentences
    encoded_docs = t.texts_to_sequences(data_doc)
    return vocab_size, encoded_docs
    #print(encoded_docs)   

In [10]:
vocab_size, train_test_enc_docs = tokenizer(train_test_vocab)
train_enc_docs = train_test_enc_docs[:6917]
test_enc_docs = train_test_enc_docs[6917:]
print(len(train_enc_docs))
print(len(test_enc_docs))

vocab_size: 10722
6917
28936


In [11]:
# identifying the max length of the sequences followed by the padding of the sequences
def seq_max_length(enc_doc, train_test_enc_docs):
    max_length = max([len(doc) for doc in train_test_enc_docs])
    #print('train_max_length: {}'.format(train_max_length))
    padded_docs = pad_sequences(enc_doc, maxlen=max_length, padding = 'post')
    print(padded_docs.shape)
    return padded_docs

In [12]:
train_padded_docs = seq_max_length(train_enc_docs, train_test_enc_docs)
test_padded_docs = seq_max_length(test_enc_docs, train_test_enc_docs)

(6917, 1891)
(28936, 1891)


In [13]:
# train-test splittings
X_train = train_padded_docs[:6400]
Y_train = train_labels[:6400]
X_test = train_padded_docs[6400:]
Y_test = train_labels[6400:]

print(X_train.shape)

(6400, 1891)


In [14]:
# subsampling the data for simplicity
X_train = X_train[:1000]
Y_train = Y_train[:1000]
X_test = X_test[:400]
Y_test = Y_test[:400]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(1000, 1891)
(1000,)
(400, 1891)
(400,)


In [15]:
# Defining RNN
hidden_layer_size = 10 # 512 # how many nodes LSTM cells will have
number_of_layers = 1 # how many RNN layers the network will use
batch_size = 100 #100 # how many reviews we feed at onces
learning_rate = 0.001 # learning rate
number_of_words = vocab_size + 1
#number_of_words = len(vocab_to_int) + 1 #how many unique words do we have in vocab (+1  is used for 0 - padding)
dropout_rate = 0.8 
embed_size = 300 #how long our word embedings will be
epochs = 2 # how many epochs do we use for training

In [16]:
# clean the existing graph if any
tf.reset_default_graph()

In [17]:
# Define the place holders
inputs = tf.placeholder(tf.int32, [None, None], name='inputs') # batch-size/n_inputs and time_steps/n_steps
targets = tf.placeholder(tf.int32, [None, None], name='targets') # batch_size and num_classes

In [18]:
# Define embedding layer
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1)) # number_of_words = vocab_size + 1
embed = tf.nn.embedding_lookup(word_embedings, inputs)
# in Keras
# model.add(Embedding(vocab_size2, 100, weights=[embedding_matrix], input_length=4, trainable=False))

In [19]:
# Define the network (hidden layer and Dynamic RNN)

def LSTM_dynamic_RNN(number_of_layers, hidden_layer_size, dropout_rate, batch_size, embed):
    
    hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
    hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

    cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
    init_state = cell.zero_state(batch_size, tf.float32)

    # feeding the basic LSTM cell into the dynamic RNN
    outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state = init_state)
    # here the 'cell' could also be the 'hidden_layer' or simply the 'BasicRNNCell'
    # initial_state could be the 'inputs'
    # for static, replace 'tf.nn.dynamic_rnn' with 'tf.contrib.rnn.static_rnn'

    # Get the prediction for each row / sequence classifier
    prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
    return prediction
    #return outputs

In [20]:
prediction = LSTM_dynamic_RNN(number_of_layers, hidden_layer_size, dropout_rate, batch_size, embed)

In [21]:
# Define Loss and Optimizer
cost = tf.losses.mean_squared_error(targets, prediction)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [22]:
# Define the accuracy
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

In [23]:
# Initializing the variables
init = tf.global_variables_initializer()

In [24]:
# Launch the graph (training, validation and prediction)
# here, training, validation and predictions are put within one cell which is inconvinient.
# instead, put
# session = tf.Session()
# session.run(tf.global_variables_initializer())
# in separate cells and separate the training, validation and predictions 
# precisely, we are not using 'with...'

display_step = 1

with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(epochs):
        training_acc = []
        training_cost = []
        #avg_cost = 0.
        total_batch = int(len(X_train)/batch_size)

        X_batches = np.array_split(X_train, total_batch)
        Y_batches = np.array_split(Y_train, total_batch)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = X_batches[i], Y_batches[i].reshape(-1,1)
            # Run optimization op (backprop) and cost op (to get loss value)
            a, c, _ = sess.run([accuracy, cost, optimizer], feed_dict={inputs: batch_x, targets: batch_y})
        
            #----------------------------------------------------------
            # in Keras
            # model.fit(x_train,y_train,epochs=5, batch_size=32)
            #-----------------------------------------------------------
            # Compute average loss
            training_acc.append(a)
            #training_cost.append(c)
            
        # Display logs per epoch step
        if epoch % display_step == 0:
            
           print('Epoch: {}/{}'.format(epoch+1, epochs), ' | Current loss: {}'.format(np.mean(training_cost)),
            ' | Training accuracy: {:.4f}'.format(np.mean(training_acc)*100))

    print("Optimization Finished!")
    print
    #------------------------------------------------------------
    print('Validation Step')
    
    # validation / Test set

    val_acc = []

    val_batch = int(len(X_test)/batch_size)
    X_val_batches = np.array_split(X_test, val_batch)
    Y_val_batches = np.array_split(Y_test, val_batch)
        # Loop over all batches
    for i in range(val_batch):
        batch_x_val, batch_y_val = X_val_batches[i], Y_val_batches[i].reshape(-1,1)
    # Run optimization op (backprop) and cost op (to get loss value)
    # see no 'optimizer' here
        a = sess.run([accuracy], feed_dict={inputs: batch_x_val, targets: batch_y_val})

            #----------------------------------------------------------
            # in Keras
            # model.fit(x_train,y_train,epochs=5, batch_size=32)
            #-----------------------------------------------------------
            # Compute average loss
        val_acc.append(a)
    
    print("Test accuracy is {:.4f}%".format(np.mean(val_acc)*100))
    
    # predictions
    
    # Testing on the unlabelled data
    predictions_unlabeled = []
    ii = 0
    while ii + batch_size <= len(X_unlabeled):
    if ii + batch_size > len(X_unlabeled):
        batch_size = len(X_unlabeled) - ii
    X_batch = X_unlabeled[ii:ii+batch_size]

    pred = session.run([prediction], feed_dict={inputs:X_batch})
    
    predictions_unlabeled.append(pred)
    ii += batch_size
    
    print('Predictions are complete !!!')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


('Epoch: 1/2', ' | Current loss: nan', ' | Training accuracy: 57.1000')
('Epoch: 2/2', ' | Current loss: nan', ' | Training accuracy: 57.5000')
Optimization Finished!

Validation Step
Test accuracy is 55.5000%
Predictions are complete !!!


In [26]:
pred_real = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [31]:
pred_real[0:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [32]:
np.savetxt('predictions.txt', pred_real)
new_dataframe = test_data[:len(pred_real)]
new_dataframe['Classes'] = pred_real
new_dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Review,Classes
27512,Quality cheese and uber-sexy Angelina Jolie...,1
23152,Paris Hilton and Sexy Cop.......,1
26437,And as stupid as San Francisco's road system i...,1
14391,You are a fucking bitch and I think I may hate...,1
4722,ps i LOVE toyota and yeh you need a HIS AND HE...,1
9448,"in simple words, in seattle Bank of America is...",1
18451,stupid lakers should have beat suns.....,1
20140,"I like being at Purdue, not with my family.",1
7672,"On the upside, apparently Toyota Siennas are g...",1
11229,I need some of that geico balboa stuff..,1
