The dataset used in this nootebook can be found [here](https://inclass.kaggle.com/c/si650winter11)

In [3]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
sentiment_data = pd.read_csv('training.txt', sep='\t')
sentiment_data.columns =['Class', 'Data']

In [5]:
unlabeld_data = pd.read_csv('testdata.txt', sep='\t')
unlabeld_data.columns = ['Data']

### Step 1. Preprocessing pipeline

In [6]:
sentiment_data.head()

Unnamed: 0,Class,Data
0,1,this was the first clive cussler i've ever rea...
1,1,i liked the Da Vinci Code a lot.
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...


In [7]:
unlabeld_data.head()

Unnamed: 0,Data
0,"harvard is dumb, i mean they really have to be..."
1,I'm loving Shanghai > > > ^ _ ^.
2,harvard is for dumb people.
3,"As i stepped out of my beautiful Toyota, i hea..."
4,"Bodies being dismembered, blown apart, and mut..."


#### Step 1.1 Shuffle dataframe

The dataset is well sorted. First, we have half of data samples that are positive and then half of them negative. If we separate the dataset to training and testing parts like this, we will have most of the data (if not all) from one class. To prevent that from happening, we will shuffle the dataset first.

In [10]:
from sklearn.utils import shuffle
sentiment_data = shuffle(sentiment_data)
unlabeld_data = shuffle(unlabeld_data)

In [11]:
sentiment_data.head()

Unnamed: 0,Class,Data
3762,1,I love Brokeback Mountain.
775,1,I love The Da Vinci Code...
5595,0,"I hate Harry Potter, that daniel wotshisface n..."
3217,1,man i loved brokeback mountain!
1136,1,"Mission Impossible III, and it was awesome!"


#### Step 1.2 Split to labels and reviews

In this step we need to create separated variables that will hold labels (positive or negative) and reviews.

In [12]:
labels = sentiment_data.iloc[:, 0].values
reviews = sentiment_data.iloc[:, 1].values
unlabeled_reviews = unlabeld_data.iloc[:,0].values

#### Step 1.3 Clean data from punctuation

The punctuation won't effect our prediction so we will delete all punctuation from reviews.

In [13]:
reviews_processed = []
unlabeled_processed = [] 
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    reviews_processed.append(review_cool_one)
    
for review in unlabeled_reviews:
    review_cool_one = ''.join([char for char in review if char not in punctuation])
    unlabeled_processed.append(review_cool_one)

#### Step 1.4 Creating vocabulary, coverting all characters to lower case and spliting each review into words

In this step we are creating vocabulary which will be created by using function Counter. Also in this step we will lower all characters in the dataset, we can do this as well because lower/upper case character won't affect prediction results. Lastly, we will split each review to separate words.

In [14]:
word_reviews = []
word_unlabeled = []
all_words = []
for review in reviews_processed:
    word_reviews.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())

for review in unlabeled_processed:
    word_unlabeled.append(review.lower().split())
    for word in review.split():
        all_words.append(word.lower())
    
counter = Counter(all_words)
vocab = sorted(counter, key=counter.get, reverse=True)

#### Step 1.5 Creating vocab_to_int dictionary which will map word with a number

In [15]:
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}

#### Step 1.6 Using vocab_to_int to transform each review to vector of numbers

In [16]:
reviews_to_ints = []
for review in word_reviews:
    reviews_to_ints.append([vocab_to_int[word] for word in review])

In [17]:
unlabeled_to_ints = []

for review in word_unlabeled:
    unlabeled_to_ints.append([vocab_to_int[word] for word in review])

#### Step 1.7 Check if we have some 0 length reviews.

In [18]:
reviews_lens = Counter([len(x) for x in reviews_to_ints])
print('Zero-length {}'.format(reviews_lens[0]))
print("Max review length {}".format(max(reviews_lens)))

Zero-length 0
Max review length 931


#### Step 1.8 Creating word vectors

This step can be done in this way: 
    1. Define sequence length. (250 in this case)
    2. Each review shorted then this sequence will be padded (at the beginning) with zeros
    3. Each review longer than the sequence length will be shortened.

In [19]:
seq_len = 250

features = np.zeros((len(reviews_to_ints), seq_len), dtype=int)
for i, review in enumerate(reviews_to_ints):
    features[i, -len(review):] = np.array(review)[:seq_len]
    
features_test = np.zeros((len(unlabeled_to_ints), seq_len), dtype=int)
for i, review in enumerate(unlabeled_to_ints):
    features_test[i, -len(review):] = np.array(review)[:seq_len]

#### Step 1.9 Split into training and testing parts

In [20]:
X_train = features[:6400]
y_train = labels[:6400]

X_test = features[6400:]
y_test = labels[6400:]

X_unlabeled = features_test

print('X_trian shape {}'.format(X_train.shape))
print('X_unlabeled shape {}'.format(X_unlabeled.shape))

X_trian shape (6400, 250)
X_unlabeled shape (28936, 250)


### Done with preprocessing pipeline

## Step 2. Defining RNN

In [17]:
hidden_layer_size = 512 # how many nodes LSTM cells will have
number_of_layers = 1 # how many RNN layers the network will use
batch_size = 100 # how many reviews we feed at onces
learning_rate = 0.001 # learning rate
number_of_words = len(vocab_to_int) + 1 #how many unique words do we have in vocab (+1  is used for 0 - padding)
dropout_rate = 0.8 
embed_size = 300 #how long our word embedings will be
epochs = 6 # how many epochs do we use for training

In [18]:
tf.reset_default_graph() #Clean the graph

#### Step 2.1 Define placeholders

In [19]:
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')

#### Step 2.2 Define embeding layer

In [20]:
word_embedings = tf.Variable(tf.random_uniform((number_of_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(word_embedings, inputs)

#### Step 2.3 Define hidden layer and Dynamic RNN

In [21]:
hidden_layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
hidden_layer = tf.contrib.rnn.DropoutWrapper(hidden_layer, dropout_rate)

cell = tf.contrib.rnn.MultiRNNCell([hidden_layer]*number_of_layers)
init_state = cell.zero_state(batch_size, tf.float32)

W0112 01:15:58.993650 11256 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0112 01:15:58.993650 11256 deprecation.py:323] From <ipython-input-21-0abb52186384>:1: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0112 01:15:58.996642 11256 deprecation.py:323] From <ipython-input-21-0abb52186384>:4: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is e

In [22]:
outputs, states = tf.nn.dynamic_rnn(cell, embed, initial_state=init_state)

W0112 01:15:59.013597 11256 deprecation.py:323] From <ipython-input-22-54c760d65afb>:1: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
W0112 01:15:59.259591 11256 deprecation.py:506] From c:\anaconda\envs\tf\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0112 01:15:59.265543 11256 deprecation.py:506] From c:\anaconda\envs\tf\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py:738: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype 

#### Step 2.4 Get the prediction for each review 

From the last step of our network we get output and use it as a prediction. Than we use that result and compare it with real sentiment for that review.

In [23]:
prediction = tf.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
prediction = tf.identity(prediction, name = "Prediction")
cost = tf.losses.mean_squared_error(targets, prediction)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

W0112 01:15:59.522301 11256 deprecation.py:323] From <ipython-input-23-b2590b414f32>:1: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0112 01:15:59.819401 11256 deprecation.py:323] From c:\anaconda\envs\tf\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


#### Step 2.5 Define accuracy

In [24]:
currect_pred = tf.equal(tf.cast(tf.round(prediction), tf.int32), targets)
accuracy = tf.reduce_mean(tf.cast(currect_pred, tf.float32))

## Step 3. Training

In [25]:
session = tf.Session()

In [26]:
session.run(tf.global_variables_initializer())

In [None]:
for i in range(epochs):
    training_accurcy = []
    ii = 0
    epoch_loss = []
    while ii + batch_size <= len(X_train):
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)
        
        a, o, _ = session.run([accuracy, cost, optimizer], feed_dict={inputs:X_batch, targets:y_batch})

        training_accurcy.append(a)
        epoch_loss.append(o)
        ii += batch_size
    print('Epoch: {}/{}'.format(i, epochs), ' | Current loss: {}'.format(np.mean(epoch_loss)),
          ' | Training accuracy: {:.4f}'.format(np.mean(training_accurcy)*100))

Epoch: 0/6  | Current loss: 0.045098960399627686  | Training accuracy: 93.6875
Epoch: 1/6  | Current loss: 0.009381931275129318  | Training accuracy: 98.8125
Epoch: 2/6  | Current loss: 0.00508140679448843  | Training accuracy: 99.3750
Epoch: 3/6  | Current loss: 0.002901164349168539  | Training accuracy: 99.6562
Epoch: 4/6  | Current loss: 0.001484645064920187  | Training accuracy: 99.8437


In [None]:
saver = tf.train.Saver()
saver.save(session,'Model/saved_model')

In [None]:
test_accuracy = []

ii = 0
while ii + batch_size <= len(X_test):
    X_batch = X_test[ii:ii+batch_size]
    y_batch = y_test[ii:ii+batch_size].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inputs:X_batch, targets:y_batch})
    
    test_accuracy.append(a)
    ii += batch_size

In [None]:
print("Test accuracy is {:.4f}%".format(np.mean(test_accuracy)*100))
session.close()

## Step 4. Testing on the unlabeld data

In [125]:
predictions_unlabeled = []
ii = 0
batch_size = 1
while ii + batch_size <= len(X_unlabeled[:1]):
    if ii + batch_size > len(X_unlabeled[:1]):
        batch_size = len(X_unlabeled[:1]) - ii
    X_batch = X_unlabeled[ii:ii+batch_size]
    y_batch = X_unlabeled[ii:ii+batch_size].reshape(-1, 1)

    #pred = session.run([prediction], feed_dict={inputs:X_batch, targets:y_batch})
    pred = sess.run([prediction], feed_dict={x:X_batch, y:y_batch})
    
    predictions_unlabeled.append(pred)
    ii += batch_size

In [126]:
pred_real = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [None]:
np.savetxt('predictions.txt', pred_real)

In [None]:
new_dataframe = unlabeld_data[:len(pred_real)]

In [None]:
new_dataframe['Classes'] = pred_real

In [21]:
sess= tf.Session()
saver= tf.train.import_meta_graph('Model/saved_model.meta')
saver.restore(sess,tf.train.latest_checkpoint('Model/'))

graph = tf.get_default_graph()
x= graph.get_tensor_by_name("inputs:0")
y= graph.get_tensor_by_name("targets:0")
prediction = graph.get_tensor_by_name("Prediction:0")

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from Model/saved_model


In [25]:
pred = sess.run([prediction],feed_dict = {x : X_unlabeled[2:3], y: X_unlabeled[2:3].reshape(-1,1)})

In [26]:
predictions_unlabeled = []
predictions_unlabeled.append(pred)
pred_real = []
for i in range(len(predictions_unlabeled)):
    for ii in range(len(predictions_unlabeled[i][0])):
        if predictions_unlabeled[i][0][ii][0] >= 0.5:
            pred_real.append(1)
        else:
            pred_real.append(0)

In [32]:
X_unlabeled[1:2]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [30]:
unlabeld_data[1:2]

Unnamed: 0,Data
8797,stupid United airlines.


In [111]:
ii = 0
X_batch = X_unlabeled[ii:ii+batch_size]
y_batch = X_unlabeled[ii:ii+batch_size].reshape(-1, 1)

In [121]:
len(X_unlabeled[0:100])

100

In [128]:
unlabeld_data.shape

(28936, 1)

In [34]:
X_unlabeled[1:2]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [35]:
unlabeld_data[1:2]

Unnamed: 0,Data
8797,stupid United airlines.
