<b>Sentiment Analysis with TensorFlow and Python </b>
<br/>
In this notebook, we create a simple neural network (NN) using TensorFlow and Python and use it to attempt sentiment analysis.

In [1]:
#imports
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
#from tensorflow.examples.tutorials.mnist import input_data
import pickle
import numpy as np

In [2]:
# Locations of the positive and negative sample files (Windows-style separators).
# Raw strings keep every backslash literal, so no character pair such as "\n"
# can be interpreted as an escape sequence by accident.
posFile = r"data\sentiment\pos.txt"
negFile = r"data\sentiment\neg.txt"
# Shared lemmatizer instance used when building the lexicon and the feature vectors.
lemmatizer = WordNetLemmatizer()
# Maximum number of lines taken from each input file.
hm_lines = 100000

In [3]:
#build the lexicon
def create_lexicon(pos, neg):
    """Build the lexicon (vocabulary) from the positive and negative files.

    Up to ``hm_lines`` lines are read from each file. Every line is decoded
    as UTF-8, tokenized, and each token is lemmatized. A word is kept when
    its number of occurrences is strictly between 50 and 1000.

    Args:
        pos: Path of the file holding the positive samples.
        neg: Path of the file holding the negative samples.

    Returns:
        A list of unique lower-cased words, in first-seen order.
    """
    lexicon = []
    for path in (pos, neg):
        # The file is read as bytes and decoded explicitly so the result
        # does not depend on the platform's default text encoding.
        with open(path, 'rb') as f:
            contents = f.readlines()

        for raw_line in contents[:hm_lines]:
            lexicon += word_tokenize(raw_line.decode("utf8"))

    # Counter maps each lemmatized word to the number of times it occurs.
    w_counts = Counter(lemmatizer.lemmatize(word) for word in lexicon)

    l2 = []
    seen = set()
    for w, count in w_counts.items():
        # Only consider words that occur more than 50 and fewer than 1000 times.
        if 1000 > count > 50:
            lowered = w.lower()
            # Counting is case-sensitive, so "Good" and "good" may both pass
            # the filter; keep a single entry so that no feature column is
            # duplicated (a duplicate would never be filled in).
            if lowered not in seen:
                seen.add(lowered)
                l2.append(lowered)
    return l2

In [4]:
def sample_handling(sample, lexicon, classification):
    """Convert every line of a sample file into a word-count feature vector.

    Iterates through the "sample" file (in our case pos.txt or neg.txt),
    taking at most ``hm_lines`` lines. Each line is lower-cased, tokenized by
    word and lemmatized. For every line we start from a numpy.zeros array
    with the length of the lexicon and increment the slot of each lexicon
    word found in the line.

    Args:
        sample: Path of the text file to convert (read as UTF-8, the same
            encoding ``create_lexicon`` decodes with).
        lexicon: List of known words; a word's position is its feature index.
        classification: Label attached to every line of this file.

    Returns:
        A list of ``[features, classification]`` pairs, where ``features`` is
        a list of per-word counts with ``len(lexicon)`` entries.
    """
    # Build the word -> index map once instead of scanning the list for
    # every token. setdefault keeps the first position of a repeated word,
    # which is what list.index would return.
    word_to_index = {}
    for position, word in enumerate(lexicon):
        word_to_index.setdefault(word, position)

    featureset = []
    with open(sample, 'r', encoding='utf8') as f:
        contents = f.readlines()

    for l in contents[:hm_lines]:
        current_words = word_tokenize(l.lower())
        features = np.zeros(len(lexicon))
        for word in current_words:
            index_value = word_to_index.get(lemmatizer.lemmatize(word).lower())
            if index_value is not None:
                # Count occurrences (a bag-of-words vector, not strictly one-hot).
                features[index_value] += 1
        featureset.append([list(features), classification])
    return featureset

The following method creates the feature sets and labels from the positive and negative text files. We use 0.1 (i.e. 10%) of the whole data as the test data. We use [1,0] as the label for positive and [0,1] as the label for negative.

In [5]:
def create_feature_sets_and_labels(pos, neg, test_size=0.1):
    """Create shuffled train/test features and labels from two sample files.

    Args:
        pos: Path of the positive samples file; its lines are labelled [1, 0].
        neg: Path of the negative samples file; its lines are labelled [0, 1].
        test_size: Fraction of all samples held out as test data.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of lists. The last
        ``int(test_size * total)`` shuffled samples form the test set and
        the remaining ones form the training set.
    """
    lexicon = create_lexicon(pos, neg)
    features = []
    # Use the files passed in by the caller, not the module-level paths.
    features += sample_handling(pos, lexicon, [1, 0])
    features += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(features)

    testing_size = int(test_size * len(features))
    # Split on an explicit index: slicing with -testing_size would put every
    # sample in the test set when testing_size is 0.
    split_at = len(features) - testing_size
    train, test = features[:split_at], features[split_at:]

    # Plain list handling avoids building a NumPy array out of ragged
    # [feature_vector, label] pairs.
    train_x = [sample for sample, _ in train]
    train_y = [label for _, label in train]
    test_x = [sample for sample, _ in test]
    test_y = [label for _, label in test]

    return train_x, train_y, test_x, test_y

#create_lexicon(posFile, negFile)

Now, we create a NN model using the data. We have three hidden layers. The first hidden layer multiplies the input data by its weights, adds its bias, and applies a ReLU. The second hidden layer uses the output of the first hidden layer as its input, and the same is repeated for the third hidden layer. The output layer uses the output of the third hidden layer as its input: its output is simply the matrix product of that input and the output-layer weights, plus the associated bias, with no activation applied. We can also use more sophisticated functions in this.

In [6]:
# Let's get the train and test data and their labels from the positive and negative files.
train_x,train_y,test_x,test_y = create_feature_sets_and_labels(posFile, negFile)

In [7]:
def neural_network_model(data):
    """Feed ``data`` through the three hidden layers and the output layer.

    Each hidden layer computes ``relu(input x weight + bias)`` using the
    module-level ``hidden_*_layer`` dictionaries; the output layer applies
    only the affine step.

    Args:
        data: Input tensor fed to the first hidden layer.

    Returns:
        The output-layer tensor (used as logits by the training code).
    """
    activation = data
    for layer in (hidden_1_layer, hidden_2_layer, hidden_3_layer):
        pre_activation = tf.add(tf.matmul(activation, layer['weight']), layer['bias'])
        # Rectified linear unit: max(pre_activation, 0).
        activation = tf.nn.relu(pre_activation)

    # No activation on the last layer: return the raw scores.
    return tf.matmul(activation, output_layer['weight']) + output_layer['bias']

Next, we define a method to train the neural network. The data is processed in batches of `batch_size` samples for `hm_epochs` epochs (both are set in the configuration cell below; this run uses 2 epochs).

In [8]:
def train_neural_network(x):
    """Train the network on ``train_x``/``train_y`` and print the test accuracy.

    Builds the softmax cross-entropy cost on top of ``neural_network_model``
    and minimizes it with Adam (learning rate 0.001), processing the training
    data in batches of ``batch_size`` for ``hm_epochs`` epochs. The summed
    batch loss is printed after each epoch and the accuracy on
    ``test_x``/``test_y`` is printed at the end.

    Args:
        x: Placeholder that receives the input features; the labels are fed
            through the module-level placeholder ``y``.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    with tf.Session() as sess:
        # tf.initialize_all_variables is deprecated in favour of this call.
        sess.run(tf.global_variables_initializer())

        # The writer is only used to dump the graph for TensorBoard, so it is
        # closed right away to flush the event file and release the handle.
        writer = tf.summary.FileWriter("graph/sentanalysis", graph=tf.get_default_graph())
        writer.close()

        for epoch in range(hm_epochs):
            epoch_loss = 0
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])

                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                              y: batch_y})
                epoch_loss += c

            print('Epoch', epoch + 1, 'completed out of', hm_epochs, 'loss:', epoch_loss)

        # A sample counts as correct when the highest-scoring output matches
        # the position of the 1 in its label.
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))

Now, we initialize some parameters. We use three hidden layers with each layer having 1500 nodes. We operate in batches of size 100 and we repeat the process for 2 epochs.

In [9]:
# Hyper-parameters. The sentiment prediction problem reuses the network that
# was originally set up for image classification.
#
# If you want to pickle the prepared data:
# with open('/path/to/sentiment_set.pickle','wb') as f:
#     pickle.dump([train_x,train_y,test_x,test_y],f)

# Number of nodes in each of the three hidden layers.
n_nodes_hl1 = n_nodes_hl2 = n_nodes_hl3 = 1500

# Number of output classes, samples per batch, and training epochs.
n_classes, batch_size, hm_epochs = 2, 100, 2

For TensorFlow, we need to define the elements so that they fit in the graph. We define each element, i.e. each hidden layer and the output layer, as a dictionary whose items point to the respective values or to the tensors that produce those values when the graph is executed. The x and y variables are defined as placeholders because they are fed with the data and labels of each batch inside the training loop.
The weights of each layer are tensors of size [input_nodes x nodes_in_current_layer], initialized with random values drawn from a normal distribution with mean 0 and standard deviation 1 (the defaults of tf.random_normal(...)). Every layer also defines bias values, which are likewise drawn from the random normal distribution.

In [10]:
# Placeholders for the input features (x) and the labels (y). No shape is
# given here; the values are supplied through feed_dict at run time.
x = tf.placeholder('float', name='x')
y = tf.placeholder('float', name='y')

# Each layer is a dict holding its 'weight' and 'bias' variables, both
# initialized from tf.random_normal. The weight shape is
# [inputs, nodes_in_this_layer] and the bias shape is [nodes_in_this_layer].
# NOTE(review): the 'name' and 'f_fum' entries are not read by any code
# visible in this notebook.

# First hidden layer: its input width is the length of one feature vector.
hidden_1_layer = {'name':'hid_1',
                    'f_fum': n_nodes_hl1,
                  'weight': tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hl1])),
                  'bias': tf.Variable(tf.random_normal([n_nodes_hl1]))}

# Second hidden layer: consumes the n_nodes_hl1 outputs of the first layer.
hidden_2_layer = {'name':'hid_2', 'f_fum': n_nodes_hl2,
                  'weight': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                  'bias': tf.Variable(tf.random_normal([n_nodes_hl2]))}

# Third hidden layer: consumes the n_nodes_hl2 outputs of the second layer.
hidden_3_layer = {'name':'hid_3','f_fum': n_nodes_hl3,
                  'weight': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                  'bias': tf.Variable(tf.random_normal([n_nodes_hl3]))}

# Output layer: maps the n_nodes_hl3 outputs of the third layer to n_classes scores.
output_layer = {'name':'out_1','f_fum': None,
                'weight': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                'bias': tf.Variable(tf.random_normal([n_classes])), }

In [11]:
# Let's create the log of the tensor graph to visualize it in TensorBoard.
writer = tf.summary.FileWriter("graph/sentanalysis", graph=tf.get_default_graph())
# The graph is handed to the writer on construction; closing it flushes the
# event file to disk and releases the file handle.
writer.close()
train_neural_network(x)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch 1 completed out of 2 loss: 1155276.79004
Epoch 2 completed out of 2 loss: 434481.717773
Accuracy: 0.609756


<b> References</b>
<br/>
1) https://pythonprogramming.net/using-our-own-data-tensorflow-deep-learning-tutorial/?completed=/tensorflow-neural-network-session-machine-learning-tutorial/