Load data for network from file

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
hm_lines = 100000

def create_lexicon(pos,neg):

    lexicon = []
    with open(pos,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)

    with open(neg,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        #print(w_counts[w])
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    print(len(l2))
    return l2





def sample_handling(sample,lexicon,classification):

    featureset = []

    with open(sample,'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1

            features = list(features)
            featureset.append([features,classification])

    return featureset



def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    lexicon = create_lexicon(pos,neg)
    features = []
    features += sample_handling(pos,lexicon,[1,0])
    features += sample_handling(neg,lexicon,[0,1])
    random.shuffle(features)
    features = np.array(features)

    testing_size = int(test_size*len(features))

    train_x = list(features[:,0][:-testing_size])
    train_y = list(features[:,1][:-testing_size])
    test_x = list(features[:,0][-testing_size:])
    test_y = list(features[:,1][-testing_size:])

    return train_x,train_y,test_x,test_y

In [2]:
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from pprint import pprint
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np



In [3]:
train_x,train_y,test_x,test_y = create_feature_sets_and_labels('Data/pos.txt','Data/neg.txt')

n_nodes_hl1 = 1500
n_nodes_hl2 = 1500
n_nodes_hl3 = 1500

n_classes = 2
batch_size = 100
hm_epochs = 10


423


In [4]:
class TfAnn(object):
    
    def __init__(self):
        self.hidden_1_layer = {'weights':[],'biases':[]}
        self.hidden_2_layer = {'weights':[],'biases':[]}
        self.hidden_3_layer = {'weights':[],'biases':[]}
        self.output_layer = {'weights':[],'biases':[]}
        self.np_hidden_1_layer={"weights":[],"biases":[]}
        self.np_hidden_2_layer={"weights":[],"biases":[]}
        self.np_hidden_3_layer={"weights":[],"biases":[]}
        self.np_output_layer={"weights":[],"biases":[]}
    
    def init_empty(self,size,n_nodes_hl1,n_nodes_hl2,n_nodes_hl3,n_classes):
        self.hidden_1_layer = {'weights':tf.Variable(tf.random_normal([size, n_nodes_hl1])),
                      'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}

        self.hidden_2_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                          'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}

        self.hidden_3_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                          'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}

        self.output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                        'biases':tf.Variable(tf.random_normal([n_classes]))}


    def init_values(self,size,l1_weights,l2_weights,l3_weights,out_weights,l1_biases,l2_biases,l3_biases,out_biases):
        self.hidden_1_layer = {'weights':tf.Variable(l1_weights),'biases':tf.Variable(l1_biases)}
        self.hidden_2_layer = {'weights':tf.Variable(l2_weights),'biases':tf.Variable(l2_biases)}
        self.hidden_3_layer = {'weights':tf.Variable(l3_weights),'biases':tf.Variable(l3_biases)}
        self.output_layer = {'weights':tf.Variable(out_weights),'biases':tf.Variable(out_biases)}


    def create(self,data):
        # This is the heart of the ann where multiply the data by the wights to the layers 
        l1 = tf.add(tf.matmul(data,self.hidden_1_layer['weights']), self.hidden_1_layer['biases'])
        l1 = tf.nn.relu(l1)

        l2 = tf.add(tf.matmul(l1,self.hidden_2_layer['weights']), self.hidden_2_layer['biases'])
        l2 = tf.nn.relu(l2)

        l3 = tf.add(tf.matmul(l2,self.hidden_3_layer['weights']), self.hidden_3_layer['biases'])
        l3 = tf.nn.relu(l3)

        output =  tf.add(tf.matmul(l3,self.output_layer['weights']) , self.output_layer['biases'])

        return output
    
    def extract(self):
        self.np_hidden_1_layer["weights"] = neural_network_model.hidden_1_layer["weights"].eval()
        self.np_hidden_2_layer["weights"] = neural_network_model.hidden_2_layer["weights"].eval()
        self.np_hidden_3_layer["weights"] = neural_network_model.hidden_3_layer["weights"].eval()
        self.np_output_layer["weights"] = neural_network_model.output_layer["weights"].eval()
        self.np_hidden_1_layer["biases"] = neural_network_model.hidden_1_layer["biases"].eval()
        self.np_hidden_2_layer["biases"] = neural_network_model.hidden_2_layer["biases"].eval()
        self.np_hidden_3_layer["biases"] = neural_network_model.hidden_3_layer["biases"].eval()
        self.np_output_layer["biases"] = neural_network_model.output_layer["biases"].eval()
        

In [5]:
def train_neural_network_interactive(neural_network_model):
    x= tf.placeholder('float')
    y = tf.placeholder('float')
    prediction = neural_network_model.create(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 10
    sess = tf.InteractiveSession()
    init = tf.initialize_all_variables()
    sess.run(init)
    for epoch in range(hm_epochs):
            epoch_loss = 0
            i=0
            while i < len(train_x):
                start = i
                end = i+batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])

                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                              y: batch_y})
                epoch_loss += c
                i+=batch_size
            print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)

    neural_network_model.extract()
    correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    print('Accuracy:',accuracy.eval({x:test_x, y:test_y}))

In [6]:
neural_network_model = TfAnn()
neural_network_model.init_empty(len(train_x[0]),n_nodes_hl1,n_nodes_hl2,n_nodes_hl3,n_classes)
train_neural_network_interactive(neural_network_model)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch 0 completed out of 10 loss: 1230083.8901367188
Epoch 1 completed out of 10 loss: 506106.4841308594
Epoch 2 completed out of 10 loss: 403998.9543457031
Epoch 3 completed out of 10 loss: 317235.5387878418
Epoch 4 completed out of 10 loss: 131969.86729431152
Epoch 5 completed out of 10 loss: 54791.73278045654
Epoch 6 completed out of 10 loss: 50840.01009941101
Epoch 7 completed out of 10 loss: 48541.59055709839
Epoch 8 completed out of 10 loss: 52740.26072216034
Epoch 9 completed out of 10 loss: 53824.746154785156
Accuracy: 0.61538464
