In [6]:
%matplotlib inline  
import os
import pickle
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [39]:
#####################################################################################
# dataset used: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
# Build datasets for training and testing
#####################################################################################

# Directory, relative to the notebook's working directory, that holds the
# labelled sentence files (and the pickled train/test split loaded later on).
datapath = '../datasets/sentiment'
# Files read by get_lexicon() and get_features(); every line is split on a tab
# into the sentence text and an integer polarity label.
file_names = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']


def get_lexicon():
    """Build the vocabulary used for the bag-of-words features.

    Every sentence (the text before the first tab of each line) in the files
    listed in ``file_names`` is tokenised. Tokens longer than two characters
    are lower-cased and lemmatised, and a lemma is kept only when it occurs
    more than 10 and fewer than 500 times across all files.

    Returns:
        list of str: the retained lemmas, in order of first appearance.
        The vocabulary size is printed as a side effect.
    """
    wnl = WordNetLemmatizer()

    # Collect the raw tokens of every sentence in every file.
    tokens = []
    for name in file_names:
        with open(os.path.join(datapath, name), 'r') as handle:
            for row in handle:
                sentence = row.split('\t')[0]
                tokens.extend(word_tokenize(sentence))

    # The length filter is applied to the raw token, before normalisation.
    lemmas = [wnl.lemmatize(tok.lower()) for tok in tokens if len(tok) > 2]

    # Drop both very rare and very common lemmas.
    counts = Counter(lemmas)
    vocabulary = [lemma for lemma, seen in counts.items() if 10 < seen < 500]

    print(len(vocabulary))
    return vocabulary


def get_features(lexicon):
    """Convert every labelled sentence into a bag-of-words vector and a one-hot label.

    Tokens are normalised the same way get_lexicon() normalises them when it
    builds the vocabulary (tokens of length <= 2 are dropped, the rest are
    lower-cased and WordNet-lemmatised). Without this, inflected forms never
    match their lemma in the lexicon and are silently not counted.

    Args:
        lexicon: sequence of vocabulary words; entry ``i`` of each feature
            vector counts occurrences of ``lexicon[i]``.

    Returns:
        list of ``[features, label]`` pairs, in file order. ``features`` is a
        list of ``len(lexicon)`` counts; ``label`` is ``[0, 1]`` when the
        line's polarity is 0 and ``[1, 0]`` otherwise.

    Raises:
        ValueError: if a non-blank line has no tab-separated label or the
            label is not an integer.
    """
    lemmatizer = WordNetLemmatizer()

    # word -> column, built once so each token costs one dict lookup instead of
    # two linear scans. setdefault keeps the first position of a repeated word,
    # which is what list.index() would have returned.
    word_index = {}
    for position, word in enumerate(lexicon):
        word_index.setdefault(word, position)

    featureset = []
    for file_name in file_names:
        with open(os.path.join(datapath, file_name), 'r') as file:
            for line in file:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines

                # The label is the last tab-separated field; splitting from the
                # right keeps sentences that themselves contain a tab intact.
                sentence, pol = line.rsplit('\t', 1)
                label = [0, 1] if int(pol) == 0 else [1, 0]

                features = np.zeros(len(lexicon))
                for token in word_tokenize(sentence):
                    if len(token) <= 2:
                        continue  # get_lexicon() never admits these tokens
                    idx = word_index.get(lemmatizer.lemmatize(token.lower()))
                    if idx is not None:
                        features[idx] += 1

                featureset.append([list(features), label])
    return featureset
   

def create_train_test_features(test_size=0.2):
    """Build the lexicon and feature set, then split them into train and test parts.

    Args:
        test_size: fraction of the samples placed in the test split.

    Returns:
        tuple ``(train_x, train_y, test_x, test_y)``; the ``*_x`` entries are
        lists of feature vectors and the ``*_y`` entries are lists of one-hot
        labels, as produced by get_features().
    """
    lexicon = get_lexicon()
    samples = get_features(lexicon)

    # The files are read one after another, so without shuffling the test split
    # would be the tail of the last file only and not representative.
    random.shuffle(samples)

    size = len(samples)
    train_size = int(size - size*test_size)

    # Split with plain Python. Each sample pairs a len(lexicon)-long vector with
    # a 2-long label, so np.array(samples) is ragged: NumPy >= 1.24 raises
    # ValueError for it (older versions only warned).
    train, test = samples[:train_size], samples[train_size:]
    train_x = [features for features, _ in train]
    train_y = [label for _, label in train]
    test_x = [features for features, _ in test]
    test_y = [label for _, label in test]

    return train_x, train_y, test_x, test_y

# train_x, train_y, test_x, test_y = create_train_test_features()
# with open(os.path.join(datapath, 'sentiment_set.pkl'), 'wb') as file:
#     pickle.dump([train_x, train_y, test_x, test_y], file)


396


In [43]:
#####################################################################################
# Build Classification model using Tensorflow
#####################################################################################

# Name of the pickle inside `datapath`; it is unpacked below into
# train_x, train_y, test_x, test_y.
pkl_file_name = 'sentiment_set.pkl'
# pkl_file_name = 'sentex.pkl'

# Placeholders only: all four names are rebound by the unpacking below.
train_x, train_y, test_x, test_y  = [],[],[],[]
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a file that was produced locally from a trusted source.
with open(os.path.join(datapath, pkl_file_name), 'rb') as file:
    train_x, train_y, test_x, test_y = pickle.load(file)


# Hyperparameters read by neural_network_model() and train_neural_network().
n_classes = 2        # width of the output layer
num_epochs = 10      # full passes over train_x
batch_size = 100     # samples fed per optimisation step
nn_nodes_hl1 = 1500  # units in hidden layer 1
nn_nodes_hl2 = 1500  # units in hidden layer 2
nn_nodes_hl3 = 1500  # units in hidden layer 3

# Input placeholder: any number of rows, one column per entry of a feature vector.
x = tf.placeholder('float', [None, len(train_x[0])])
# Label placeholder; no shape is declared, it is fed batches of train_y / test_y.
y = tf.placeholder('float')

def neural_network_model(data):
    """Create a fully connected network with three ReLU hidden layers.

    All weights and biases are fresh ``tf.Variable``s initialised from
    ``tf.random_normal``. Layer widths come from the module-level
    ``nn_nodes_hl1..3`` and ``n_classes``; the input width is the length of
    the first training vector.

    Args:
        data: input tensor with one row per sample.

    Returns:
        The output layer's pre-activation tensor (logits), ``n_classes`` wide.
    """
    n_inputs = len(train_x[0])

    # (fan_in, fan_out) of every dense layer, from input to output.
    layer_shapes = [
        (n_inputs, nn_nodes_hl1),
        (nn_nodes_hl1, nn_nodes_hl2),
        (nn_nodes_hl2, nn_nodes_hl3),
        (nn_nodes_hl3, n_classes),
    ]

    # Create all variables up front, weights before biases for each layer.
    layers = [
        {'weights': tf.Variable(tf.random_normal([fan_in, fan_out])),
         'biases': tf.Variable(tf.random_normal([fan_out]))}
        for fan_in, fan_out in layer_shapes
    ]
    hidden_layers, output_layer = layers[:-1], layers[-1]

    # Hidden layers: affine transform followed by ReLU.
    activation = data
    for layer in hidden_layers:
        pre_activation = tf.add(tf.matmul(activation, layer['weights']), layer['biases'])
        activation = tf.nn.relu(pre_activation)

    # Output layer: affine transform only, no activation.
    return tf.add(tf.matmul(activation, output_layer['weights']), output_layer['biases'])


def train_neural_network(x):
    """Train the classifier on train_x/train_y and print its test accuracy.

    Builds the network on ``x``, minimises the mean softmax cross-entropy
    against the module-level label placeholder ``y`` with Adam, and runs
    ``num_epochs`` passes over the training data in consecutive batches of
    ``batch_size``. The summed batch loss is printed after every epoch and the
    accuracy on test_x/test_y is printed at the end.

    Args:
        x: input placeholder that the network is built on and batches are fed to.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        # tf.initialize_all_variables() is deprecated; this is its replacement.
        sess.run(tf.global_variables_initializer())

        for epoch in range(num_epochs):
            epoch_loss = 0
            # Walk the training set in consecutive slices; the last slice may
            # be shorter than batch_size.
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size

                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])

                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                epoch_loss += c

            print('Epoch: ', epoch, ' Completed out of: ', num_epochs, ' loss: ', epoch_loss)

        # A sample is correct when the largest logit and the one-hot label
        # point at the same class index.
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        # eval() needs the default session, so it must stay inside the with-block.
        print('Accuracy:',accuracy.eval({x:test_x, y:test_y}))

# Build the network on the `x` placeholder, train it, and print the test accuracy.
train_neural_network(x)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch:  0  Completed out of:  10  loss:  397286.949707
Epoch:  1  Completed out of:  10  loss:  165049.135254
Epoch:  2  Completed out of:  10  loss:  165852.724243
Epoch:  3  Completed out of:  10  loss:  130939.307251
Epoch:  4  Completed out of:  10  loss:  30148.7880554
Epoch:  5  Completed out of:  10  loss:  12226.3799133
Epoch:  6  Completed out of:  10  loss:  6978.53742218
Epoch:  7  Completed out of:  10  loss:  4871.41127396
Epoch:  8  Completed out of:  10  loss:  2697.82446218
Epoch:  9  Completed out of:  10  loss:  3075.82834148
Accuracy: 0.703333
