# NLP using Tensorflow

### Sentiments Rating

Words Dictionary - <br/>
    > pip3 install nltk
    >>> import nltk
    >>> nltk.download()

In [15]:
import nltk
'''
Tokenize each word of the sentence as a member of list
'''
from nltk.tokenize import word_tokenize
'''
stem removes the tenses like - running, run -> run
Lemmatizer ensures that each word has an actual dictionary meaning
'''
from nltk.stem import WordNetLemmatizer
import numpy as np
import pickle
from collections import Counter
import random

lemmatizer = WordNetLemmatizer()
total_lines = 10000000


def create_lexicon(pos, neg):
    lexicon = []
    for file in [pos, neg]:
        with open(file, 'r') as f:
            contents = f.readlines()
            for line in contents[:total_lines]:
                all_words = word_tokenize(line)
                lexicon += list(all_words)
                
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    '''
    We get dictionary of words with the total count of its occurence
    '''
    
    new_lexicon = []
    for w in w_counts:
        '''
        We do not want super common words
        and neither the really rare words
        '''
        if 1000 > w_counts[w] > 50:
            new_lexicon.append(w)
    
    print(len(new_lexicon))
    return new_lexicon

            
def sample_handling(sample, lexicon, classification):
    '''
    featureset will be a list of lists
    with the values 0 and 1 representing the occurence of each word.
    '''
    featureset = []
    
    with open(sample, 'r') as f:
        contents = f.readlines()
        for line in contents[:total_lines]:
            current_words = word_tokenize(line.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
                features = list(features)
                featureset.append([features, classification])
        
    return featureset


def create_feature_sets_and_labels(pos, neg, test_size = 0.1):
    lexicon = create_lexicon(pos, neg)
    features = []
    features += sample_handling('pos.txt', lexicon, [1,0])
    features += sample_handling('neg.txt', lexicon, [0,1])
    '''
    shuffling is necessary to create a balanced neural network
    '''
    random.shuffle(features)
    
    features = np.array(features)
    
    testing_size = int(test_size*len(features))
    
    train_x = list(features[:,0][:-testing_size])
    train_y = list(features[:,1][:-testing_size])
    
    test_x = list(features[:,0][-testing_size:])
    test_y = list(features[:,1][-testing_size:])
    
    return train_x, train_y, test_x, test_y


if __name__ == '__main__':
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels('pos.txt', 'neg.txt')
    with open('sentiment_set.pickle', 'wb') as f:
        pickle.dump([train_x, train_y, test_x, test_y], f)

print('Successfull!')      

423
Successfull!


In [17]:
import tensorflow as tf
import numpy as np

train_x, train_y, test_x, test_y = create_feature_sets_and_labels('pos.txt', 'neg.txt')

n_nodes_hidden_layer_1 = 500
n_nodes_hidden_layer_2 = 500
n_nodes_hidden_layer_3 = 500

n_classes = 2
batch_size = 100

x = tf.placeholder('float', [None, len(train_x[0])])
y = tf.placeholder('float')

def neural_network_model(data):
    hidden_layer_1 = {
        'weights': tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hidden_layer_1])),
        'biases': tf.Variable(tf.random_normal([n_nodes_hidden_layer_1]))
    }
    
    hidden_layer_2 = {
        'weights': tf.Variable(tf.random_normal([n_nodes_hidden_layer_1, n_nodes_hidden_layer_2])),
        'biases': tf.Variable(tf.random_normal([n_nodes_hidden_layer_2]))
    }
    
    hidden_layer_3 = {
        'weights': tf.Variable(tf.random_normal([n_nodes_hidden_layer_2, n_nodes_hidden_layer_3])),
        'biases': tf.Variable(tf.random_normal([n_nodes_hidden_layer_3]))
    }
    
    output_layer = {
        'weights': tf.Variable(tf.random_normal([n_nodes_hidden_layer_3, n_classes])),
        'biases': tf.Variable(tf.random_normal([n_classes]))
    }
    
    # (input_data * weights) + biases
    layer_1 = tf.add(tf.matmul(data, hidden_layer_1['weights']), hidden_layer_1['biases'])
    layer_1 = tf.nn.relu(layer_1)
    
    layer_2 = tf.add(tf.matmul(layer_1, hidden_layer_2['weights']), hidden_layer_2['biases'])
    layer_2 = tf.nn.relu(layer_2)
    
    layer_3 = tf.add(tf.matmul(layer_2, hidden_layer_3['weights']), hidden_layer_3['biases'])
    layer_3 = tf.nn.relu(layer_3)
    
    output = tf.add(tf.matmul(layer_3, output_layer['weights']), output_layer['biases'])
    
    return output

def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = prediction, labels = y))
    
    # default learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    #cycles feed_forward + backprop
    total_epochs = 10
    
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        
        for epoch in range(total_epochs):
            epoch_loss = 0
            
            i = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                
                __, c = sess.run([optimizer, cost], feed_dict = {x: batch_x, y: batch_y})
                epoch_loss += c
                
                i += batch_size
            
            print('Epoch: ', epoch, ' completed out of ', total_epochs, '; loss: ', epoch_loss)
            
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy: ', accuracy.eval({x: test_x, y: test_y}))
        
train_neural_network(x)

423
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch:  0  completed out of  10 ; loss:  820263.038733
Epoch:  1  completed out of  10 ; loss:  124230.17828
Epoch:  2  completed out of  10 ; loss:  71007.8575735
Epoch:  3  completed out of  10 ; loss:  60005.6615822
Epoch:  4  completed out of  10 ; loss:  50675.716779
Epoch:  5  completed out of  10 ; loss:  44982.2034805
Epoch:  6  completed out of  10 ; loss:  40489.946089
Epoch:  7  completed out of  10 ; loss:  35247.4727209
Epoch:  8  completed out of  10 ; loss:  32391.4346926
Epoch:  9  completed out of  10 ; loss:  29287.0620579
Accuracy:  0.870504


NEXT - https://pythonprogramming.net/data-size-example-tensorflow-deep-learning-tutorial/?completed=/train-test-tensorflow-deep-learning-tutorial/