In [1]:
import numpy as np
import keras 
import pandas as pd
from keras.layers import *
from keras.models import *
import glob
import os
import math
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.metrics import classification_report
import time

%matplotlib inline



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Dictionary key under which the vector for out-of-vocabulary words is stored.
# NOTE: the literal is spelled "UNKOWN" (sic); it is only used as a key, so the
# spelling has no effect as long as every cell goes through this variable.
unknown_token = "UNKOWN_TOKEN"
# Dictionary key under which the padding vector is stored.
empty_token   = "EMPTY_TOKEN"

In [3]:
os.listdir()

['.ipynb_checkpoints',
 'glove short',
 'glove.42B.300d.txt',
 'neg',
 'pos',
 'Sentiment analysis.ipynb',
 'test.zip',
 'train.zip']

In [4]:
def get_folder(address):
    '''
    Reads every regular file of a folder and returns the contents as a list of strings.
    
    address : path of the folder, with or without a trailing separator
    returns : list holding one string per file, in sorted file-name order
    '''
    
    listRes = list()
    
    # os.listdir gives no ordering guarantee; sorting keeps runs reproducible
    for name in sorted(os.listdir(address)):
        
        # os.path.join works whether or not `address` ends with a separator
        path = os.path.join(address, name)
        
        # skip sub-directories (e.g. ".ipynb_checkpoints") instead of failing in open()
        if not os.path.isfile(path):
            continue
        
        with open(path, encoding="utf8") as file:
            listRes.append(file.read())
            
    return listRes

In [5]:
def get_data(neg = "neg/", pos="pos/"):
    '''
    Loads the raw review texts of both classes.
    
    neg : folder that holds the negative reviews
    pos : folder that holds the positive reviews
    returns : (list of negative texts, list of positive texts)
    '''
    
    # the tuple is evaluated left to right, so the negative folder is read first
    return get_folder(neg), get_folder(pos)
    
            

In [6]:
def get_glove_dict(w2v):
    '''
    Builds a word -> vector dictionary from the raw text of embedding files.
    
    w2v : list of strings; every line is a word followed by space separated numbers
    returns : dict that maps each word to a list of floats
    
    Lines without any number after the first field (e.g. empty lines) are ignored.
    '''
    dictRes = dict()
    
    for content in w2v:
        for line in content.split("\n"):
            
            # first field is the word, everything after it is the vector
            word, *values = line.split(" ")
            
            if values:
                dictRes[word] = list(map(float, values))
            
    return dictRes

In [7]:
def get_dicts(listNeg, listPos, dictW2v):
    
    '''
    Assigns an integer id to every distinct token and counts the occurrences.
    
    Tokens that have no entry in dictW2v are all mapped onto the shared
    unknown_token key, so they get a single id and a single counter.
    
    returns :
    W2ID : word to id
    ID2W : id to word
    OCC  : word to number of occurrences
    '''
    
    W2ID, ID2W, OCC = dict(), dict(), dict()
    
    # negative texts are processed first, then the positive ones
    for text in listNeg + listPos:
        
        for token in word_tokenize(text):
            
            # out-of-vocabulary tokens collapse onto the unknown token
            key = token if token in dictW2v else unknown_token
            
            if key in W2ID:
                OCC[key] += 1
                continue
            
            # ids are handed out consecutively, so the next free id is the current size
            newID = len(W2ID)
            W2ID[key] = newID
            ID2W[newID] = key
            OCC[key] = 1
    
    return W2ID, ID2W, OCC
    
    

In [8]:
def get_spec_data(data, label, w2id, dictW2v):
    
    '''
    Turns every text into an array of word vectors and pairs it with a label.
    
    data    : list of raw texts
    label   : label that is attached to every text of `data`
    w2id    : known vocabulary; tokens missing from it are treated as unknown
    dictW2v : word -> vector dictionary, must contain the unknown_token key
    
    returns : (list of arrays with one row per token, list of labels)
    '''
    
    X, y = [], []
    
    for text in data:
        
        # tokens outside the vocabulary fall back to the unknown token vector
        vectors = [
            dictW2v[token] if token in w2id else dictW2v[unknown_token]
            for token in word_tokenize(text)
        ]
        
        X.append(np.array(vectors))
        y.append(label)
                
    return X, y

In [9]:
def get_final_data(listNeg, listPos, w2id, id2w, dictW2v):
    
    '''
    Vectorizes both classes and concatenates them (label 0 = negative, 1 = positive).
    
    listNeg, listPos : lists of raw texts
    w2id             : known vocabulary, forwarded to get_spec_data
    id2w             : not used, kept so existing calls keep working
    dictW2v          : word -> vector dictionary, forwarded to get_spec_data
    
    returns :
    X : 1-D object array, every element is the (tokens, vector size) array of one text
    y : 1-D array with the labels, negatives first
    '''
    
    Xn, yn = get_spec_data(listNeg, 0, w2id, dictW2v)
    
    Xp, yp = get_spec_data(listPos, 1, w2id, dictW2v)
    
    samples = Xn + Xp
    
    # The texts have different lengths, so the arrays can not be stacked.
    # np.array() on such a ragged list raises a ValueError on recent NumPy
    # versions, therefore the object array is filled element by element.
    X = np.empty(len(samples), dtype=object)
    for idx, sample in enumerate(samples):
        X[idx] = sample
    
    y = np.array(yn + yp)
    
    return X, y


In [10]:
class DataForRNN:
    
    '''
    Groups samples by sequence length so that every group can be fed to the RNN
    as one regular array.
    
    dictFinalX : sequence length -> array with all samples of that length
    dictFinaly : sequence length -> array with the matching labels
    '''
    
    def __init__(self, X, y):
        
        self.dictFinalX = dict()
        self.dictFinaly = dict()
        
        # first pass : collect samples and labels per length in plain lists
        for idx, sample in enumerate(X):
            
            length = sample.shape[0]
            
            self.dictFinalX.setdefault(length, []).append(sample)
            self.dictFinaly.setdefault(length, []).append(y[idx])
            
        # second pass : turn every list into an array
        for length in list(self.dictFinalX.keys()):
            
            self.dictFinalX[length] = np.array(self.dictFinalX[length]) 
            self.dictFinaly[length] = np.array(self.dictFinaly[length])
        
        
    

In [77]:
def get_model(shape):
    '''
    Builds the recurrent classifier : one LSTM layer followed by a small dense head.
    
    shape : input shape without the batch axis
    returns : uncompiled keras Model with a single sigmoid output
    '''
    
    inputs = Input(shape)
    
    # return_sequences=False : only the last output of the LSTM is passed on
    hidden = LSTM(40, return_sequences=False)(inputs)
    
    for units in (10, 5):
        hidden = Dense(units, activation=keras.activations.relu)(hidden)
    
    output = Dense(1, activation=keras.activations.sigmoid)(hidden)
    
    return Model(inputs, output)
    

In [12]:
def get_model_cnn(shape):
    
    '''
    Builds the convolutional classifier : four 1-D convolutions, a flatten
    layer and a dense head with a single sigmoid output.
    
    shape : input shape without the batch axis; Flatten followed by Dense
            needs a fixed sequence length
    returns : uncompiled keras Model
    '''
    
    inputs = Input(shape)
    
    hidden = inputs
    
    # (filters, stride) of the convolutions, all of them with kernel size 5
    for filters, stride in ((150, 1), (50, 2), (150, 1), (50, 2)):
        hidden = Conv1D(filters, 5, strides=stride, padding="SAME")(hidden)
    
    hidden = Flatten()(hidden)
    
    for units in (100, 50, 50):
        hidden = Dense(units, activation=keras.activations.relu)(hidden)
    
    output = Dense(1,  activation=keras.activations.sigmoid)(hidden)
    
    return Model(inputs, output)
    

In [13]:
def get_data_the_same(X, y, empty_vec, mean = True):
    
    '''
    Brings all samples to one common length so they can be used by the CNN.
    
    X         : sequence of arrays with shape (tokens, vector size)
    y         : not used, kept so existing calls keep working
    empty_vec : vector that is appended to samples which are too short
    mean      : if True the common length is the (truncated) average length,
                otherwise it is the maximum length
    
    returns : array with shape (samples, common length, vector size);
              longer samples are cut at the end, shorter ones are padded at the end
    '''
    
    # nothing to resize
    if len(X) == 0:
        return np.array([])
    
    sizes = np.array([sample.shape[0] for sample in X])
    
    # The flag must not share its name with the computed average : the original
    # code overwrote `mean`, which made the maximum branch unreachable.
    if mean :
        size = int(np.mean(sizes))
    else:
        size = int(np.max(sizes))
    
    padRow = np.asarray(empty_vec)
    
    res = []
    
    for sample in X:
        
        length = sample.shape[0]
        
        if length > size :
            
            res.append(sample[ : size, : ]) 
            
        elif length < size :
            
            needsToBeAdded = np.tile(padRow, (size - length, 1))
            
            if length == 0 :
                # a sample without tokens has no second axis to concatenate on
                res.append(needsToBeAdded)
            else :
                res.append(np.concatenate([sample, needsToBeAdded], axis = 0))
            
        else :
            res.append(sample)
            
    return np.array(res)

In [14]:
# Load the raw review texts from the default "neg/" and "pos/" folders.
listNeg, listPos = get_data()

In [15]:
len(listNeg)

12500

In [16]:
len(listPos)

12500

In [17]:
print(listPos[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [18]:
print(word_tokenize(listPos[0]))

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '``', 'Teachers', "''", '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '``', 'Teachers', "''", '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '...', '...', '...', 'at', '...', '...', '...', '.', 'High', '.', 'A'

In [19]:
# Read the raw text of every file inside the "glove short/" folder.
w2v = get_folder("glove short/")

In [20]:
# Parse the raw text into a word -> vector dictionary.
dictW2v = get_glove_dict(w2v)

This is the number of words available in our dictionary

In [21]:
len(dictW2v.keys())

400000

As you can see below, the NLTK tokenizer has done a pretty good job:
the pieces it splits off (such as `'m`) are also available in our dictionary.

In [22]:
"'m" in dictW2v.keys()

True

The size of each vector is 50

In [23]:
len(dictW2v["the"])

50

`encodes` will contain the vector of every word, so that we can compute the variance of the vector values.

In [24]:
# Stack the vectors of all words into one array, one row per word.
encodes = np.array(list(dictW2v.values()))

In [25]:
encodes.shape

(400000, 50)

In [26]:
encodes.var()

0.4148703462915355

So far we have no vectors for unknown tokens or for empty tokens (which we need in case we decide to pad a sentence).

In [27]:
unknown_token in dictW2v.keys()

False

Here we create them :D

In [28]:
# Both placeholder vectors are drawn uniformly from [-var, +var], where var is
# the variance over all embedding values (computed once instead of four times).
embed_var = encodes.var()
# take the vector size from the embeddings instead of hard-coding 50
embed_dim = encodes.shape[1]

unknown_vec = dictW2v[unknown_token] = np.random.uniform(-embed_var, +embed_var, embed_dim)
empty_vec = dictW2v[empty_token] = np.random.uniform(-embed_var, +embed_var, embed_dim)

In [29]:
# Build the vocabulary (word <-> id) and the occurrence counts over all reviews.
w2id, id2w, occ = get_dicts(listNeg, listPos, dictW2v)

Here we have some basic information about our data

In [30]:
occ[unknown_token]

838916

In [31]:
occ["the"]

289306

In [32]:
occ["'m"]

4738

In [33]:
# Vectorize every review; y holds 0 for negative and 1 for positive reviews.
X, y = get_final_data(listNeg, listPos, w2id, id2w, dictW2v)

Here we have our data as vectors, but as you can guess the comments have different lengths, so `X.shape` has only one dimension.

In [34]:
X.shape

(25000,)

In [35]:
X[0].shape

(123, 50)

In [36]:
y.shape

(25000,)

In [37]:
len(set([i.shape[0] for i in X]))

1195

So the comments come in more than 1,000 different lengths.

In [38]:
# Group the reviews by their number of tokens so every group forms a regular array.
rnnData = DataForRNN(X, y)

In [39]:
# Shape of the group that belongs to the first sequence length stored in rnnData.
shapeEx = rnnData.dictFinalX[list(rnnData.dictFinalX.keys())[0]].shape

In [40]:
shapeEx

(56, 123, 50)

The output above shows how many comments share the first length (56), followed by that length (123) and the vector size (50).

In [78]:
# The time axis is left as None so the length is not fixed; the last axis is the vector size.
model = get_model((None, shapeEx[-1]))

In [79]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, None, 50)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 40)                14560     
_________________________________________________________________
dense_13 (Dense)             (None, 10)                410       
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 6         
Total params: 15,031
Trainable params: 15,031
Non-trainable params: 0
_________________________________________________________________


In [80]:
# Adam optimizer with learning rate 1e-3, binary cross-entropy loss, accuracy as metric.
# NOTE(review): newer Keras releases name this argument `learning_rate` instead of `lr` - confirm against the installed version.
model.compile(keras.optimizers.Adam(lr = 1e-3), keras.losses.binary_crossentropy, metrics=["accuracy"])

In [44]:
# The raw texts and the embedding dictionary are not needed any more.
del listNeg, listPos, dictW2v

In [45]:
# Picks the 21st sequence length stored in rnnData.
# NOTE(review): no visible cell reads `size` before it is assigned again further down - this looks like a leftover.
size = list(rnnData.dictFinalX.keys())[20]

In [46]:
# Train on one length group at a time; the slice [:1] limits this to the first group.
for s in list(rnnData.dictFinalX.keys())[:1]:
    example_size = rnnData.dictFinaly[s].shape[0]
    # smaller groups get smaller batches
    batch_size = 2 if example_size < 16 else (8 if example_size < 128 else 16)
    model.fit(rnnData.dictFinalX[s], rnnData.dictFinaly[s], epochs=30, batch_size=batch_size, verbose=0)

I decided to use only the first size because of limited resources, but you can of course use the whole data set :D
Below we see how much our model learned from the first size of the training data.

In [47]:
# Evaluate on the group of the first sequence length (the same group the loop above trained on).
key = list(rnnData.dictFinalX.keys())[0]
y_pred = model.predict(rnnData.dictFinalX[key])
# the sigmoid output is thresholded at 0.5 to obtain class labels
print(classification_report(rnnData.dictFinaly[key], y_pred > 0.5))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98        27
          1       1.00      0.97      0.98        29

avg / total       0.98      0.98      0.98        56



Technically we should use different sets for training and testing, but this section was only meant to show how to feed sequences of different sizes to an RNN.

In [48]:
# The grouped copy of the data is not needed any more.
del rnnData

In [49]:
# Cut / pad every review to one common length so X becomes a regular array.
# NOTE: this rebinds X, the variable-length version is no longer available afterwards.
X = get_data_the_same(X, y, empty_vec)

In [50]:
X[0].shape

(282, 50)

In [51]:
# Shuffle samples and labels with the same random index order so they stay aligned.
ind = np.arange(X.shape[0])
np.random.shuffle(ind)
X = X [ind]

y  = y  [ind]


In [52]:
# The input shape is X without its first (sample) axis.
model_cnn = get_model_cnn(X.shape[1:])

In [53]:
# Same training setup as the RNN: Adam with learning rate 1e-3, binary cross-entropy, accuracy.
model_cnn.compile(keras.optimizers.Adam(lr = 1e-3), keras.losses.binary_crossentropy, metrics = ["accuracy"])

Here you will see the accuracy of the CNN

In [56]:
# fraction of the samples used for training, the remaining ones are used for validation
p = 0.9
row = int(X.shape[0]*p)
# validation_data is passed as a tuple, which is the form documented by Keras
model_cnn.fit(X[:row], y[:row], epochs=5, batch_size=16, validation_data=(X[row:], y[row:]))

Train on 22500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x293ffdd7f28>

And here is the accuracy of the RNN

In [83]:
# NOTE(review): the LSTM in get_model is not created with stateful=True, and
# reset_states() presumably leaves the weights untouched - so if `model` was
# already fitted in an earlier cell, training continues from those weights.
# Confirm whether a freshly built model was intended here.
model.reset_states()
# only the first `size` samples of the fixed-length data are used for the RNN
size = 1000
# `p` is the training fraction defined in the CNN training cell
row = int(p * size)
model.fit(X[:row], y[:row], epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x293f8edf3c8>

In [84]:
# Evaluate on the samples between `row` and `size`, which were not part of the fit above.
print(classification_report(y[row:size], model.predict(X[row:size]) > 0.5))

             precision    recall  f1-score   support

          0       0.47      0.76      0.58        49
          1       0.45      0.20      0.27        51

avg / total       0.46      0.47      0.43       100



As you can see above, it takes much more time to train the RNN, and even with the same number of epochs it does not perform as well, but you should take into consideration that RNNs can learn more complex patterns. <br>
If you have time, you can use all the data for training and testing, train for more epochs, and maybe also use a smaller batch_size to improve the result.

P.S. This notebook will be updated soon. Thanks for reading :D