## LSTM with Upsampled Data and single label

In [57]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import losses

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9202345109833095951
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 179992166
locality {
  bus_id: 1
}
incarnation: 18127857902157479760
physical_device_desc: "device: 0, name: GeForce GTX 980, pci bus id: 0000:04:00.0, compute capability: 5.2"
]


In [58]:
# !pip install pandas
# !pip install np_utils
# !pip uninstall tensorflow
# !pip install matplotlib


In [59]:
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

In [168]:
identity = pd.read_json('../Datasets/balanced_one_label/ToxicOptimal.json')

In [169]:
identity = identity.sample(frac = 1)

In [171]:
print(identity.shape)
identity.head()

(183528, 2)


Unnamed: 0,comment_text,toxic
164487,"[Hey, clown, meat, doll, fucks, dog, Goethe, p...",1
146224,"[stop, working, stupid, ass, bitch, ghosts, pa...",1
121469,"[welcome, discussion, page, communist, asshole...",1
96660,"[User, Talk, Hesperus, asked, someone, opinion...",1
175003,"[You, toilet, dally, indeterminate, blockade, ...",1


In [172]:
#To check for NULL values in training and test data
identity.isnull().any()

comment_text    False
toxic           False
dtype: bool

In [173]:
#the dependent variables are in the training set itself so we need to split them up, into X and Y sets.
#list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
list_classes = ["toxic"]
y = identity[list_classes].values
list_sentences_train = identity["comment_text"]
#list_sentences_test = test["comment_text"]

In [174]:
# list_sentences_train

In [175]:
# to feed the comments into the LSTM as part of the neural network, these steps should be followed:
# 1) Tokenization - We need to break down the sentence into unique words. For eg, "I love cats and love dogs" will become ["I","love","cats","and","dogs"]
# 2) Indexing - We put the words in a dictionary-like structure and give them an index each For eg, {1:"I",2:"love",3:"cats",4:"and",5:"dogs"}
# 3) Index Representation- We could represent the sequence of words in the comments in the form of index, and feed this chain of index into our LSTM. For eg, [1,2,3,4,2,5]

In [176]:
max_features = 20000
tokenizer = Tokenizer(num_words = max_features)

In [177]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x1beb04e1f60>

In [178]:
tokenizer.fit_on_texts(list(list_sentences_train))

In [179]:
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
#list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [180]:
list_tokenized_train[:1]

[[457, 2177, 2000, 5960, 1862, 416, 8352, 116, 123, 18, 7598, 427, 178]]

In [181]:
#we have to feed a stream of data that has a consistent length(fixed number of features). Use padding for this.
#trim the longer sentences to the same length(maxlen) as the short ones. In this case, we have set the max length to be 200.
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
#X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [182]:
type(X_t)

numpy.ndarray

In [183]:
#to see distribution of words
totalNumWords = [len(one_comment) for one_comment in list_tokenized_train]

In [184]:
# plt.hist(totalNumWords,bins = np.arange(0,410,10))#[0,50,100,150,200,250,300,350,400])#,450,500,550,600,650,700,750,800,850,900])
# plt.show()
#output shows that most of the sentence length is about 30+

### Model

#### Input Layer

In [185]:
#the inputs into our networks are our list of encoded sentences.
#We begin defining an Input layer that accepts a list of sentences that has a dimension of 200.

In [186]:
inp = Input(shape=(maxlen, )) #maxlen=200 as defined earlier
#By indicating an empty space after comma, we are telling Keras to infer the number automatically

#### Embedding Layer

In [187]:
#here we project the words to a defined vector space depending on the distance of the surrounding words in a sentence
embed_size = 128
x = Embedding(max_features, embed_size)(inp)
#The output of the Embedding layer is just a list of the coordinates of the words in this vector space.
#For eg. (-81.012) for "cat" and (-80.012) for "dog".
#We could also use the distance of these coordinates to detect relevance and context. 


#### LSTM Layer

In [188]:
#We set the LSTM to produce an output that has a dimension of 60 and want it to return the whole unrolled sequence of results.
x = LSTM(60, return_sequences=True,name='lstm_layer')(x)

In [189]:
#the output is a tensor.
#To reshape 3D tensor to 2D, we use a Global Max Pooling layer which is traditionally used in CNN problems
#to reduce the dimensionality of image data

In [190]:
x = GlobalMaxPool1D()(x)

#### Dropout Layer

In [191]:
x = Dropout(0.1)(x)
#After a drop out layer, we connect the output of drop out layer to a densely connected layer
#and the output passes through a RELU function.

#Activation( (Input X Weights) + Bias)

In [192]:

x = Dense(50, activation="relu")(x) #define the Dense layer to produce an output dimension of 50

In [193]:
#Feed output into dropout layer again
x = Dropout(0.1)(x)

In [194]:
#Feed output to Sigmoid Layer
x = Dense(1, activation="sigmoid")(x)

In [195]:
# We have set our model to optimize our loss function using Adam optimizer, 
# define the loss function to be "binary_crossentropy" since we are tackling a binary classification.
# Default learning rate is set at 0.001.

In [196]:
model = Model(inputs=inp, outputs=x)
# model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['accuracy'])

In [160]:
batch_size = 128
epochs = 3
model_result = model.fit(X_t[0:183528],y[0:85000], batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 68000 samples, validate on 17000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [161]:
# calculating predictions on  the remaining dataset
predictions = model.predict(X_t[85000:])

In [162]:
df = pd.DataFrame(y)

In [163]:
df = pd.DataFrame(y)
tmp = pd.DataFrame(columns = {"true"})
tmp["true"]  = df[0]

In [164]:
t = pd.DataFrame(predictions)
t[t[0] > 0.5] = 1
t[t[0] < 0.5] = 0
t = pd.concat([t, tmp.reset_index(drop = True)], axis = 1)


In [165]:
t["difference"] = t[0] - t["true"]

In [166]:
result = t["difference"].sum()/ len(predictions)
result

0.005879882402351953

### Sequential Modelling

In [40]:
# model = Sequential()
# model.add(Dense(50, input_dim=200))
# model.add(Activation('relu'))
# model.add(Dense(1, input_dim=60))
# model.add(Activation('sigmoid'))

In [41]:
# y_pred = model.predict_proba(X_t)
# print(y_pred)

[[0.0000000e+00]
 [0.0000000e+00]
 [1.0000000e+00]
 ...
 [1.0000000e+00]
 [9.9999988e-01]
 [1.0466949e-18]]


In [52]:
# # len(X_t)
# # len(y_pred)
# difference = y_pred-y
# #len(difference)
# avg = np.average(difference)
# avg

0.17383676450243005

In [167]:
model.save('lstm_model_insult.h5')

In [50]:
# from keras.models import load_model
# # load model from single file
# model = load_model('lstm_model_toxic.h5')
# # make predictions
# yhat = model.predict(X_t, verbose=0)
# print(yhat)