## LSTM with Upsampled Data and single label

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import losses

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10542537763580801275
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3228522905
locality {
  bus_id: 1
}
incarnation: 4901094013820040992
physical_device_desc: "device: 0, name: GeForce GTX 980, pci bus id: 0000:04:00.0, compute capability: 5.2"
]


In [2]:
# !pip install pandas
# !pip install np_utils
# !pip install matplotlib
# !pip install keras

In [4]:
obscene = pd.read_csv('Severe64MMft300.csv')
obscene.shape
obscene = obscene.sample(frac = 1)

In [5]:
xx = obscene.iloc[:, 1:1001]
yy = obscene.iloc[:, -1]

In [6]:
#To check for NULL values in training and test data
xx.isnull().any().sum()

0

In [7]:
#the dependent variables are in the training set itself so we need to split them up, into X and Y sets.
#list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# list_classes = ["toxic"]
# y = identity[list_classes].values
# list_sentences_train = identity["comment_text"]
#list_sentences_test = test["comment_text"]

In [8]:
# to feed the comments into the LSTM as part of the neural network, these steps should be followed:
# 1) Tokenization - We need to break down the sentence into unique words. For eg, "I love cats and love dogs" will become ["I","love","cats","and","dogs"]
# 2) Indexing - We put the words in a dictionary-like structure and give them an index each For eg, {1:"I",2:"love",3:"cats",4:"and",5:"dogs"}
# 3) Index Representation- We could represent the sequence of words in the comments in the form of index, and feed this chain of index into our LSTM. For eg, [1,2,3,4,2,5]

In [9]:
xx = xx.as_matrix()

### Model

#### Input Layer

In [10]:
#the inputs into our networks are our list of encoded sentences.
#We begin defining an Input layer that accepts a list of sentences that has a dimension of 200.

In [11]:
inp = Input(shape=(1000, )) #maxlen=200 as defined earlier
#By indicating an empty space after comma, we are telling Keras to infer the number automatically

#### Embedding Layer

In [12]:
#here we project the words to a defined vector space depending on the distance of the surrounding words in a sentence
embed_size = 128
x = Embedding(200, embed_size)(inp)
#The output of the Embedding layer is just a list of the coordinates of the words in this vector space.
#For eg. (-81.012) for "cat" and (-80.012) for "dog".
#We could also use the distance of these coordinates to detect relevance and context. 


#### LSTM Layer

In [13]:
#We set the LSTM to produce an output that has a dimension of 60 and want it to return the whole unrolled sequence of results.
x = LSTM(100, return_sequences=True,name='lstm_layer')(x)

In [14]:
#the output is a tensor.
#To reshape 3D tensor to 2D, we use a Global Max Pooling layer which is traditionally used in CNN problems
#to reduce the dimensionality of image data

In [15]:
x = GlobalMaxPool1D()(x)

#### Dropout Layer

In [16]:
x = Dropout(0.1)(x)
#After a drop out layer, we connect the output of drop out layer to a densely connected layer
#and the output passes through a RELU function.

#Activation( (Input X Weights) + Bias)

In [13]:

# x = Dense(50, activation="relu")(x) #define the Dense layer to produce an output dimension of 50

In [14]:
#Feed output into dropout layer again
# x = Dropout(0.1)(x)

In [17]:
#Feed output to Sigmoid Layer
x = Dense(1, activation="sigmoid")(x)

In [16]:
# We have set our model to optimize our loss function using Adam optimizer, 
# define the loss function to be "binary_crossentropy" since we are tackling a binary classification.
# Default learning rate is set at 0.001.

In [21]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
# model.compile(loss='mean_squared_error',
#                   optimizer='adam',
#                   metrics=['accuracy'])

In [22]:
batch_size = 128
epochs = 2
model_result = model.fit(xx,yy, batch_size=batch_size, epochs=epochs, validation_split=0.2,
                  callbacks=callbacks_list)

Train on 15306 samples, validate on 3827 samples
Epoch 1/2
 1920/15306 [==>...........................] - ETA: 2:06 - loss: 0.6915 - acc: 0.5094

KeyboardInterrupt: 

In [None]:
# calculating predictions on  the remaining dataset
predictions = model.predict(X_t[85000:])

In [162]:
df = pd.DataFrame(y)

In [163]:
df = pd.DataFrame(y)
tmp = pd.DataFrame(columns = {"true"})
tmp["true"]  = df[0]

In [164]:
t = pd.DataFrame(predictions)
t[t[0] > 0.5] = 1
t[t[0] < 0.5] = 0
t = pd.concat([t, tmp.reset_index(drop = True)], axis = 1)


In [165]:
t["difference"] = t[0] - t["true"]

In [166]:
result = t["difference"].sum()/ len(predictions)
result

0.005879882402351953

### Sequential Modelling

In [40]:
# model = Sequential()
# model.add(Dense(50, input_dim=200))
# model.add(Activation('relu'))
# model.add(Dense(1, input_dim=60))
# model.add(Activation('sigmoid'))

In [41]:
# y_pred = model.predict_proba(X_t)
# print(y_pred)

[[0.0000000e+00]
 [0.0000000e+00]
 [1.0000000e+00]
 ...
 [1.0000000e+00]
 [9.9999988e-01]
 [1.0466949e-18]]


In [25]:
# # len(X_t)
# # len(y_pred)
# difference = y_pred-y
# #len(difference)
# avg = np.average(difference)
# avg

In [24]:
# model.save('lstm_model_insult.h5')

In [50]:
# from keras.models import load_model
# # load model from single file
# model = load_model('lstm_model_toxic.h5')
# # make predictions
# yhat = model.predict(X_t, verbose=0)
# print(yhat)

### MLP

In [31]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

model = Sequential()
model.add(Dense(256, input_dim=1000, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.25))
# model.add(Dense(64, activation='softmax'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
#               rmsprop, adam - optimizer
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

model.fit(xx, yy,
          epochs=15,
          batch_size=64,
          validation_split=0.15, 
          callbacks=callbacks_list)
# score = model.evaluate(x_test, y_test, batch_size=128)

Train on 86150 samples, validate on 15203 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 00006: early stopping


<keras.callbacks.History at 0x25919207d30>

In [None]:


# model.fit(callbacks=callbacks_list)

