In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from sklearn.metrics import classification_report, confusion_matrix

import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

Using TensorFlow backend.


In [0]:
def Create_NN_Model(No_Features=300, No_Hidden_Layers=1, No_Hidden_Neurons=4, 
                    Hidden_Activation ="relu", No_OP_Neurons=6, 
                    Output_Activation="softmax", Kernel_Initializer="random_uniform",
                    Learning_Rate=0.001, Loss='categorical_crossentropy', 
                    Metrics =['accuracy']):
  
  Optimizer = optimizers.RMSprop(lr=Learning_Rate)
  classifier = Sequential()

  ## Input Layer
  classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                       kernel_initializer=Kernel_Initializer, input_dim=No_Features))
  classifier.add(Dropout(0.3))
  
  ## Hidden layers
  for i in range(No_Hidden_Layers-1):
    classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                         kernel_initializer=Kernel_Initializer))
    classifier.add(Dropout(0.3))
    
  ## Output Layer
  classifier.add(Dense(No_OP_Neurons, activation=Output_Activation, 
                       kernel_initializer=Kernel_Initializer))
  
  classifier.summary()
  classifier.compile(optimizer =Optimizer, loss=Loss, metrics = Metrics)

  return classifier
  

In [0]:
def Train_NN(NN_classifier, train_data, feature_list=[], Label_List=[], Batch_Size=64, Epochs=100):

  # train_data = train_data.apply(pd.to_numeric, errors='coerce')
  # train_data = train_data.dropna()

  train_data.dropna()
  train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
  for label in Label_List:
    train_data[label] = pd.to_numeric(train_data[label], errors='coerce')
  train_data = train_data.dropna(subset=Label_List)
  
  train_features = train_data[feature_list] 
  # train_features = (train_features-train_features.mean())/train_features.std()   
  train_labels = train_data[Label_List]
  train_labels = train_labels.astype('int')

  NN_classifier.fit(train_features,train_labels, batch_size=Batch_Size, epochs=Epochs)

  eval_model=NN_classifier.evaluate(train_features, train_labels)
  print("Loss: ", eval_model[0])
  print("Accuracy of the model: ", eval_model[1])

  return NN_classifier


In [0]:
def Store_Trained_NN(NN_obj, Filepath):
  
  with open(Filepath, "wb") as file:
    pickle.dump(NN_obj, file)

def Load_Trained_NN(Filepath):
  
  with open(Filepath, "rb") as file:
    NN_obj = pickle.load(file)

  return NN_obj

In [0]:
def Evaluate_NN(test_data, NN_Model_FilePath, feature_list=[], Label_List=[], threshold=0.5):
  
  test_data = test_data.fillna(1e-10)

  test_data.dropna()
  test_data = pd.DataFrame(np.nan_to_num(np.array(test_data)), columns = test_data.columns)
  for label in Label_List:
    test_data[label] = pd.to_numeric(test_data[label], errors='coerce')
  test_data = test_data.dropna(subset=Label_List)

  test_features = test_data[feature_list]
  test_labels = test_data[Label_List]
  test_labels = test_labels.astype('int')

  NN_obj = Load_Trained_NN(NN_Model_FilePath) 
  predictions = NN_obj.predict(test_features)

  print("test_labels shape: ", test_labels.shape)
  print("predictions: ", predictions.shape)
 
  loss = log_loss(test_labels,predictions)
  print("Log_loss : {}".format(loss))
  predictions = np.round(predictions)
  loss = hamming_loss(test_labels,predictions)
  print("Hamming_loss : {}".format(loss*100))
  accuracy = accuracy_score(test_labels,predictions)
  print("Accuracy : {}".format(accuracy*100))


In [0]:
## GLOVE EMBEDDINGS

Column_List = [ "id", "comment_text"]
Label_List = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Vector_Size = 300
Embedding_Cols = [str(i) for i in range(Vector_Size)]
Column_List.extend(Embedding_Cols)
Column_List.extend(Label_List)

Train_Embedding_FilePath = "/content/drive/My Drive/Train_Glove_Embeddings.csv"
Test_Embedding_FilePath = "/content/drive/My Drive/Test_Glove_Embeddings.csv"
NN_Model_FilePath = "/content/NN_Glove_Train_Model.pkl"

test_data = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)
train_data = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
train_data = train_data.fillna(1e-10)

X_train = train_data[Embedding_Cols]
Y_train = train_data[Label_List]
Y_train = Y_train.astype('int')

X_test = test_data[Embedding_Cols]
Y_test = test_data[Label_List]
Y_test = Y_test.astype('int')


In [0]:
## CROSS VALIDATION
%%capture
#define parameters for using in param grid
hidden_nodes = [16, 32, 64] # number of nodes in the hidden layer
learning_rates = [0.001, 0.002, 0.003] # learning rate, default = 0.001
epochs = [50, 100]
batches = [64, 128]

model = KerasClassifier(build_fn=Create_NN_Model)

#start fitting process
param_grid = dict(epochs=epochs, batch_size=batches, No_Hidden_Neurons=hidden_nodes, Learning_Rate=learning_rates)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1,refit=True,verbose=2)
grid_result = grid.fit(X_train, Y_train)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where










In [0]:
print(grid_result)
print('Best score : {}'.format(grid.best_score_))
print('Best params : {}'.format(grid.best_params_))

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f28d7938748>,
             iid='warn', n_jobs=1,
             param_grid={'Learning_Rate': [0.001, 0.002, 0.003],
                         'No_Hidden_Neurons': [16, 32, 64],
                         'batch_size': [64, 128], 'epochs': [50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)
Best score : 0.9941656065325153
Best params : {'Learning_Rate': 0.003, 'No_Hidden_Neurons': 16, 'batch_size': 64, 'epochs': 100}


In [0]:
## Store cross validation result in a file
import pickle
with open("/content/drive/My Drive/NN_CV_Result.pkl", "wb") as file:
  pickle.dump(grid, file)

In [0]:
predictions = grid.predict(X_test)
predict = grid.predict_proba(X_test)

In [0]:
predict_onehot = np.empty(predict.shape)
for i in range(predict.shape[0]):
  for j in range(predict.shape[1]):
    if predict[i][j] < 0.5:
      predict_onehot[i][j] = 0
    else:
      predict_onehot[i][j] = 1

In [0]:
#calculate score
loss = log_loss(Y_test,predict_onehot)
print("Log_loss : {}".format(loss))
predict = np.round(predict)
loss = hamming_loss(Y_test,predict_onehot)
print("Hamming_loss : {}".format(loss*100))
accuracy = accuracy_score(Y_test,predict_onehot)
print("Accuracy : {}".format(accuracy*100))

Log_loss : 1.1222380420845148
Hamming_loss : 5.619880648984258
Accuracy : 79.49847402699557
