In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from sklearn.metrics import classification_report, confusion_matrix

import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [0]:
def loadData(path):
  data = pd.read_csv(path)
  data = data.dropna()
  print("Dimensions of dataset:", data.shape)
  print(data.columns)
  return data

In [4]:
#Read word embeddings for train data
gloveEmbeddedTrainData = loadData("/content/drive/My Drive/Train_Glove_Embeddings.csv")

#Read word embeddings for test data
gloveEmbeddedTestData = loadData("/content/drive/My Drive/Test_Glove_Embeddings.csv")

Dimensions of dataset: (159468, 309)
Index(['Unnamed: 0', 'id', 'comment_text', '0', '1', '2', '3', '4', '5', '6',
       ...
       '296', '297', '298', '299', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate'],
      dtype='object', length=309)
Dimensions of dataset: (63380, 309)
Index(['Unnamed: 0', 'id', 'comment_text', '0', '1', '2', '3', '4', '5', '6',
       ...
       '296', '297', '298', '299', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate'],
      dtype='object', length=309)


In [5]:
#Drop unnecessary columns
gloveEmbeddedTrainData = gloveEmbeddedTrainData.drop(['Unnamed: 0'], axis=1)
print(gloveEmbeddedTrainData.columns)

gloveEmbeddedTestData = gloveEmbeddedTestData.drop(['Unnamed: 0'], axis=1)
print(gloveEmbeddedTestData.columns)

Index(['id', 'comment_text', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       '296', '297', '298', '299', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate'],
      dtype='object', length=308)
Index(['id', 'comment_text', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       '296', '297', '298', '299', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate'],
      dtype='object', length=308)


In [0]:
def trainRandomForest(trainData, noOfTrees):
  X = trainData[trainData.columns[2:302]]
  Y = trainData[trainData.columns[302:]]
  clf = RandomForestClassifier(n_estimators=noOfTrees, random_state=0)
  clf.fit(X, Y)
  return clf

In [0]:
def testRandomForest(testData, model):
  X = testData[testData.columns[2:302]]
  print("testData size:", X.shape)
  Y = testData[testData.columns[302:]]
  prob = model.predict_proba(X)
  predictions = np.zeros((gloveEmbeddedTestData.shape[0], 6))
  for i in range(len(prob)):
    x = prob[i]
    for j in range(len(x)):
      t = np.argmax(x[j])
      predictions[j][i] = t

  # incorrect = 0
  # for i in range(len(predictions)):
  #   sample = predictions[i]
  #   for j in range(len(sample)):
  #     if sample[j] != Y.iloc[i, j]:
  #       incorrect += 1
  # accuracy = len(predictions) - incorrect
  # accuracy = accuracy/len(predictions)

  #calculate score
  loss = log_loss(Y,predictions)
  print("Log_loss : {}".format(loss))
  predictions = np.round(predictions)
  loss = hamming_loss(Y,predictions)
  print("Hamming_loss : {}".format(loss*100))
  accuracy = accuracy_score(Y,predictions)
  print("Accuracy : {}".format(accuracy*100))


In [0]:
#Training Random Forest
randomForest = trainRandomForest(gloveEmbeddedTrainData, 150)

In [9]:
#Testing Random Forest
testRandomForest(gloveEmbeddedTestData, randomForest)


testData size: (63380, 300)
Log_loss : 0.42301236212709503
Hamming_loss : 3.9047543915009997
Accuracy : 89.65762070053644
