<a href="https://colab.research.google.com/github/sayarghoshroy/Hate-Speech-Detection/blob/master/perspective_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Testing Effectiveness of Perspective API features

In [2]:
import pickle
import numpy as np
import random
from sklearn.metrics import classification_report
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive (Colab only) so the pickled datasets stored under
# '/content/drive' can be opened by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Loading datasets

def _load_pickle(path):
  """Return the object stored in the pickle file at `path`.

  NOTE(security): `pickle.load` can execute arbitrary code while
  deserialising, so only point this at files you created or trust.
  """
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# Perspective API scores (one entry per attribute, see the visualisation cell)
en_load = '/content/drive/My Drive/HASOC_raw_data/perspective_train/en.pickle'
ge_load = '/content/drive/My Drive/HASOC_raw_data/perspective_train/ge.pickle'

# Processed datasets holding the labels (e.g. 'task_1') and tweet fields
en_data_load = '/content/drive/My Drive/2020_processed_data/en.pickle'
ge_data_load = '/content/drive/My Drive/2020_processed_data/ge.pickle'

en_pers = _load_pickle(en_load)
ge_pers = _load_pickle(ge_load)
en_data = _load_pickle(en_data_load)
ge_data = _load_pickle(ge_data_load)

In [5]:
# Visualizing Data: print every entry of the German Perspective dictionary
# as "<key>: <value>", one entry per line.
for attribute, scores in ge_pers.items():
  print(str(attribute) + ": " + str(scores))

TOXICITY_WHOLE: [0.83040893, 0.3134107, 0.8906426, 0.7994978, 0.98874295, 0.98621744, 0.6427481, 0.12463747, 0.94322485, 0.23864032, 0.94322485, 0.575201, 0.03461841, 0.03461841, 0.9077955, 0.9077955, 0.12580645, 0.49147624, 0.03461841, 0.94724786, 0.7649919, 0.36202574, 0.27212298, 0.5065009, 0.2103691, 0.81438124, 0.8668033, 0.7385965, 0.94322485, 0.8668033, 0.21210478, 0.07836416, 0.8412408, 0.3122088, 0.94322485, 0.80903095, 0.05104512, 0.49147624, 0.84580207, 0.25878873, 0.9077955, 0.30460453, 0.9077955, 0.94322485, 0.082795374, 0.8398533, 0.9887216, 0.10356257, 0.3131685, 0.7649919, 0.9880204, 0.1334102, 0.37827945, 0.98621744, 0.06896552, 0.23618706, 0.07767565, 0.06929883, 0.03461841, 0.94322485, 0.81438124, 0.12542988, 0.073199525, 0.23355038, 0.06896552, 0.03461841, 0.9679043, 0.40260583, 0.25452536, 0.6922321, 0.94322485, 0.056031294, 0.38709155, 0.3120804, 0.03461841, 0.12580645, 0.13888781, 0.81438124, 0.81438124, 0.12580645, 0.7910673, 0.3132587, 0.06896552, 0.6349963, 0.

In [6]:
# Testing Load Correctness: display the field names of the processed German
# dataset to confirm the pickle was read and has the expected structure.
ge_data.keys()

dict_keys(['tweet_id', 'task_1', 'task_2', 'hasoc_id', 'full_tweet', 'tweet_raw_text', 'hashtags', 'smiley', 'emoji', 'url', 'mentions', 'numerals', 'reserved_word', 'emotext', 'segmented_hash'])

In [7]:
# Choose which language's data the experiment runs on.
# Set to "EN" to run for English, "GE" to run for German.
language = "GE"

# Pick the labelled dataset and the matching Perspective scores once, so the
# matrix-building code below is shared by both languages.
if language == "EN":
    labelled_data, perspective_scores = en_data, en_pers
elif language == "GE":
    labelled_data, perspective_scores = ge_data, ge_pers
else:
    # Previously an unknown value silently left x_matrix / y undefined.
    raise ValueError('language must be "EN" or "GE", got ' + repr(language))

# Target labels: the 'task_1' field of the processed dataset.
data_size = len(labelled_data['task_1'])
y = [labelled_data['task_1'][idx] for idx in range(data_size)]

# One row per Perspective attribute; the two *_SPAN entries are skipped
# (presumably span metadata rather than scores -- TODO confirm).
x_matrix = [scores for key, scores in perspective_scores.items()
            if key not in ("RAW_SPAN", "WHOLE_SPAN")]

# Use plain ndarrays instead of np.matrix: NumPy discourages np.matrix and
# recent scikit-learn releases are understood to reject it (TODO confirm
# against the installed version). The shapes are unchanged:
# x is (samples, attributes), y is (samples, 1).
x_matrix = np.asarray(x_matrix).T
y = np.asarray(y).reshape(-1, 1)

if x_matrix.shape[0] != y.shape[0]:
    raise ValueError("Feature rows (" + str(x_matrix.shape[0]) +
                     ") do not match label rows (" + str(y.shape[0]) + ")")

# View Dimensions
print("x Dimensions: " + str(np.shape(x_matrix)))
print("y Dimensions: " + str(np.shape(y)))

x Dimensions: (2373, 12)
y Dimensions: (2373, 1)


In [8]:
# Hold out 20% of the samples for testing; the split is seeded for repeatability.
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
    x_matrix, y, test_size = 0.2, random_state = 42)

# Flatten the label columns to 1-D vectors.
train_Y, test_Y = np.ravel(train_Y), np.ravel(test_Y)

In [9]:
# Viewing Data Shapes: report the dimensions of each part of the split
for label, split_part in (("Train X", train_X),
                          ("Train Y", train_Y),
                          ("Test X", test_X),
                          ("Test Y", test_Y)):
    print(label + ": " + str(np.shape(split_part)))

Train X: (1898, 12)
Train Y: (1898,)
Test X: (475, 12)
Test Y: (475,)


In [10]:
# Multi-layer perceptron (scikit-learn MLPClassifier), capped at 1e5 iterations
def train(X, y, active = 'relu', sol = 'adam', learn = 'adaptive'):
    """Fit an MLPClassifier on (X, y) and return the fitted model.

    Parameters
    ----------
    X : feature matrix passed straight to `MLPClassifier.fit`.
    y : target labels passed straight to `MLPClassifier.fit`.
    active : activation function name (`activation` of MLPClassifier).
    sol : weight optimiser name (`solver` of MLPClassifier).
    learn : learning-rate schedule name (`learning_rate` of MLPClassifier).

    Returns
    -------
    The fitted MLPClassifier instance.
    """
    # Twelve hidden layers, widening to 512 units and narrowing to a single unit.
    layer_widths = (32, 64, 128, 256, 512, 256, 64, 32, 16, 8, 4, 1)

    settings = dict(
        hidden_layer_sizes = layer_widths,
        activation = active,
        solver = sol,
        # Per the scikit-learn docs the schedule and momentum are only used
        # by the 'sgd' solver; other solvers ignore them.
        learning_rate = learn,
        momentum = 0.9,
        learning_rate_init = 5 * 1e-2,
        alpha = 0,
        max_iter = int(1e5),
        # Fixed seed so weight initialisation is repeatable between runs.
        random_state = 2020,
    )

    classifier = MLPClassifier(**settings)
    classifier.fit(X, y)
    return classifier

def get_train_test_res(reg, eval_X = None, eval_Y = None):
    """Print the accuracy of a fitted model on a held-out evaluation set.

    Parameters
    ----------
    reg : fitted model exposing `predict(X)`.
    eval_X : features to evaluate on; defaults to the notebook-level `test_X`.
    eval_Y : true labels for `eval_X`; defaults to the notebook-level `test_Y`.

    Prints "Accuracy = <percentage>%" followed by a blank line. Returns None.

    Raises
    ------
    ValueError : if the evaluation set is empty or the number of predictions
        differs from the number of labels.
    """
    # Fall back to the notebook's held-out split when no data is supplied.
    if eval_X is None:
        eval_X = test_X
    if eval_Y is None:
        eval_Y = test_Y

    # Flatten both sides so a column-vector of labels compares element-wise
    # instead of broadcasting against the prediction vector.
    predictions = np.ravel(reg.predict(eval_X))
    expected = np.ravel(eval_Y)

    total = predictions.shape[0]
    if total == 0:
        raise ValueError("Cannot compute accuracy on an empty evaluation set")
    if expected.shape[0] != total:
        raise ValueError("Got " + str(total) + " predictions for " +
                         str(expected.shape[0]) + " labels")

    # Vectorised count of correct predictions; plain-int arithmetic keeps the
    # printed percentage formatted exactly as before.
    matches = int(np.count_nonzero(predictions == expected))
    print("Accuracy = " + str(matches / total * 100) + "%")
    print("")
    # Uncomment to View Full Reports
    # target_names = ['HOF', 'NOT']
    # print(classification_report(expected, predictions, target_names = target_names))

In [11]:
# Hyper-parameter grid for the sweep. Other activations that can be added to
# the list: 'relu', 'identity', 'logistic'.
# On German, identity activation gives overflow problems
activations = ['tanh']
solvers = ['sgd', 'adam', 'lbfgs']
learning_rates = ['constant', 'invscaling', 'adaptive']

# Running index printed in front of every trained model's result.
model_number = 1

for active in activations:
    print("### {} Activation".format(active))
    for sol in solvers:
        # The tanh + lbfgs combination is deliberately left out of the sweep.
        if (active, sol) == ('tanh', 'lbfgs'):
            continue
        print("#### {} Solver".format(sol))
        for learn in learning_rates:
            model = train(train_X, train_Y, active, sol, learn)
            print("{}.  Learn Rate = {} : ".format(model_number, learn))
            get_train_test_res(model)
            model_number += 1

### tanh Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 81.89473684210526%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 84.42105263157896%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%



### Random Seed Used: 42
#### For the train/test split in all experiments (the MLP itself uses `random_state = 2020`)

# Results for English

## identity Activation
### lbfgs solver
Learn Rate = constant : Accuracy = 90.02695417789758%

Learn Rate = invscaling : Accuracy = 90.02695417789758%

Learn Rate = adaptive : Accuracy = 90.02695417789758%

### sgd Solver
Learn Rate = constant : Accuracy = 89.75741239892183%

Learn Rate = invscaling : Accuracy = 88.27493261455525%

Learn Rate = adaptive : Accuracy = 89.35309973045821%

### adam Solver
Learn Rate = constant : Accuracy = 82.61455525606469%

Learn Rate = invscaling : Accuracy = 82.61455525606469%

Learn Rate = adaptive : Accuracy = 82.61455525606469%

## logistic Activation
### lbfgs Solver
Learn Rate = constant : Accuracy = 47.03504043126684%

Learn Rate = invscaling : Accuracy = 47.03504043126684%

Learn Rate = adaptive : Accuracy = 47.03504043126684%

### sgd Solver
Learn Rate = constant : Accuracy = 47.03504043126684%

Learn Rate = invscaling : Accuracy = 52.96495956873315%

Learn Rate = adaptive : Accuracy = 47.03504043126684%

### adam Solver
Learn Rate = constant : Accuracy = 47.03504043126684%

Learn Rate = invscaling : Accuracy = 47.03504043126684%

Learn Rate = adaptive : Accuracy = 47.03504043126684%

## relu Activation
### sgd Solver
Learn Rate = constant : Accuracy = 89.35309973045821%

Learn Rate = invscaling : Accuracy = 84.50134770889488%

Learn Rate = adaptive : Accuracy = 89.08355795148249%

### adam Solver
Learn Rate = constant : Accuracy = 47.03504043126684%

Learn Rate = invscaling : Accuracy = 47.03504043126684%

Learn Rate = adaptive : Accuracy = 47.03504043126684%

### lbfgs solver
Learn Rate = constant : Accuracy = 88.94878706199461%

Learn Rate = invscaling : Accuracy = 88.94878706199461%

Learn Rate = adaptive : Accuracy = 88.94878706199461%

## tanh Activation
### sgd Solver
Learn Rate = constant : Accuracy = 89.8921832884097%

Learn Rate = invscaling : Accuracy = 89.4878706199461%

Learn Rate = adaptive : Accuracy = 87.87061994609164%

### adam Solver
Learn Rate = constant : Accuracy = 47.03504043126684%

Learn Rate = invscaling : Accuracy = 47.03504043126684%

Learn Rate = adaptive : Accuracy = 47.03504043126684%

# Results for German

### tanh Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 81.89473684210526%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 84.42105263157896%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

### logistic Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### lbfgs Solver
7.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

8.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

9.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

### relu Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### lbfgs Solver
7.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

8.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

9.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

In [12]:
# ^_^ Thank You