<a href="https://colab.research.google.com/github/sayarghoshroy/Hate-Speech-Detection/blob/master/perspective_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Testing Effectiveness of Perspective API features

In [2]:
import pickle
import numpy as np
import random
from sklearn.metrics import classification_report
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Loading datasets

en_load = '/content/drive/My Drive/HASOC_raw_data/perspective_train/en.pickle'
en_pers = {}

ge_load = '/content/drive/My Drive/HASOC_raw_data/perspective_train/ge.pickle'
ge_pers = {}

en_data_load = '/content/drive/My Drive/2020_processed_data/en.pickle'
en_data = {}

ge_data_load = '/content/drive/My Drive/2020_processed_data/ge.pickle'
ge_data = {}

with open(en_load, 'rb') as f:
  en_pers = pickle.load(f)

with open(ge_load, 'rb') as f:
  ge_pers = pickle.load(f)

with open(en_data_load, 'rb') as f:
  en_data = pickle.load(f)

with open(ge_data_load, 'rb') as f:
  ge_data = pickle.load(f)

In [5]:
# Visualizing Data
for key in ge_pers.keys():
  print(str(key))
  # Uncomment to visualize actual values
  # print(str(key) + ": " + str(ge_pers[key]))

TOXICITY_WHOLE
TOXICITY_RAW
SEVERE_TOXICITY_WHOLE
SEVERE_TOXICITY_RAW
IDENTITY_ATTACK_WHOLE
IDENTITY_ATTACK_RAW
INSULT_WHOLE
INSULT_RAW
PROFANITY_WHOLE
PROFANITY_RAW
THREAT_WHOLE
THREAT_RAW


In [6]:
# Testing Load Correctness
ge_data.keys()

dict_keys(['tweet_id', 'task_1', 'task_2', 'hasoc_id', 'full_tweet', 'tweet_raw_text', 'hashtags', 'smiley', 'emoji', 'url', 'mentions', 'numerals', 'reserved_word', 'emotext', 'segmented_hash'])

In [7]:
x = []
y_a = []
y_b = []

none_cnt = 0
prfn_cnt = 0
hate_cnt = 0
offn_cnt = 0

not_cnt = 0
hof_cnt = 0

# Uncomment to Run for English
language = "EN"

# Uncomment to Run for German
# language = "GE"

if language == "EN":
    data_size = len(en_data['task_1'])

    for idx in range(data_size):
      y_a.append(en_data['task_1'][idx])
      y_b.append(en_data['task_2'][idx])

    x_matrix = []
    for key in en_pers.keys():
      if key == "RAW_SPAN" or key == "WHOLE_SPAN":
        continue
      x_matrix.append(en_pers[key])

    x_matrix = np.asmatrix(x_matrix).T
    y_a = np.asmatrix(y_a).T
    y_b = np.asmatrix(y_b).T
    # View Dimensions
    print("x Dimensions: " + str(np.shape(x_matrix)))
    print("y Dimensions: " + str(np.shape(y_a)))

elif language == "GE":
    data_size = len(ge_data['task_1'])

    for idx in range(data_size):
      checker = ge_data['task_1'][idx]
      if checker == 'NOT':
        not_cnt += 1
      if checker == 'HOF':
        hof_cnt += 1
      y_a.append(ge_data['task_1'][idx])

      checker = ge_data['task_2'][idx]
      if checker == 'NONE':
        none_cnt += 1
      if checker == 'OFFN':
        offn_cnt += 1
      if checker == 'HATE':
        hate_cnt += 1
      if checker == 'PRFN':
        prfn_cnt += 1
      y_b.append(ge_data['task_2'][idx])

    x_matrix = []
    for key in ge_pers.keys():
      if key == "RAW_SPAN" or key == "WHOLE_SPAN":
        continue
      x_matrix.append(ge_pers[key])

    x_matrix = np.asmatrix(x_matrix).T
    y_a = np.asmatrix(y_a).T
    y_b = np.asmatrix(y_b).T
    # View Dimensions
    print("x Dimensions: " + str(np.shape(x_matrix)))
    print("y Dimensions: " + str(np.shape(y_a)))
    print("y Dimensions: " + str(np.shape(y_b)))

x Dimensions: (3708, 18)
y Dimensions: (3708, 1)


In [8]:
print("not_cnt: " + str(not_cnt))
print("hof_cnt: " + str(hof_cnt))

print("none_cnt: " + str(none_cnt))
print("hate_cnt: " + str(hate_cnt))
print("prfn_cnt: " + str(prfn_cnt))
print("offn_cnt: " + str(offn_cnt))

not_cnt: 0
hof_cnt: 0
none_cnt: 0
hate_cnt: 0
prfn_cnt: 0
offn_cnt: 0


In [9]:
print(y_b.T)

[['PRFN' 'PRFN' 'NONE' ... 'NONE' 'PRFN' 'NONE']]


In [10]:
train_X, test_X, train_Y_a, test_Y_a, train_Y_b, test_Y_b = model_selection.train_test_split(x_matrix, y_a, y_b, random_state = 42, test_size = 0.2)
train_Y_a = np.ravel(train_Y_a)
test_Y_a = np.ravel(test_Y_a)
train_Y_b = np.ravel(train_Y_b)
test_Y_b = np.ravel(test_Y_b)

In [11]:
# Viewing Data Shapes
print("Train X: " + str(np.shape(train_X)))
print("Train Y: " + str(np.shape(train_Y_a)))
print("Test X: " + str(np.shape(test_X)))
print("Test Y: " + str(np.shape(test_Y_a)))

Train X: (2966, 18)
Train Y: (2966,)
Test X: (742, 18)
Test Y: (742,)


In [12]:
train_mean = np.mean(train_X, axis = 0)
train_var = np.var(train_X, axis = 0)
# Data Normalization
# Note that we are only observing our training set
train_X = (train_X -  train_mean) / np.sqrt(train_var)
test_X = (test_X - train_mean) / np.sqrt(train_var)

In [16]:
def train(X, y_a, y_b, active = 'relu', sol = 'adam', learn = 'adaptive'):
    cl_a = MLPClassifier(alpha = 0,
                      # learning_rate_init = 1e-2 * 5,
                      learning_rate = learn,
                      hidden_layer_sizes = (32, 64, 128, 256, 512, 256, 64, 32, 16, 8, 4, 1),
                      random_state = 2020,
                      activation = active,
                      max_iter = int(1e4),
                      solver = sol,
                      batch_size = 800,
                      momentum = 0.9)
    
    cl_b = MLPClassifier(alpha = 0,
                      # learning_rate_init = 1e-2 * 5,
                      learning_rate = learn,
                      hidden_layer_sizes = (32, 64, 128, 256, 512, 256, 64, 32, 16),
                      random_state = 2020,
                      activation = active,
                      max_iter = int(1e4),
                      solver = sol,
                      batch_size = 800,
                      momentum = 0.9)
    cl_a.fit(X, y_a)
    cl_b.fit(X, y_b)
    return [cl_a, cl_b]

def get_test_res(cl_a, cl_b):
    pred_y_test_a = cl_a.predict(test_X)
    pred_y_test_b = cl_b.predict(test_X)
    target_names = ['NONE', 'PRFN', 'HATE', 'OFFN']

    total = 0
    matches = 0
    size = np.shape(pred_y_test_a)[0]
    for idx in range(size):
      if pred_y_test_a[idx] == test_Y_a[idx]:
          matches += 1
      if pred_y_test_a[idx] == 'NOT':
          pred_y_test_b[idx] = 'NONE'
      total += 1
    print("Accuracy = " + str(matches / total * 100) + "%")
    print("")
    # Uncomment to View Full Reports
    print(classification_report(test_Y_b, pred_y_test_b, target_names = target_names))
    print("")

In [17]:
# activations = ['identity', 'tanh', 'logistic', 'relu']
activations = ['tanh', 'identity']
solvers = ['adam', 'lbfgs', 'sgd']
# learning_rates = ['adaptive', 'constant', 'invscaling']
learning_rates = ['adaptive']

# Going ahead with the better performing schemes

# On German, identity activation + sgd solver gives overflow problems
for active in activations:
    model_number = 1
    print("### " + str(active) + " Activation")
    for sol in solvers:
        if active == 'tanh' and sol == 'lbfgs':
          continue
        if language == "GE" and active == "identity" and sol == "sgd":
          continue
        if language == "GE" and active == "relu" and sol == "lbfgs":
          continue
        print("#### " + str(sol) + " Solver")
        for learn in learning_rates:
            model_a, model_b = train(train_X, train_Y_a, train_Y_b, active, sol, learn)
            print(str(model_number) + ". ", end = "")
            get_test_res(model_a, model_b)
            model_number += 1

### tanh Activation
#### adam Solver
1. Accuracy = 89.08355795148249%

              precision    recall  f1-score   support

        NONE       0.14      0.06      0.08        36
        PRFN       0.82      0.92      0.86       349
        HATE       0.28      0.25      0.26        65
        OFFN       0.85      0.81      0.83       292

    accuracy                           0.77       742
   macro avg       0.52      0.51      0.51       742
weighted avg       0.75      0.77      0.76       742


#### sgd Solver
2. Accuracy = 89.75741239892183%

              precision    recall  f1-score   support

        NONE       0.00      0.00      0.00        36
        PRFN       0.87      0.93      0.90       349
        HATE       0.32      0.09      0.14        65
        OFFN       0.78      0.93      0.85       292

    accuracy                           0.81       742
   macro avg       0.49      0.49      0.47       742
weighted avg       0.74      0.81      0.77       742


### ide

# ***A Massive Results Dump...***

### Random Seed Used: 42
#### For all experiments

#Results for English: Un-normalized, Last HL 1

## identity Activation
### lbfgs solver
1. Learn Rate = constant : Accuracy = 90.02695417789758%

2. Learn Rate = invscaling : Accuracy = 90.02695417789758%

3. Learn Rate = adaptive : Accuracy = 90.02695417789758%

### sgd Solver
4. Learn Rate = constant : Accuracy = 89.75741239892183%

5. Learn Rate = invscaling : Accuracy = 88.27493261455525%

6. Learn Rate = adaptive : Accuracy = 89.35309973045821%

### adam Solver
7. Learn Rate = constant : Accuracy = 82.61455525606469%

8. Learn Rate = invscaling : Accuracy = 82.61455525606469%

9. Learn Rate = adaptive : Accuracy = 82.61455525606469%

## logistic Activation
### lbfgs Solver
1. Learn Rate = constant : Accuracy = 47.03504043126684%

2. Learn Rate = invscaling : Accuracy = 47.03504043126684%

3. Learn Rate = adaptive : Accuracy = 47.03504043126684%

### sgd Solver
4. Learn Rate = constant : Accuracy = 47.03504043126684%

5. Learn Rate = invscaling : Accuracy = 52.96495956873315%

6. Learn Rate = adaptive : Accuracy = 47.03504043126684%

### adam Solver
7. Learn Rate = constant : Accuracy = 47.03504043126684%

8. Learn Rate = invscaling : Accuracy = 47.03504043126684%

9. Learn Rate = adaptive : Accuracy = 47.03504043126684%

## relu Activation
### sgd Solver
1. Learn Rate = constant : Accuracy = 89.35309973045821%

2. Learn Rate = invscaling : Accuracy = 84.50134770889488%

3. Learn Rate = adaptive : Accuracy = 89.08355795148249%

### adam Solver
4. Learn Rate = constant : Accuracy = 47.03504043126684%

5. Learn Rate = invscaling : Accuracy = 47.03504043126684%

6. Learn Rate = adaptive : Accuracy = 47.03504043126684%

### lbfgs solver
7. Learn Rate = constant : Accuracy = 88.94878706199461%

8. Learn Rate = invscaling : Accuracy = 88.94878706199461%

9. Learn Rate = adaptive : Accuracy = 88.94878706199461%

## tanh Activation
### sgd Solver
1. Learn Rate = constant : Accuracy = 89.8921832884097%

2. Learn Rate = invscaling : Accuracy = 89.4878706199461%

3. Learn Rate = adaptive : Accuracy = 87.87061994609164%

### adam Solver
4. Learn Rate = constant : Accuracy = 47.03504043126684%

5. Learn Rate = invscaling : Accuracy = 47.03504043126684%

6. Learn Rate = adaptive : Accuracy = 47.03504043126684%

# Results for German: : Un-normalized, Last HL 1

### tanh Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 81.89473684210526%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 84.42105263157896%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

### logistic Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### lbfgs Solver
7.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

8.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

9.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

### relu Activation
#### sgd Solver
1.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

2.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

3.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### adam Solver
4.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

#### lbfgs Solver
7.  Learn Rate = constant : 
Accuracy = 70.94736842105263%

8.  Learn Rate = invscaling : 
Accuracy = 70.94736842105263%

9.  Learn Rate = adaptive : 
Accuracy = 70.94736842105263%

### identity Activation
#### adam Solver
1.  Learn Rate = constant : 
Accuracy = 84.0%

2.  Learn Rate = invscaling : 
Accuracy = 84.0%

3.  Learn Rate = adaptive : 
Accuracy = 84.0%

#### lbfgs Solver
4.  Learn Rate = constant : 
Accuracy = 82.94736842105263%

5.  Learn Rate = invscaling : 
Accuracy = 82.94736842105263%

6.  Learn Rate = adaptive : 
Accuracy = 82.94736842105263%

# German Post Normalization With Task 1 Info

### tanh Activation
#### adam Solver
1. Accuracy = 81.89473684210526%

              precision    recall  f1-score   support

        NONE       0.75      0.11      0.19        28
        PRFN       0.81      0.97      0.88       337
        HATE       0.44      0.22      0.29        32
        OFFN       0.73      0.49      0.58        78

    accuracy                           0.79       475
   macro avg       0.68      0.45      0.49       475
weighted avg       0.77      0.79      0.75       475


#### sgd Solver
2. Accuracy = 85.47368421052632%

              precision    recall  f1-score   support

        NONE       0.47      0.29      0.36        28
        PRFN       0.87      0.92      0.89       337
        HATE       0.57      0.12      0.21        32
        OFFN       0.62      0.77      0.69        78

    accuracy                           0.80       475
   macro avg       0.63      0.52      0.54       475
weighted avg       0.79      0.80      0.78       475


### identity Activation
#### adam Solver
1. Accuracy = 84.0%

              precision    recall  f1-score   support

        NONE       0.50      0.07      0.12        28
        PRFN       0.84      0.93      0.89       337
        HATE       1.00      0.03      0.06        32
        OFFN       0.62      0.78      0.69        78

    accuracy                           0.80       475
   macro avg       0.74      0.45      0.44       475
weighted avg       0.80      0.80      0.75       475


#### lbfgs Solver
2. Accuracy = 83.15789473684211%

              precision    recall  f1-score   support

        NONE       0.40      0.07      0.12        28
        PRFN       0.85      0.92      0.88       337
        HATE       0.75      0.09      0.17        32
        OFFN       0.60      0.77      0.67        78

    accuracy                           0.79       475
   macro avg       0.65      0.46      0.46       475
weighted avg       0.77      0.79      0.75       475


# English Post normalization with task 1 info

### tanh Activation
#### adam Solver
1. Accuracy = 89.08355795148249%

              precision    recall  f1-score   support

        NONE       0.14      0.06      0.08        36
        PRFN       0.82      0.92      0.86       349
        HATE       0.28      0.25      0.26        65
        OFFN       0.85      0.81      0.83       292
        
    accuracy                           0.77       742
   macro avg       0.52      0.51      0.51       742
weighted avg       0.75      0.77      0.76       742


#### sgd Solver
2. Accuracy = 89.75741239892183%

              precision    recall  f1-score   support

        NONE       0.00      0.00      0.00        36
        PRFN       0.87      0.93      0.90       349
        HATE       0.32      0.09      0.14        65
        OFFN       0.78      0.93      0.85       292

    accuracy                           0.81       742
   macro avg       0.49      0.49      0.47       742
weighted avg       0.74      0.81      0.77       742


### identity Activation
#### adam Solver
1. Accuracy = 89.35309973045821%

              precision    recall  f1-score   support

        NONE       0.29      0.06      0.09        36
        PRFN       0.88      0.93      0.90       349
        HATE       0.38      0.15      0.22        65
        OFFN       0.80      0.93      0.86       292

    accuracy                           0.82       742
   macro avg       0.59      0.52      0.52       742
weighted avg       0.77      0.82      0.79       742


#### lbfgs Solver
2. Accuracy = 89.8921832884097%

              precision    recall  f1-score   support

        NONE       0.33      0.03      0.05        36
        PRFN       0.87      0.92      0.90       349
        HATE       0.28      0.08      0.12        65
        OFFN       0.78      0.94      0.85       292

    accuracy                           0.81       742
   macro avg       0.57      0.49      0.48       742
weighted avg       0.76      0.81      0.77       742


#### sgd Solver
3. Accuracy = 89.4878706199461%

              precision    recall  f1-score   support

        NONE       0.67      0.06      0.10        36
        PRFN       0.87      0.92      0.90       349
        HATE       0.33      0.09      0.14        65
        OFFN       0.78      0.94      0.85       292

    accuracy                           0.81       742
   macro avg       0.66      0.50      0.50       742
weighted avg       0.78      0.81      0.77       742


In [15]:
# ^_^ Thank You