In [1]:
import pickle
import time

import pandas as pd
import numpy as np

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def Create_NN_Model(No_Features=100, No_Hidden_Layers=1, No_Hidden_Neurons=30, 
                    Hidden_Activation ="relu", No_OP_Neurons=1, 
                    Output_Activation="sigmoid", Kernel_Initializer="ones",
                    Optimizer="rmsprop", Loss='binary_crossentropy', Metrics =['accuracy']):
  
    classifier = Sequential()

  ## Input Layer
    classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                       kernel_initializer=Kernel_Initializer, input_dim=No_Features))
  
  ## Hidden layers
    for i in range(No_Hidden_Layers):
        classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                         kernel_initializer=Kernel_Initializer))
    
  ## Output Layer
    classifier.add(Dense(No_OP_Neurons, activation=Output_Activation, 
                       kernel_initializer=Kernel_Initializer))
  
    classifier.compile(optimizer =Optimizer, loss=Loss, metrics = Metrics)

    return classifier

In [3]:
def Train_NN(NN_classifier, train_data, feature_list=[], Label_Col="Label", Batch_Size=64, Epochs=10):

    train_data.dropna()
    train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
    train_data[Label_Col] = pd.to_numeric(train_data[Label_Col], errors='coerce')
    train_data = train_data.dropna(subset=[Label_Col])
  
    train_features = train_data[feature_list]    
    train_labels = train_data[Label_Col]
    train_labels = train_labels.astype('int')

    NN_classifier.fit(train_features,train_labels, batch_size=Batch_Size, epochs=Epochs)

    eval_model=NN_classifier.evaluate(train_features, train_labels)
    print("Loss: ", eval_model[0])
    print("Accuracy of the model: ", eval_model[1])

    return NN_classifier

In [4]:
## Store trained model in a file to reuse in other codes without training again on same data

def Store_Trained_NN(NN_obj, Filepath):
  
    with open(Filepath, "wb") as file:
        pickle.dump(NN_obj, file)

In [5]:
## Load stored trained model and returns random forest model object

def Load_Trained_NN(Filepath):
  
    with open(Filepath, "rb") as file:
        NN_obj = pickle.load(file)

    return NN_obj

In [6]:
def Evaluate_NN(test_data, NN_Model_FilePath, feature_list=[], Label_Col="label", threshold=0.5):
  
    test_data.dropna()
    test_data = pd.DataFrame(np.nan_to_num(np.array(test_data)),  columns = test_data.columns)
    test_data[Label_Col] = pd.to_numeric(test_data[Label_Col], errors='coerce')
    test_data = test_data.dropna(subset=[Label_Col])

    test_features = test_data[feature_list]
    test_labels = test_data[Label_Col]
    test_labels = test_labels.astype('int')

    NN_obj = Load_Trained_NN(NN_Model_FilePath) 
    predictions = NN_obj.predict(test_features)
    predictions_list = [int(p[0]) for p in predictions]
      
    true_subjective = 0
    true_objective = 0
    false_subjective = 0
    false_objective = 0
    
    for i in range(len(predictions_list)):
        if predictions_list[i] >= threshold:
            predictions_list[i] = 1
            if test_labels[i] == 1:
                true_subjective += 1
            else:
                false_subjective += 1
        else:
            predictions_list[i] = 0
            if test_labels[i] == 0:
                true_objective += 1
            else:
                false_objective += 1
      
    errors = abs(predictions_list - test_labels)

  # Calculate mean absolute error (MAE)
    MAE = round(np.mean(errors), 2)
  
  ## Confusion Matrix and Classification Report
    Confusion_Matrix = confusion_matrix(test_labels,predictions_list)
    Report = classification_report(test_labels,predictions_list)
    
    print("True Subjective : ", true_subjective)
    print("True Objective : ", true_objective)
    print("False Subjective : ", false_subjective)
    print("False Objective : ", false_objective)
    
    print("Accuracy: ", (true_subjective+true_objective)/(true_subjective+true_objective+false_subjective+false_objective)*100)
  
    return MAE, Confusion_Matrix, Report

In [8]:
Column_List = ["embedding"]
Label_Col = "label"
Vector_Size = 100
Embedding_Cols = ["emb"+str(i) for i in range(Vector_Size)]
# Data_Columns = Embedding_Cols
# Data_Columns = Data_Columns.append(Label_Col)
Column_List.append(Label_Col)

Folder_Path = "/Users/pranjali/Downloads/Wiki_BiasDetection"

Train_Embedding_FilePath = Folder_Path + "/Data/Task1_FinalData/Embeddings/train_emb_full.csv"
Test_Embedding_FilePath = Folder_Path + "/Data/Task1_FinalData/Embeddings/test_emb_full.csv"
NN_Model_FilePath =  Folder_Path + "/Saved_Models/NN/NN_Task1_Trained_Model.pkl"

train_data_raw = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
test_data_raw = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)

In [9]:
def Get_Embeddings(data):
    
    Col_List = ["emb"+str(i) for i in range(100)]
    Col_List.append("label")
    
    Embeddings = []
    
    for i in range(data.shape[0]):
        row = data.iloc[i]
        embedding_str = row["embedding"][1:-1]
        embedding_list = embedding_str.split(',')
        embedding = [float(s) for s in embedding_list]
        embedding.append(int(row["label"]))
        Embeddings.append(embedding)
        
    return pd.DataFrame(Embeddings, columns=Col_List)

train_data = Get_Embeddings(train_data_raw)
test_data = Get_Embeddings(test_data_raw)

In [10]:
print("train_data shape: ", train_data.shape)
print("test_data shape: ", test_data.shape)

train_data shape:  (234311, 101)
test_data shape:  (58579, 101)


In [11]:
train_data = train_data.sample(frac=1)

In [12]:
## Training Phase
start_time = time.time()

NN_Classifier = Create_NN_Model()
NN_obj = Train_NN(NN_Classifier, train_data, Embedding_Cols, Label_Col)
Store_Trained_NN(NN_obj, NN_Model_FilePath)

end_time = time.time()
print("Time required for training: ", end_time - start_time )

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.6539763518165213
Accuracy of the model:  0.6420142650604248
Time required for training:  48.643601179122925


In [13]:
start_time = time.time()

MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, Embedding_Cols, Label_Col, 0.5)

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

end_time = time.time()
print("Time required for testing: ", end_time - start_time )

True Subjective :  41
True Objective :  20967
False Subjective :  4
False Objective :  37567
Accuracy:  35.86268116560542
MEAN ABSOLUTE ERROR:  0.64


[[20967     4]
 [37567    41]]


              precision    recall  f1-score   support

           0       0.36      1.00      0.53     20971
           1       0.91      0.00      0.00     37608

   micro avg       0.36      0.36      0.36     58579
   macro avg       0.63      0.50      0.26     58579
weighted avg       0.71      0.36      0.19     58579

Accuracy:  35.86268116560542
Time required for testing:  1.695504903793335


In [14]:
## Cross Validation

train_data.dropna()
train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
train_data[Label_Col] = pd.to_numeric(train_data[Label_Col], errors='coerce')
train_data = train_data.dropna(subset=[Label_Col])

train_features = train_data[Embedding_Cols]    
train_labels = train_data[Label_Col]
train_labels = train_labels.astype('int')

# create the sklearn model for the network
model_CV = KerasClassifier(build_fn=Create_NN_Model, verbose=1)

# we choose the initializers that came at the top in our previous cross-validation!!
# kernel_initializer = ['random_uniform']
# No_Hidden_Layers = [1, 2]
# No_Hidden_Neurons= [10, 30, 50]
# optimizer = ['adam', 'rmsprop']

# batches = [64*x for x in range(1, 3)]
# epochs = [50, 100, 150]

kernel_initializer = ['random_uniform']
No_Hidden_Layers = [1, 2]
No_Hidden_Neurons= [10, 30, 50]
optimizer = ['adam', 'rmsprop']

batches = [64*x for x in range(1, 3)]
epochs = [20, 50]

## We can also try different learning rates for optimizers. 
## For this create different objects of optimizers with different learning rates and pass list of objects

# grid search for initializer, batch size and number of epochs
param_grid = dict(epochs=epochs, batch_size=batches, Kernel_Initializer=kernel_initializer, 
                 No_Hidden_Layers=No_Hidden_Layers, Optimizer=optimizer)
grid = GridSearchCV(estimator=model_CV, param_grid=param_grid, cv=3, n_jobs=4, refit=True, verbose=2)
grid_result = grid.fit(train_features, train_labels)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 24.1min
[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed: 35.0min finished


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# print results of cross validation

print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for 0.6920332449449401 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 50}
 mean=0.6845, std=0.003433 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 20}
 mean=0.692, std=0.00111 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 50}
 mean=0.6845, std=0.002672 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 20}
 mean=0.6899, std=0.0005892 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 50}
 mean=0.6847, std=0.003373 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'rmsprop', 'batch_size': 64, 'epochs': 20}
 mean=0.6917, std=0.002039 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_L

In [16]:
# ## Train again using best parameter values identified by cross validation and store the trained model

# ## Training Phase
# NN_Classifier = Create_NN_Model(No_Hidden_Layers=grid_result.best_params_.No_Hidden_Layers, No_Hidden_Neurons=grid_result.best_params_.No_Hidden_Neurons, Kernel_Initializer=grid_result.best_params_.Kernel_Initializer, Optimizer=grid_result.best_params_.Optimizer)
# NN_obj = Train_NN(NN_Classifier, train_data, Embedding_Cols, Label_Col, Batch_Size=grid_result.best_params_.batch_size, Epochs=grid_result.best_params_.epochs)
# Store_Trained_NN(NN_obj, NN_Model_FilePath)

## Store models trained with best parameters
Store_Trained_NN(grid, NN_Model_FilePath)

## Evaluation of above model on validation data
MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, Embedding_Cols, Label_Col, 0.5)

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)


True Subjective :  33240
True Objective :  7166
False Subjective :  13805
False Objective :  4368
Accuracy:  68.97693712763959
MEAN ABSOLUTE ERROR:  0.31


[[ 7166 13805]
 [ 4368 33240]]


              precision    recall  f1-score   support

           0       0.62      0.34      0.44     20971
           1       0.71      0.88      0.79     37608

   micro avg       0.69      0.69      0.69     58579
   macro avg       0.66      0.61      0.61     58579
weighted avg       0.68      0.69      0.66     58579

Accuracy:  68.97693712763959
