In [1]:
import pickle
import time

import pandas as pd
import numpy as np

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def Create_NN_Model(No_Features=13, No_Hidden_Layers=2, No_Hidden_Neurons=7, 
                    Hidden_Activation ="relu", No_OP_Neurons=1, 
                    Output_Activation="sigmoid", Kernel_Initializer="random_uniform",
                    Optimizer="rmsprop", Loss='binary_crossentropy', Metrics =['accuracy']):
  
    classifier = Sequential()

  ## Input Layer
    classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                       kernel_initializer=Kernel_Initializer, input_dim=No_Features))
  
  ## Hidden layers
    for i in range(No_Hidden_Layers):
        classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                         kernel_initializer=Kernel_Initializer))
    
  ## Output Layer
    classifier.add(Dense(No_OP_Neurons, activation=Output_Activation, 
                       kernel_initializer=Kernel_Initializer))
  
    classifier.compile(optimizer =Optimizer, loss=Loss, metrics = Metrics)

    return classifier

In [3]:
def Train_NN(NN_classifier, train_data, feature_list=[], Label_Col="Label", Batch_Size=64, Epochs=20):

    train_data.dropna()
    train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
    train_data[Label_Col] = pd.to_numeric(train_data[Label_Col], errors='coerce')
    train_data = train_data.dropna(subset=[Label_Col])
  
    train_features = train_data[feature_list]    
    train_labels = train_data[Label_Col]
    train_labels = train_labels.astype('int')

    NN_classifier.fit(train_features,train_labels, batch_size=Batch_Size, epochs=Epochs)

    eval_model=NN_classifier.evaluate(train_features, train_labels)
    print("Loss: ", eval_model[0])
    print("Accuracy of the model: ", eval_model[1])

    return NN_classifier

In [4]:
## Store trained model in a file to reuse in other codes without training again on same data

def Store_Trained_NN(NN_obj, Filepath):
  
    with open(Filepath, "wb") as file:
        pickle.dump(NN_obj, file)

In [5]:
## Load stored trained model and returns random forest model object

def Load_Trained_NN(Filepath):
  
    with open(Filepath, "rb") as file:
        NN_obj = pickle.load(file)

    return NN_obj

In [6]:
def Evaluate_NN(test_data, NN_Model_FilePath, feature_list=[], Label_Col="label", threshold=0.5):
  
    test_data.dropna()
    test_data = pd.DataFrame(np.nan_to_num(np.array(test_data)),  columns = test_data.columns)
    test_data[Label_Col] = pd.to_numeric(test_data[Label_Col], errors='coerce')
    test_data = test_data.dropna(subset=[Label_Col])

    test_features = test_data[feature_list]
    test_labels = test_data[Label_Col]
    test_labels = test_labels.astype('int')

    NN_obj = Load_Trained_NN(NN_Model_FilePath) 
    predictions = NN_obj.predict(test_features)
    predictions_list = [int(p[0]) for p in predictions]
      
    true_subjective = 0
    true_objective = 0
    false_subjective = 0
    false_objective = 0
    
    for i in range(len(predictions_list)):
        if predictions_list[i] >= threshold:
            predictions_list[i] = 1
            if test_labels[i] == 1:
                true_subjective += 1
            else:
                false_subjective += 1
        else:
            predictions_list[i] = 0
            if test_labels[i] == 0:
                true_objective += 1
            else:
                false_objective += 1
      
    errors = abs(predictions_list - test_labels)

  # Calculate mean absolute error (MAE)
    MAE = round(np.mean(errors), 2)
  
  ## Confusion Matrix and Classification Report
    Confusion_Matrix = confusion_matrix(test_labels,predictions_list)
    Report = classification_report(test_labels,predictions_list)
    
    print("True Subjective : ", true_subjective)
    print("True Objective : ", true_objective)
    print("False Subjective : ", false_subjective)
    print("False Objective : ", false_objective)
    
    print("Accuracy: ", (true_subjective+true_objective)/(true_subjective+true_objective+false_subjective+false_objective)*100)
  
    return MAE, Confusion_Matrix, Report

In [8]:
Column_List = ["word"]
Label_Col = "Label"
Vector_Size = 26
Feature_Cols = ["POS", "POS_Prev", "POS_Next", "Sent_Position", 
                  "Hedge", "Hedge_Context", "Factive", "Factive_Context", "Assertive", "Assertive_Context",
                  "Implicative", "Implicative_Context", "Report", "Report_Context", 
                   "Entailment", "Entailment_Context", "StrongSub", "StrongSub_Context", 
                   "WeakSub", "WeakSub_Context", "Polarity", "Positive", "Positive_Context", 
                   "Negative", "Negative_Context", "Bias_Lexicon"]
Vector_Size = 13
FeatureList = ["POS", "Hedge", "Factive", "Assertive", "Implicative", "Report", "Entailment", "StrongSub", 
               "WeakSub", "Polarity", "Positive", "Negative", "Bias_Lexicon"]
Column_List.extend(Feature_Cols)
Column_List.append(Label_Col)

Folder_Path = "/Users/pranjali/Downloads/Wiki_BiasDetection/Data/Task2_FinalData/"

Train_Features_FilePath = Folder_Path + "Wiki_LiguisticFeatures_Train.csv"
Test_Features_FilePath = Folder_Path + "Wiki_LiguisticFeatures_Test.csv"
NN_Model_FilePath = "/Users/pranjali/Downloads/Wiki_BiasDetection/Saved_Models/Task2_NN_Trained_Model.pkl"

train_data = pd.read_csv(Train_Features_FilePath, usecols=Column_List)
test_data = pd.read_csv(Test_Features_FilePath, usecols=Column_List)

In [9]:
print("train_data shape: ", train_data.shape)
print("test_data shape: ", test_data.shape)

train_data shape:  (510467, 28)
test_data shape:  (199992, 28)


In [10]:
## Training Phase
start_time = time.time()

NN_Classifier = Create_NN_Model()
NN_obj = Train_NN(NN_Classifier, train_data, FeatureList, Label_Col)
Store_Trained_NN(NN_obj, NN_Model_FilePath)

end_time = time.time()
print("Time required for training: ", end_time - start_time )

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss:  0.35049273095981187
Accuracy of the model:  0.8809560537338257
Time required for training:  194.04473114013672


In [11]:
start_time = time.time()

MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, FeatureList, Label_Col, 0.5)

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

end_time = time.time()
print("Time required for testing: ", end_time - start_time )

True Subjective :  0
True Objective :  169096
False Subjective :  0
False Objective :  30896
Accuracy:  84.5513820552822
MEAN ABSOLUTE ERROR:  0.15


[[169096      0]
 [ 30896      0]]


              precision    recall  f1-score   support

           0       0.85      1.00      0.92    169096
           1       0.00      0.00      0.00     30896

   micro avg       0.85      0.85      0.85    199992
   macro avg       0.42      0.50      0.46    199992
weighted avg       0.71      0.85      0.77    199992

Accuracy:  84.5513820552822
Time required for testing:  4.5814049243927


  'precision', 'predicted', average, warn_for)


In [12]:
## Cross Validation

train_data.dropna()
train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
train_data[Label_Col] = pd.to_numeric(train_data[Label_Col], errors='coerce')
train_data = train_data.dropna(subset=[Label_Col])

train_features = train_data[FeatureList]    
train_labels = train_data[Label_Col]
train_labels = train_labels.astype('int')

# create the sklearn model for the network
model_CV = KerasClassifier(build_fn=Create_NN_Model, verbose=1)

# we choose the initializers that came at the top in our previous cross-validation!!
# kernel_initializer = ['random_uniform']
# No_Hidden_Layers = [1, 2]
# No_Hidden_Neurons= [10, 30, 50]
# optimizer = ['adam', 'rmsprop']

# batches = [64*x for x in range(1, 3)]
# epochs = [50, 100, 150]

kernel_initializer = ['ones','random_uniform']
No_Hidden_Layers = [1, 2]
No_Hidden_Neurons= [5, 7]
optimizer = ['adam', 'rmsprop']

batches = [64*x for x in range(1, 3)]
epochs = [20, 50]

## We can also try different learning rates for optimizers. 
## For this create different objects of optimizers with different learning rates and pass list of objects

# grid search for initializer, batch size and number of epochs
param_grid = dict(epochs=epochs, batch_size=batches, Kernel_Initializer=kernel_initializer, 
                 No_Hidden_Layers=No_Hidden_Layers, Optimizer=optimizer)
grid = GridSearchCV(estimator=model_CV, param_grid=param_grid, cv=3, n_jobs=4, refit=True, verbose=2)
grid_result = grid.fit(train_features, train_labels)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 50.3min
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed: 142.3min finished


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# print results of cross validation

print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for 0.8809560707131984 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 20}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 20}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 50}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 20}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 50}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'rmsprop', 'batch_size': 64, 'epochs': 20}
 mean=0.1191, std=0.001195 using {'Kernel_Initializer': 'ones', 'No_Hidden_Layers': 1, 'Optimizer': 'rmsprop', 'batch_size': 64, 'epoch

In [15]:
# ## Train again using best parameter values identified by cross validation and store the trained model

# ## Training Phase
# NN_Classifier = Create_NN_Model(No_Hidden_Layers=grid_result.best_params_.No_Hidden_Layers, No_Hidden_Neurons=grid_result.best_params_.No_Hidden_Neurons, Kernel_Initializer=grid_result.best_params_.Kernel_Initializer, Optimizer=grid_result.best_params_.Optimizer)
# NN_obj = Train_NN(NN_Classifier, train_data, Embedding_Cols, Label_Col, Batch_Size=grid_result.best_params_.batch_size, Epochs=grid_result.best_params_.epochs)
# Store_Trained_NN(NN_obj, NN_Model_FilePath)

## Store models trained with best parameters
Store_Trained_NN(grid, NN_Model_FilePath)

## Evaluation of above model on validation data
MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, FeatureList, Label_Col, 0.5)

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)


True Subjective :  0
True Objective :  169096
False Subjective :  0
False Objective :  30896
Accuracy:  84.5513820552822
MEAN ABSOLUTE ERROR:  0.15


[[169096      0]
 [ 30896      0]]


              precision    recall  f1-score   support

           0       0.85      1.00      0.92    169096
           1       0.00      0.00      0.00     30896

   micro avg       0.85      0.85      0.85    199992
   macro avg       0.42      0.50      0.46    199992
weighted avg       0.71      0.85      0.77    199992

Accuracy:  84.5513820552822
