In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt



In [2]:
def CleanText(text):
    text = re.sub(r'''[\[|\]]''', "", text).split()
    text = np.array(text, dtype="float64")
    return text

In [3]:
df = pd.read_csv("all_data_with_identitiesEmbedded.csv")

df = df.loc[:, ["comment_text", "split", "na_gender", "na_race", "toxicity", "male", "female", "transgender", "black", "white", "asian", "latino"]]

df['toxicity'] = df['toxicity'].apply(lambda score: np.round(score))
df['comment_text'] = df['comment_text'].apply(lambda text: CleanText(text))

df['male'] = df['male'].apply(lambda x: round(x))
df['female'] = df['female'].apply(lambda x: round(x))
df['transgender'] = df['transgender'].apply(lambda x: round(x))
df['black'] = df['black'].apply(lambda x: round(x))
df['white'] = df['white'].apply(lambda x: round(x))
df['asian'] = df['asian'].apply(lambda x: round(x))
df['latino'] = df['latino'].apply(lambda x: round(x))

In [4]:
# Updating values for training_data
training_data = df[df['split'] == 'train']
training_data = training_data[(training_data['na_gender'] == 0) | (training_data['na_race'] == 0)]

# Getting test_data
test_data = df[df['split'] == 'test']
test_data = test_data[(test_data['na_gender'] == 0) | (test_data['na_race'] == 0)]

# Getting validation_data
validation_data = df[df['split'] == 'val']
validation_data = validation_data[(validation_data['na_gender'] == 0) | (validation_data['na_race'] == 0)]

In [5]:
X_train = training_data['comment_text'].values.tolist()
Y_train = training_data['toxicity'].values.tolist()

X_test = test_data['comment_text'].values.tolist()
Y_test = test_data['toxicity'].values.tolist()

X_val = validation_data['comment_text'].values.tolist()
Y_val = validation_data['toxicity'].values.tolist()

In [31]:
class AdalineGDF1():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            if (predictions[i] == labels[i]):
                acc += 1
        return acc/len(predictions)
    
    def CheckF1Score(self, predictions, labels):
        # As seen in https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
        
        
        return f1_score(labels, predictions)
        
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        output = np.dot(X_test, self.w)
        # Sigmoid giving 0-1 results
        
        preds = np.round(1/(1+np.exp(-output)))
        return preds
    
    def PredictTestSetDemographic(self, dfTest):
        dic = {}
        for col in dfTest.columns[5:]:
            tempSet = dfTest[dfTest[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        accuracies = []
        for col in dic:
            tempPredictions = self.predict(dic[col][0])
            currentAccuracy = self.CheckAccuracy(tempPredictions, dic[col][1])
            accuracies.append([currentAccuracy, col])
        
        accuracies = np.array(accuracies)
        return accuracies
        
    def fit(self, X_train, Y_train, X_val, Y_val):
        # Making sure that arrays are numpy arrays
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
        
        badEpoch = 0
        self.bestW = self.w
        bestF1 = 0
        early_stopping = 100
 
        
        # Using n epochs
        for i in range(self.n_iter):
            output = 1/(1+np.exp(-np.dot(X_train, self.w)))
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
            tmpPreds = self.predict(X_val)
            f1 = self.CheckF1Score(tmpPreds, Y_val)
            print('F1_score is: ', f1)
            #print('Accuracy : ', self.CheckAccuracy(tmpPreds, Y_val))
            
            # Saving the best model and also checks for Early_Stopping
            if f1 > bestF1:
                bestF1 = f1
                badEpoch = 0
                self.bestW = self.w
            else:
                badEpoch += 1
            
            if badEpoch >= early_stopping:
                self.w = self.bestW                
                print("Stopped cause of bad Epoch in iteration: ", i)
                break
                
        return self

In [32]:
model = AdalineGDF1(learning_rate = 0.0001, n_iter = 10000)

model.fit(X_train, Y_train, X_val, Y_val)

predictions = model.predict(X_test)

Y_test = np.array(Y_test).reshape(len(Y_test), 1)

print('model accuracy is: ', model.CheckAccuracy(predictions, Y_test))

print('f1_score is: ', model.CheckF1Score(predictions, Y_test))

F1_score is:  0.20896024207783473
F1_score is:  0.2092141834414217
F1_score is:  0.2087559161595673
F1_score is:  0.20897294546688153
F1_score is:  0.2082695252679939
F1_score is:  0.2073909703849108
F1_score is:  0.20673118095401213
F1_score is:  0.20665348577323134
F1_score is:  0.2060752502588885
F1_score is:  0.2063533281398771
F1_score is:  0.20632163945814522
F1_score is:  0.20582084349947716
F1_score is:  0.20482770683925136
F1_score is:  0.204525521838274
F1_score is:  0.20476734981088926
F1_score is:  0.20488837906997265
F1_score is:  0.2043743912157974
F1_score is:  0.20444444444444446
F1_score is:  0.2043548099232554
F1_score is:  0.2039573820395738
F1_score is:  0.20392368610511158
F1_score is:  0.20366525232463664
F1_score is:  0.2036600833484327
F1_score is:  0.20384020384020385
F1_score is:  0.20411146642302422
F1_score is:  0.2039582188015393
F1_score is:  0.20404783808647656
F1_score is:  0.20431893687707645
F1_score is:  0.20377917747313817
F1_score is:  0.20290124604

In [12]:
class OwnF1():
    def __init__(self, y_pred=None, y_true=None):
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0
        self.y_pred = y_pred
        self.y_true = y_true
    
    
    precision = 0
    recall = 0
    def confusion_matrix(self):
        for i in range(len(self.y_true)):
            if self.y_true[i]==1 and self.y_pred[i]==1:
                self.tp += 1
            if self.y_true[i]==0 and self.y_pred[i]==0:
                self.tn += 1
            if self.y_true[i]==0 and self.y_pred[i]==1:
                self.fp += 1
            if self.y_true[i]==1 and self.y_pred[i]==0:
                self.fn += 1
        return [self.tp, self.tn, self.fp, self.fn]
    
    def precision_recall(self):
        self.precision = self.tp/(self.tp+self.fp)
        self.recall = self.tp/(self.tp+self.fn)
        print('Precision : ',self.precision, '\nRecall : ',self.recall)
        
    def f1_score(self):
        f1 = 2*(self.precision*self.recall)/(self.precision+self.recall)
        print('F1 Score : ',f1)
        


In [13]:
model = OwnF1(predictions, Y_test)
res_list = model.confusion_matrix()
print(res_list)
print('True Positives :', res_list[0])
print('True Negatives :', res_list[1])
print('False Positives :', res_list[2])
print('False Negatives :', res_list[3])
model.precision_recall()
model.f1_score()

[3019, 12592, 18073, 1398]
True Positives : 3019
True Negatives : 12592
False Positives : 18073
False Negatives : 1398
Precision :  0.14313483785321449 
Recall :  0.6834955852388499
F1 Score :  0.23670077227645148


In [10]:
forBarPlot = model.PredictTestSetDemographic(test_data)
plt.figure()
x = forBarPlot[:,1]
y = np.array(forBarPlot[:,0].astype(float))
x_pos = [i for i, _ in enumerate(x)]
plt.bar(x_pos, y, color='rgbkymc')
plt.xlabel("\nDemographic source")
plt.ylabel("Accuracy")
plt.xticks(x_pos, x)
plt.show()

AttributeError: 'OwnF1' object has no attribute 'PredictTestSetDemographic'

In [None]:
forBarPlot

In [None]:
np.std(forBarPlot[:,0].astype(float))

In [None]:
print("Antal toxic i predictions: ", sum(predictions))
print("Antal toxic i test set: ", sum(Y_test))
print("Samples i test set: ", len(Y_test))