In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression


df = pd.read_csv("all_data_with_identitiesEmbedded.csv")

In [8]:
def CleanText(text):
    text = re.sub(r'''[\[|\]]''', "", text).split()
    text = np.array(text, dtype="float64")
    return text

In [9]:
df = pd.read_csv("all_data_with_identitiesEmbedded.csv")

df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender"]]
df['toxicity'] = df['toxicity'].apply(lambda score: np.round(score))
df['comment_text'] = df['comment_text'].apply(lambda text: CleanText(text))

In [10]:
# Updating values for training_data
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0]

training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

# Getting test_data
test_data = df[df['split'] == 'test']

# Getting validation_data
validation_data = df[df['split'] == 'val']

In [11]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,"[-0.0089682322, 0.642603219, -0.315957218, -0....",train,0,1.0,1,1,0,0
13,"[-0.175996631, 0.308328658, 0.0807084367, -0.3...",train,0,1.0,1,0,0,0
21,"[-0.160611436, 0.139930561, -0.460124165, -0.2...",train,0,0.0,1,0,0,0
31,"[0.253084868, 0.0796071365, -0.0367758051, -0....",train,0,1.0,1,1,0,0
46,"[-0.0132263461, 0.247382224, 0.0110267261, 0.2...",train,0,1.0,0,1,0,0
...,...,...,...,...,...,...,...,...
447978,"[0.459464222, 0.271994978, -0.265816182, -0.28...",train,0,0.0,1,1,0,0
447987,"[0.338637114, 0.633257389, -0.243543133, -0.59...",train,0,0.0,0,1,0,0
447995,"[0.436112642, 0.277279615, -0.205808342, -0.08...",train,0,0.0,1,0,0,0
447996,"[0.477457464, 0.291201234, 0.0400100239, -0.74...",train,0,0.0,1,0,1,1


In [16]:
trainingComments = training_data['comment_text'].values.tolist()
trainingLabels = training_data['toxicity']

testComments = test_data['comment_text'].values.tolist()
testLabels = test_data['toxicity'].values.tolist()

valComments = validation_data['comment_text'].values.tolist()
valLabels = validation_data['toxicity']


In [17]:
class AdalineGDTrainWorst():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.bestW = None
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            if (np.round(predictions[i]) == np.round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def CheckLoss(self, predictions, labels):
        predictions = np.array(predictions).reshape(len(predictions), 1)
        labels = np.array(labels).reshape(len(labels), 1)
        loss = np.sum(np.absolute(np.subtract(predictions, labels)))
        return loss/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        output = np.dot(X_test, self.w)
        # Sigmoid giving 0-1 results
        preds = np.round(1/(1+np.exp(-output)))
        return preds
        
    def fit(self, dfTrain, dfVal):
        # Making sure that arrays are numpy arrays
        
        X_train = dfTrain['comment_text'].values.tolist()
        Y_train = dfTrain['toxicity'].values.tolist()
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = dfVal['comment_text'].values.tolist()
        Y_val = dfVal['toxicity'].values.tolist()
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in dfTrain.columns[4:]:
            tempSet = dfTrain[dfTrain[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        valPreds = self.predict(X_val)
        bestAcc = self.CheckAccuracy(valPreds, Y_val)
        badEpoch = 0
        self.bestW = self.w
        early_stopping = 5
        
        # Using n epochs
        for i in range(self.n_iter):
            accuracies = []
            for col in dfTrain.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentAccuracy = self.CheckAccuracy(tempPredictions, dic[col][1])
                accuracies.append([currentAccuracy, col])
            accuracies = np.array(accuracies)
            if i == 0:                
                print(accuracies)
            
            worstCol = accuracies[np.argmin(accuracies[:,0])][1]
            
            #print(worstCol)
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
            
            valPreds = self.predict(X_val)
            acc = self.CheckAccuracy(valPreds, Y_val)
            print(acc, i)
            
            # Saving the best model and also checks for Early_Stopping
            if acc > bestAcc:
                bestAcc = acc
                badEpoch = 0
                self.bestW = self.w
            else:
                badEpoch += 1

            if badEpoch >= early_stopping:
                self.w = self.bestW
                accuracies = []
                for col in dfTrain.columns[4:]:
                    tempPredictions = self.predict(dic[col][0])
                    currentAccuracy = self.CheckAccuracy(tempPredictions, dic[col][1])
                    accuracies.append([currentAccuracy, col])
                accuracies = np.array(accuracies)
                print(accuracies)
                
                print("Stopped cause of bad Epoch in iteration: ", i)
                break
        return self

In [21]:


model = AdalineGDTrainWorst(learning_rate = 0.1, n_iter = 100)

model.fit(training_data, validation_data)

predictions = model.predict(testComments)

Y_test = np.array(testLabels)
Y_test = Y_test.reshape(len(Y_test), 1)


print("Accuracy on testSet after round", model.CheckAccuracy(predictions, Y_test))


[['0.8963709377556333' 'male']
 ['0.9067482874216587' 'female']
 ['0.8702594810379242' 'transgender']
 ['0.8702594810379242' 'other_gender']]
0.07864099158919877 0


  preds = np.round(1/(1+np.exp(-output)))


0.9213590084108012 1
0.07864099158919877 2
0.9213590084108012 3
0.07864099158919877 4
[['0.10362906224436677' 'male']
 ['0.09325171257834135' 'female']
 ['0.12974051896207583' 'transgender']
 ['0.12974051896207583' 'other_gender']]
Stopped cause of bad Epoch in iteration:  4
[1.]
[0.08102734]
Accuracy on testSet after round 0.08102734299083583


In [22]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,"[-0.0089684455, 0.6426031, -0.3159568, -0.1167...",train,0,1.0,1,1,0,0
13,"[-0.17599669, 0.30832914, 0.080708444, -0.3494...",train,0,1.0,1,0,0,0
21,"[-0.1606114, 0.13993065, -0.4601244, -0.287369...",train,0,0.0,1,0,0,0
31,"[0.25308475, 0.079606794, -0.03677589, -0.0845...",train,0,1.0,1,1,0,0
46,"[-0.013226508, 0.24738197, 0.011026988, 0.2162...",train,0,1.0,0,1,0,0
...,...,...,...,...,...,...,...,...
188927,"[0.031083155, 0.119434625, -0.11237992, 0.2976...",train,0,0.0,1,0,0,0
188930,"[0.0034725324, -0.11189864, -0.42127734, 0.012...",train,0,0.0,0,1,0,0
188972,"[-0.21764557, 0.3955124, -0.17079373, -0.22168...",train,0,0.0,0,1,0,0
188980,"[0.21808432, 0.5820606, -0.12626122, -0.082670...",train,0,0.0,0,1,0,0


In [12]:
class AdalineGDNormal():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.wArr = np.ones((n_iter, 2))
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
    
    def fit(self, X_train, Y_train):
        # Making sure that arrays are numpy arrays
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        Y_train = np.array(list(map(float, Y_train)))
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.uniform(-3, 3, len(X_train[0]))
        
        # Using n epochs
        for i in range(self.n_iter):
            self.wArr[i][0] = self.w[0]
            self.wArr[i][1] = self.w[1]
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)            
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)            
        return self
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds

In [40]:
#from sklearn.linear_model import LinearRegression

træningNice
valNice
testnice



Y_train = np.array(trainingLabels.values.tolist()).reshape(len(trainingLabels), 1)

model = AdalineGDNormal(learning_rate = 0.01, n_iter = 100)
model.fit(X_trainForw2vTransformed, Y_train)

Y_test = np.array(testLabels.values.tolist())
predictions = model.predict(X_testForw2vTransformed)


def CheckAccuracy(predictions, labels):
    acc = 0.0
    for i in range(len(predictions)):
        if (predictions[i] == round(labels[i])):
            acc += 1
    return acc/len(predictions)

CheckAccuracy(predictions, Y_test)


2210.0
1760.0


0.5586666666666666