In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

df = pd.read_csv("all_data_with_identities.csv")

In [3]:
df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender", "other_gender"]]
print(df)

                                             comment_text  split  na_gender  \
0       OH yes - Were those evil Christian Missionarie...   test          1   
1       Why is this black racist crap still on the G&M...    val          1   
2                              even up here.......BLACKS!  train          1   
3       Blame men.  There's always an excuse to blame ...  train          0   
4       And the woman exposing herself saying grab thi...    val          0   
...                                                   ...    ...        ...   
447995  Another man shamming article. If white men did...  train          0   
447996  "no matter what is put in front of you regardi...  train          0   
447997  The Democrat party aided and abetted by it's M...   test          1   
447998  I just don't find her a very good representati...  train          0   
447999  You know the Trump fanatics are trolling the G...  train          1   

        toxicity      male    female  transgender  

In [4]:
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0][:70000]

test_data = df[df['split'] == 'test'][:15000]
validation_data = df[df['split'] == 'val'][:15000]

training_data['comment_text'] = training_data['comment_text'].apply(lambda text: CleanText(text))
training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))
validation_data['comment_text'] = validation_data['comment_text'].apply(lambda text: CleanText(text))

In [5]:
trainingComments = training_data['comment_text']
trainingLabels = training_data['toxicity']
trainingComments = trainingComments.values.tolist()

testComments = test_data['comment_text']
testComments = testComments.values.tolist()
testLabels = test_data['toxicity']

valComments = validation_data['comment_text']
valComments = valComments.values.tolist()
valLabels = validation_data['toxicity']

In [6]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,blame men theres always an excuse to blame men...,train,0,0.545455,1,1,0,0
13,are you a pilgrimwhy arnt you growing your own...,train,0,0.800000,1,0,0,0
21,no he was accused of being a racist white man,train,0,0.363636,1,0,0,0
31,how do we fight agaisnt women who use sexual f...,train,0,0.800000,1,1,0,0
46,add this small and annoying irrelevant story t...,train,0,0.594595,0,1,0,0
...,...,...,...,...,...,...,...,...
447978,brother williamso you are opposed to women ser...,train,0,0.400000,1,1,0,0
447987,i dont think i can picture christ ever saying ...,train,0,0.400000,0,1,0,0
447995,another man shamming article if white men did ...,train,0,0.400000,1,0,0,0
447996,no matter what is put in front of you regardin...,train,0,0.400000,1,0,1,1


In [7]:
for col in training_data.columns[4:]:
    print(col)

male
female
transgender
other_gender


In [19]:
class AdalineGDTrainWorst():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.bestW = None
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        print("Percentage of toxic in predictions: ", sum(predictions)/len(predictions))
        print("Percentage of toxic in labels: ", sum(labels)/len(labels))
        acc = 0.0
        for i in range(len(predictions)):
            if (np.round(predictions[i]) == np.round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def CheckLoss(self, predictions, labels):
        predictions = np.array(predictions).reshape(len(predictions), 1)
        labels = np.array(labels).reshape(len(labels), 1)
        loss = np.sum(np.absolute(np.subtract(predictions, labels)))
        return loss/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        
        preds = np.dot(X_test, self.w)
        return preds
    
    # For adding noise 
    def addNoise(self, X_train):
        #Standard deviation set to 0.1
        #Inspiration from https://www.researchgate.net/post/How-to-add-some-noise-data-to-my-classification-datasets
        variance, scale = 0.1, 1
        noise = np.random.normal(0, variance, len(X_train[0]))
        #noise = noise.reshape(len(X_train[0]), 1)
        X_train += (noise/scale)
        return X_train
        
    def fit(self, dfTrain, dfVal):
        # Making sure that arrays are numpy arrays
        
        X_train = dfTrain['comment_text'].values.tolist()
        Y_train = dfTrain['toxicity'].values.tolist()
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = dfVal['comment_text'].values.tolist()
        Y_val = dfVal['toxicity'].values.tolist()
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        #Add noise to traning data
        print('X_train before noise: ', X_train)
        print(len(X_train[0]))
        X_train = self.addNoise(X_train)
        
        print('X_train after noise: ', X_train)
        print(len(X_train[0]))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
            #self.w = np.zeros(len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in dfTrain.columns[4:]:
            tempSet = dfTrain[dfTrain[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        
        tmpPredictions = self.predict(X_val)
        for i in range(len(tmpPredictions)):
            if tmpPredictions[i] < 0:
                tmpPredictions[i] = 0
            if tmpPredictions[i] > 1:
                tmpPredictions[i] = 1
        
        bestLoss = self.CheckLoss(tmpPredictions, Y_val)
        badEpoch = 0
        self.bestW = self.w
        early_stopping = 10
        
        # Using n epochs
        for i in range(self.n_iter):
            losses = []
            for col in dfTrain.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentLoss = self.CheckLoss(tempPredictions, dic[col][1])
                losses.append([currentLoss, col])
            losses = np.array(losses)            
            worstCol = losses[np.argmax(losses[:,0])][1]
            
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            #print(output)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * (self.learning_rate) * np.dot(X_train.T, error)
            
            tmpPredictions = self.predict(X_val)
            for i in range(len(tmpPredictions)):
                if tmpPredictions[i] < 0:
                    tmpPredictions[i] = 0
                if tmpPredictions[i] > 1:
                    tmpPredictions[i] = 1
            l = self.CheckLoss(tmpPredictions, Y_val)
            print(l)
            # Saving the best model and also checks for Early_Stopping
            
            if l < bestLoss:
                bestLoss = l
                badEpoch = 0
                self.bestW = self.w
            else:
                badEpoch += 1

            if badEpoch >= early_stopping:
                self.w = self.bestW
                print("Stopped cause of bad Epoch: ", badEpoch)
                break
            
        return self

In [12]:
# Preparing reviews in list of lists format
X_trainForw2v = []
X_testForw2v = []
X_valForw2v = []

for sentence in trainingComments:
    X_trainForw2v.append(sentence.split(' '))

for sentence in testComments:
    X_testForw2v.append(sentence.split(' '))
    
for sentence in valComments:
    X_valForw2v.append(sentence.split(' '))

X_trainForw2v = np.array(X_trainForw2v)
X_testForw2v = np.array(X_testForw2v)
X_valForw2v = np.array(X_valForw2v)

  X_trainForw2v = np.array(X_trainForw2v)
  X_testForw2v = np.array(X_testForw2v)
  X_valForw2v = np.array(X_valForw2v)


In [13]:
# Do not want words with overall less than 3 appearances to count, generating vector of size 200 for each word
modelw2v = Word2Vec(X_trainForw2v, min_count=3, size=50, iter=60)

In [14]:
def TransformToEmbedding(model, data):
    transformed = np.zeros((len(data), model.vector_size))
    for i, sentence in enumerate(data):
        currentLst = []
        for word in sentence:
            if word in model.wv.vocab.keys():
                currentLst.append(model.wv[word])
        if (len(currentLst) > 0):
            currentLst = np.mean(np.array(currentLst), axis=0)
        else:
            currentLst = np.zeros(model.vector_size)
        transformed[i] = np.array(currentLst)
    return np.array(transformed)


X_trainForw2vTransformed = TransformToEmbedding(modelw2v, X_trainForw2v)
X_testForw2vTransformed = TransformToEmbedding(modelw2v, X_testForw2v)
X_valForw2vTransformed = TransformToEmbedding(modelw2v, X_valForw2v)

for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = X_testForw2vTransformed[i]

for i, ind in enumerate(validation_data.index):
    validation_data['comment_text'][ind] = X_valForw2vTransformed[i]
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = X_testForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['comment_text'][ind] = X_valForw2vTransformed[i]


In [20]:
modelEmbed = AdalineGDTrainWorst(learning_rate = 0.1, n_iter = 200)

modelEmbed.fit(training_data, validation_data)

predictions = modelEmbed.predict(X_testForw2vTransformed)

Y_test = np.array(testLabels.values.tolist())

Y_test = Y_test.reshape(len(Y_test), 1)


for i in range(len(predictions)):
    if predictions[i] < 0:
        predictions[i] = 0
    if predictions[i] > 1:
        predictions[i] = 1
        

print("Loss on testSet: ", modelEmbed.CheckLoss(predictions, Y_test))
print("Accuracy on testSet after round", modelEmbed.CheckAccuracy(predictions, Y_test))

X_train before noise:  [[-0.1232876  -0.36606708 -0.45975876 ... -0.66991556 -0.62694448
  -0.10444096]
 [-0.17627874 -0.19016303  0.53924394 ... -1.07931888 -0.3683891
  -0.10026253]
 [ 1.45677042  0.53040957 -0.86334336 ...  0.8318761   0.00389695
  -0.11123829]
 ...
 [-0.76163304 -0.04457362 -0.02443874 ... -0.79273885  0.13171329
  -1.25653255]
 [-0.15953861 -0.34217823 -0.55229187 ... -0.07737934 -0.11317515
   0.39693567]
 [-0.36085093  0.2753478   0.62797242 ... -0.68293589 -0.04864882
   0.18029332]]
50
X_train after noise:  [[ 0.03316698 -0.29933975 -0.54695781 ... -0.65058272 -0.71234838
  -0.03216657]
 [-0.01982416 -0.1234357   0.45204488 ... -1.05998604 -0.453793
  -0.02798813]
 [ 1.613225    0.59713691 -0.95054241 ...  0.85120894 -0.08150695
  -0.03896389]
 ...
 [-0.60517845  0.02215372 -0.11163779 ... -0.77340602  0.04630939
  -1.18425815]
 [-0.00308403 -0.27545089 -0.63949092 ... -0.0580465  -0.19857905
   0.46921007]
 [-0.20439635  0.34207513  0.54077337 ... -0.66360306