In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

df = pd.read_csv("all_data_with_identities.csv")



In [2]:
df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender", "other_gender"]]
df['toxicity'] = df['toxicity'].apply(lambda score: np.round(score))

In [3]:
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0]

test_data = df[df['split'] == 'test']
validation_data = df[df['split'] == 'val']

training_data['comment_text'] = training_data['comment_text'].apply(lambda text: CleanText(text))
training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))
validation_data['comment_text'] = validation_data['comment_text'].apply(lambda text: CleanText(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['comment_text'] = validation_data['comment_text'].apply(lambda text: CleanText(text))


In [14]:
trainingComments = training_data['comment_text'].values.tolist()
trainingLabels = training_data['toxicity']

testComments = test_data['comment_text'].values.tolist()
testLabels = test_data['toxicity']

valComments = validation_data['comment_text'].values.tolist()
valLabels = validation_data['toxicity']


In [15]:
from bert_serving.client import BertClient
bc = BertClient()

træningNice = bc.encode(trainingComments)
valNice = bc.encode(valComments)
testnice = bc.encode(testComments)

here is what you can do:
- or, start a new server with a larger "max_seq_len"


In [31]:
for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = træningNice[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = testnice[i]

for i, ind in enumerate(validation_data.index):
    validation_data['comment_text'][ind] = valNice[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = træningNice[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = testnice[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['comment_text'][ind] = valNice[i]


In [33]:
validation_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
1,"[0.3159964, 0.07656208, 0.15882693, -0.2909979...",val,1,1.0,0.0,0.0,0.0,0.0
4,"[0.06875101, 0.08204796, 0.23067114, -0.252011...",val,0,1.0,0.0,1.0,0.0,0.0
25,"[-0.06716595, 0.8790409, 0.38451603, -0.210127...",val,1,1.0,0.0,0.0,0.0,0.0
34,"[0.17263372, 0.12910248, -0.40820202, 0.107423...",val,1,1.0,0.0,0.0,0.0,0.0
35,"[0.17178057, 0.16108675, 0.6066077, 0.05778486...",val,1,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
20228,"[0.25878617, 0.04634164, -0.27127722, -0.51333...",val,1,0.0,0.0,0.0,0.0,0.0
20230,"[-0.22109184, 0.15846917, -0.12528083, 0.09382...",val,1,1.0,0.1,0.1,0.0,0.0
20231,"[0.43604422, 0.77578586, 0.21017827, -0.687516...",val,1,1.0,0.1,0.0,0.0,0.0
20235,"[0.11229722, 0.2389478, 0.1843236, -0.09459474...",val,1,1.0,0.2,0.1,0.0,0.0


In [40]:
class AdalineGDTrainWorst():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.bestW = None
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            if (np.round(predictions[i]) == np.round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def CheckLoss(self, predictions, labels):
        predictions = np.array(predictions).reshape(len(predictions), 1)
        labels = np.array(labels).reshape(len(labels), 1)
        loss = np.sum(np.absolute(np.subtract(predictions, labels)))
        return loss/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        output = np.dot(X_test, self.w)
        # Sigmoid giving 0-1 results
        preds = np.round(1/(1+np.exp(-output)))
        return preds
        
    def fit(self, dfTrain, dfVal):
        # Making sure that arrays are numpy arrays
        
        X_train = dfTrain['comment_text'].values.tolist()
        Y_train = dfTrain['toxicity'].values.tolist()
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = dfVal['comment_text'].values.tolist()
        Y_val = dfVal['toxicity'].values.tolist()
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in dfTrain.columns[4:]:
            tempSet = dfTrain[dfTrain[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        valPreds = self.predict(X_val)
        bestAcc = self.CheckAccuracy(valPreds, Y_val)
        badEpoch = 0
        self.bestW = self.w
        early_stopping = 5
        
        # Using n epochs
        for i in range(self.n_iter):
            accuracies = []
            for col in dfTrain.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentAccuracy = self.CheckAccuracy(tempPredictions, dic[col][1])
                accuracies.append([currentAccuracy, col])
            accuracies = np.array(accuracies)
            if i == 0:                
                print(accuracies)
            
            worstCol = accuracies[np.argmin(accuracies[:,0])][1]
            
            #print(worstCol)
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * (self.learning_rate) * np.dot(X_train.T, error)
            
            valPreds = self.predict(X_val)
            acc = self.CheckAccuracy(valPreds, Y_val)
            print(acc, i)
            
            # Saving the best model and also checks for Early_Stopping
            if acc > bestAcc:
                bestAcc = acc
                badEpoch = 0
                self.bestW = self.w
            else:
                badEpoch += 1

            if badEpoch >= early_stopping:
                self.w = self.bestW
                accuracies = []
                for col in dfTrain.columns[4:]:
                    tempPredictions = self.predict(dic[col][0])
                    currentAccuracy = self.CheckAccuracy(tempPredictions, dic[col][1])
                    accuracies.append([currentAccuracy, col])
                accuracies = np.array(accuracies)
                print(accuracies)
                
                print("Stopped cause of bad Epoch in iteration: ", i)
                break
        return self

In [96]:
# Do not want words with overall less than 3 appearances to count, generating vector of size 200 for each word
modelw2v = Word2Vec(X_trainForw2v, min_count=3, size=50, iter=60)

In [98]:
def TransformToEmbedding(model, data):
    transformed = np.zeros((len(data), model.vector_size))
    for i, sentence in enumerate(data):
        currentLst = []
        for word in sentence:
            if word in model.wv.vocab.keys():
                currentLst.append(model.wv[word])
        if (len(currentLst) > 0):
            currentLst = np.mean(np.array(currentLst), axis=0)
        else:
            currentLst = np.zeros(model.vector_size)
        transformed[i] = np.array(currentLst)
    return np.array(transformed)


X_trainForw2vTransformed = TransformToEmbedding(modelw2v, X_trainForw2v)
X_testForw2vTransformed = TransformToEmbedding(modelw2v, X_testForw2v)
X_valForw2vTransformed = TransformToEmbedding(modelw2v, X_valForw2v)

for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = X_testForw2vTransformed[i]

for i, ind in enumerate(validation_data.index):
    validation_data['comment_text'][ind] = X_valForw2vTransformed[i]
    
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = X_testForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['comment_text'][ind] = X_valForw2vTransformed[i]


In [51]:


modelEmbed = AdalineGDTrainWorst(learning_rate = 0.01, n_iter = 300)

modelEmbed.fit(training_data, validation_data)

predictions = modelEmbed.predict(testnice)

Y_test = np.array(testLabels.values.tolist())
Y_test = Y_test.reshape(len(Y_test), 1)

print(sum(predictions)/len(predictions))
print(sum(Y_test)/len(Y_test))
print("Accuracy on testSet after round", modelEmbed.CheckAccuracy(predictions, Y_test))


[['0.180671845051952' 'male']
 ['0.16007166225268735' 'female']
 ['0.2421875' 'transgender']
 ['0.2421875' 'other_gender']]
0.265 0
0.735 1


  preds = np.round(1/(1+np.exp(-output)))


0.265 2
0.735 3
0.265 4
[['0.819328154948048' 'male']
 ['0.8399283377473127' 'female']
 ['0.7578125' 'transgender']
 ['0.7578125' 'other_gender']]
Stopped cause of bad Epoch in iteration:  4
[0.]
[0.739]
Accuracy on testSet after round 0.261


In [22]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,"[-0.0089684455, 0.6426031, -0.3159568, -0.1167...",train,0,1.0,1,1,0,0
13,"[-0.17599669, 0.30832914, 0.080708444, -0.3494...",train,0,1.0,1,0,0,0
21,"[-0.1606114, 0.13993065, -0.4601244, -0.287369...",train,0,0.0,1,0,0,0
31,"[0.25308475, 0.079606794, -0.03677589, -0.0845...",train,0,1.0,1,1,0,0
46,"[-0.013226508, 0.24738197, 0.011026988, 0.2162...",train,0,1.0,0,1,0,0
...,...,...,...,...,...,...,...,...
188927,"[0.031083155, 0.119434625, -0.11237992, 0.2976...",train,0,0.0,1,0,0,0
188930,"[0.0034725324, -0.11189864, -0.42127734, 0.012...",train,0,0.0,0,1,0,0
188972,"[-0.21764557, 0.3955124, -0.17079373, -0.22168...",train,0,0.0,0,1,0,0
188980,"[0.21808432, 0.5820606, -0.12626122, -0.082670...",train,0,0.0,0,1,0,0


In [12]:
class AdalineGDNormal():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.wArr = np.ones((n_iter, 2))
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
    
    def fit(self, X_train, Y_train):
        # Making sure that arrays are numpy arrays
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        Y_train = np.array(list(map(float, Y_train)))
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.uniform(-3, 3, len(X_train[0]))
        
        # Using n epochs
        for i in range(self.n_iter):
            self.wArr[i][0] = self.w[0]
            self.wArr[i][1] = self.w[1]
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)            
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)            
        return self
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds

In [40]:
#from sklearn.linear_model import LinearRegression

træningNice
valNice
testnice



Y_train = np.array(trainingLabels.values.tolist()).reshape(len(trainingLabels), 1)

model = AdalineGDNormal(learning_rate = 0.01, n_iter = 100)
model.fit(X_trainForw2vTransformed, Y_train)

Y_test = np.array(testLabels.values.tolist())
predictions = model.predict(X_testForw2vTransformed)


def CheckAccuracy(predictions, labels):
    acc = 0.0
    for i in range(len(predictions)):
        if (predictions[i] == round(labels[i])):
            acc += 1
    return acc/len(predictions)

CheckAccuracy(predictions, Y_test)


2210.0
1760.0


0.5586666666666666