In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

df = pd.read_csv("all_data_with_identities.csv")

In [2]:
df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender", "other_gender"]]

In [3]:
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0][:30000]

test_data = df[df['split'] == 'test'][:3000]
validation_data = df[df['split'] == 'val']

training_data['comment_text'] = training_data['comment_text'].apply(lambda text: CleanText(text))
training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))

In [4]:
trainingComments = training_data['comment_text']
trainingLabels = training_data['toxicity']
trainingComments = trainingComments.values.tolist()

testComments = test_data['comment_text']
testComments = testComments.values.tolist()
testLabels = test_data['toxicity']

valComments = validation_data['comment_text']
valComments = valComments.values.tolist()
valLabels = validation_data['toxicity']

In [73]:
class AdalineGDTrainWorst():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            if (predictions[i] == round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds
        
    def fit(self, df):
        # Making sure that arrays are numpy arrays
        
        X_train = df['comment_text'].values.tolist()
        Y_train = df['toxicity'].values.tolist()
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            #self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
            self.w = np.zeros(len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in df.columns[4:]:
            tempSet = df[df[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        # Using n epochs
        for i in range(self.n_iter):
            accuracies = []
            for col in df.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentAcc = self.CheckAccuracy(tempPredictions, dic[col][1])
                accuracies.append([currentAcc, col])
            accuracies = np.array(accuracies)
            
            if i == 0:
                print(accuracies)
            
            worstCol = accuracies[np.argmin(accuracies[:,0])][1]
            #print(accuracies[np.argmin(accuracies[:,0])])
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        print(accuracies)
        return self

In [67]:
# Preparing reviews in list of lists format
X_trainForw2v = []
X_testForw2v = []

for sentence in trainingComments:
    X_trainForw2v.append(sentence.split(' '))

for sentence in testComments:
    X_testForw2v.append(sentence.split(' '))

X_trainForw2v = np.array(X_trainForw2v)
X_testForw2v = np.array(X_testForw2v)

  X_trainForw2v = np.array(X_trainForw2v)
  X_testForw2v = np.array(X_testForw2v)


In [68]:
# Do not want words with overall less than 3 appearances to count, generating vector of size 200 for each word
modelw2v = Word2Vec(X_trainForw2v, min_count=2, size=100, iter=60)

In [69]:
len(modelw2v.wv.vocab)

31612

In [70]:
def TransformToEmbedding(model, data):
    transformed = np.zeros((len(data), model.vector_size))
    for i, sentence in enumerate(data):
        currentLst = []
        for word in sentence:
            if word in model.wv.vocab.keys():
                currentLst.append(model.wv[word])
        if (len(currentLst) > 0):
            currentLst = np.mean(np.array(currentLst), axis=0)
        else:
            currentLst = np.zeros(model.vector_size)
        transformed[i] = np.array(currentLst)
    return np.array(transformed)


X_trainForw2vTransformed = TransformToEmbedding(modelw2v, X_trainForw2v)
X_testForw2vTransformed = TransformToEmbedding(modelw2v, X_testForw2v)

for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = X_testForw2vTransformed[i]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = X_testForw2vTransformed[i]


In [85]:
modelEmbed = AdalineGDTrainWorst(learning_rate = 0.5, n_iter = 30)

modelEmbed.fit(training_data)

predictions = modelEmbed.predict(X_testForw2vTransformed)

Y_test = np.array(testLabels.values.tolist())

print(sum(Y_test))
print(np.sum(predictions))
#print(np.sum(predictions))

modelEmbed.CheckAccuracy(predictions, Y_test)

[['0.14060917133788073' 'male']
 ['0.12576993945775203' 'female']
 ['0.18686868686868688' 'transgender']
 ['0.18686868686868688' 'other_gender']]
[['0.8587154724116972' 'male']
 ['0.8741247696762305' 'female']
 ['0.8118686868686869' 'transgender']
 ['0.8118686868686869' 'other_gender']]
1883.9540683570904
2969.0


0.7303333333333333

In [11]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,"[1.5030977725982666, 0.03387044370174408, -0.1...",train,0,0.545455,1,1,0,0
13,"[1.2737869024276733, -0.11399838328361511, -0....",train,0,0.800000,1,0,0,0
21,"[0.20370736718177795, -1.3311550617218018, 0.0...",train,0,0.363636,1,0,0,0
31,"[1.0545250177383423, -0.7889888286590576, -0.8...",train,0,0.800000,1,1,0,0
46,"[0.4880611300468445, 0.0715145617723465, -0.40...",train,0,0.594595,0,1,0,0
...,...,...,...,...,...,...,...,...
266691,"[0.4320566654205322, 0.22556768357753754, -0.3...",train,0,0.200000,0,1,0,0
266704,"[0.8448939323425293, -0.14512313902378082, -0....",train,0,0.200000,1,1,0,0
266720,"[0.7907209992408752, -0.10068434476852417, -0....",train,0,0.200000,1,1,0,0
266725,"[0.6033808588981628, 0.13126841187477112, -0.3...",train,0,0.200000,0,1,0,0


In [12]:
class AdalineGDNormal():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.wArr = np.ones((n_iter, 2))
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
    
    def fit(self, X_train, Y_train):
        # Making sure that arrays are numpy arrays
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        Y_train = np.array(list(map(float, Y_train)))
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.uniform(-3, 3, len(X_train[0]))
        
        # Using n epochs
        for i in range(self.n_iter):
            self.wArr[i][0] = self.w[0]
            self.wArr[i][1] = self.w[1]
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)            
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)            
        return self
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds

In [40]:
#from sklearn.linear_model import LinearRegression

Y_train = np.array(trainingLabels.values.tolist()).reshape(len(trainingLabels), 1)

model = AdalineGDNormal(learning_rate = 0.01, n_iter = 100)

model.fit(X_trainForw2vTransformed, Y_train)

Y_test = np.array(testLabels.values.tolist())

predictions = model.predict(X_testForw2vTransformed)

print(sum(np.round(Y_test)))
print(sum(predictions))

def CheckAccuracy(predictions, labels):
    acc = 0.0
    for i in range(len(predictions)):
        if (predictions[i] == round(labels[i])):
            acc += 1
    return acc/len(predictions)

CheckAccuracy(predictions, Y_test)


2210.0
1760.0


0.5586666666666666