In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

df = pd.read_csv("all_data_with_identities.csv")

In [2]:
df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender", "other_gender"]]


NameError: name 'df' is not defined

In [19]:
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0][:70000]

test_data = df[df['split'] == 'test'][:15000]
validation_data = df[df['split'] == 'val'][:15000]

training_data['comment_text'] = training_data['comment_text'].apply(lambda text: CleanText(text))
training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))
validation_data['comment_text'] = validation_data['comment_text'].apply(lambda text: CleanText(text))

In [21]:
trainingComments = training_data['comment_text']
trainingLabels = training_data['toxicity']
trainingComments = trainingComments.values.tolist()

testComments = test_data['comment_text']
testComments = testComments.values.tolist()
testLabels = test_data['toxicity']

valComments = validation_data['comment_text']
valComments = valComments.values.tolist()
valLabels = validation_data['toxicity']

In [77]:
class AdalineGDTrainWorst():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.bestW = None
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        print("Percentage of toxic in predictions: ", sum(predictions)/len(predictions))
        print("Percentage of toxic in labels: ", sum(labels)/len(labels))
        acc = 0.0
        for i in range(len(predictions)):
            if (np.round(predictions[i]) == np.round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def CheckLoss(self, predictions, labels):
        predictions = np.array(predictions).reshape(len(predictions), 1)
        labels = np.array(labels).reshape(len(labels), 1)
        loss = np.sum(np.absolute(np.subtract(predictions, labels)))
        return loss/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        
        preds = np.dot(X_test, self.w)
        return preds
        
    def fit(self, dfTrain, dfVal):
        # Making sure that arrays are numpy arrays
        
        X_train = dfTrain['comment_text'].values.tolist()
        Y_train = dfTrain['toxicity'].values.tolist()
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = dfVal['comment_text'].values.tolist()
        Y_val = dfVal['toxicity'].values.tolist()
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
            #self.w = np.zeros(len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in dfTrain.columns[4:]:
            tempSet = dfTrain[dfTrain[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        
        tmpPredictions = self.predict(X_val)
        for i in range(len(tmpPredictions)):
            if tmpPredictions[i] < 0:
                tmpPredictions[i] = 0
            if tmpPredictions[i] > 1:
                tmpPredictions[i] = 1
        
        bestLoss = self.CheckLoss(tmpPredictions, Y_val)
        badEpoch = 0
        self.bestW = self.w
        early_stopping = 10
        
        # Using n epochs
        for i in range(self.n_iter):
            losses = []
            for col in dfTrain.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentLoss = self.CheckLoss(tempPredictions, dic[col][1])
                losses.append([currentLoss, col])
            losses = np.array(losses)            
            worstCol = losses[np.argmax(losses[:,0])][1]
            
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            #print(output)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * (self.learning_rate) * np.dot(X_train.T, error)
            
            tmpPredictions = self.predict(X_val)
            for i in range(len(tmpPredictions)):
                if tmpPredictions[i] < 0:
                    tmpPredictions[i] = 0
                if tmpPredictions[i] > 1:
                    tmpPredictions[i] = 1
            l = self.CheckLoss(tmpPredictions, Y_val)
            print(l)
            # Saving the best model and also checks for Early_Stopping
            
            if l < bestLoss:
                bestLoss = l
                badEpoch = 0
                self.bestW = self.w
            else:
                badEpoch += 1

            if badEpoch >= early_stopping:
                self.w = self.bestW
                print("Stopped cause of bad Epoch: ", badEpoch)
                break
            
        return self

In [33]:
# Preparing reviews in list of lists format
X_trainForw2v = []
X_testForw2v = []
X_valForw2v = []

for sentence in trainingComments:
    X_trainForw2v.append(sentence.split(' '))

for sentence in testComments:
    X_testForw2v.append(sentence.split(' '))
    
for sentence in valComments:
    X_valForw2v.append(sentence.split(' '))

X_trainForw2v = np.array(X_trainForw2v)
X_testForw2v = np.array(X_testForw2v)
X_valForw2v = np.array(X_valForw2v)

  X_trainForw2v = np.array(X_trainForw2v)
  X_testForw2v = np.array(X_testForw2v)
  X_valForw2v = np.array(X_valForw2v)


In [24]:
# Do not want words with overall less than 3 appearances to count, generating vector of size 200 for each word
modelw2v = Word2Vec(X_trainForw2v, min_count=3, size=50, iter=60)

In [25]:
len(modelw2v.wv.vocab)

32197

In [35]:
def TransformToEmbedding(model, data):
    transformed = np.zeros((len(data), model.vector_size))
    for i, sentence in enumerate(data):
        currentLst = []
        for word in sentence:
            if word in model.wv.vocab.keys():
                currentLst.append(model.wv[word])
        if (len(currentLst) > 0):
            currentLst = np.mean(np.array(currentLst), axis=0)
        else:
            currentLst = np.zeros(model.vector_size)
        transformed[i] = np.array(currentLst)
    return np.array(transformed)


X_trainForw2vTransformed = TransformToEmbedding(modelw2v, X_trainForw2v)
X_testForw2vTransformed = TransformToEmbedding(modelw2v, X_testForw2v)
X_valForw2vTransformed = TransformToEmbedding(modelw2v, X_valForw2v)

for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = X_testForw2vTransformed[i]

for i, ind in enumerate(validation_data.index):
    validation_data['comment_text'][ind] = X_valForw2vTransformed[i]
    
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = X_trainForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = X_testForw2vTransformed[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data['comment_text'][ind] = X_valForw2vTransformed[i]


In [81]:
modelEmbed = AdalineGDTrainWorst(learning_rate = 0.1, n_iter = 200)

modelEmbed.fit(training_data, validation_data)

predictions = modelEmbed.predict(X_testForw2vTransformed)

Y_test = np.array(testLabels.values.tolist()).reshape(len(Y_test), 1)


for i in range(len(predictions)):
    if predictions[i] < 0:
        predictions[i] = 0
    if predictions[i] > 1:
        predictions[i] = 1
        

print("Loss on testSet: ", modelEmbed.CheckLoss(predictions, Y_test))
print("Accuracy on testSet after round", modelEmbed.CheckAccuracy(predictions, Y_test))


0.2584266759471116
0.254321779989996
0.2668005967447339
0.25335802814688363
0.2615103696196011
0.26146477247520555
0.26066413933701565
0.25979737744921305
0.24525092577920118
0.2539012782793029
0.2541425007748766
0.2535847501388869
0.25296100170848973
0.2523085244815844
0.2365813538699851
0.24670697158232477
0.24722710645356424
0.24681917980009757
0.24632171853196103
0.23040585423193305
0.24108688633791467
0.24176162551957203
0.2414798861008681
0.2260824770823557
0.23667810473184098
0.2374674311682885
0.23730605471971636
0.23703979095790967
0.22123215460353005
0.23238132035079037
0.23330365652820423
0.23322594215116957
0.2178449138656266
0.22892454373988821
0.22991892003258496
0.2299329773271507
0.22983851925113394
0.21409135956980166
0.22563498277514227
0.22673305850915224
0.22680671303788913
0.21155202481889512
0.22289853905154364
0.22403909212452663
0.22418871015181607
0.2242260727222051
0.20876116075378903
0.22040671517179747
0.22162726672202662
0.22182828520765432
0.20684403384319

In [105]:
training_data

Unnamed: 0,comment_text,split,na_gender,toxicity,male,female,transgender,other_gender
3,"[0.06606747210025787, 0.24235686659812927, 1.2...",train,0,0.545455,1,1,0,0
13,"[0.2409689873456955, 0.4050166606903076, 0.091...",train,0,0.800000,1,0,0,0
21,"[1.4279813766479492, -0.09431479871273041, 0.8...",train,0,0.363636,1,0,0,0
31,"[0.6402150392532349, -0.12306588888168335, 0.9...",train,0,0.800000,1,1,0,0
46,"[0.23048871755599976, 0.21826037764549255, -0....",train,0,0.594595,0,1,0,0
...,...,...,...,...,...,...,...,...
266691,"[0.14579103887081146, 0.39261192083358765, 0.1...",train,0,0.200000,0,1,0,0
266704,"[0.2669728398323059, -0.21337410807609558, -0....",train,0,0.200000,1,1,0,0
266720,"[0.8897033929824829, 0.33627647161483765, 0.36...",train,0,0.200000,1,1,0,0
266725,"[0.4143710434436798, 0.19337496161460876, -0.0...",train,0,0.200000,0,1,0,0


In [12]:
class AdalineGDNormal():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.wArr = np.ones((n_iter, 2))
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
    
    def fit(self, X_train, Y_train):
        # Making sure that arrays are numpy arrays
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        Y_train = np.array(list(map(float, Y_train)))
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.uniform(-3, 3, len(X_train[0]))
        
        # Using n epochs
        for i in range(self.n_iter):
            self.wArr[i][0] = self.w[0]
            self.wArr[i][1] = self.w[1]
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)            
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)            
        return self
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds

In [40]:
#from sklearn.linear_model import LinearRegression

Y_train = np.array(trainingLabels.values.tolist()).reshape(len(trainingLabels), 1)

model = AdalineGDNormal(learning_rate = 0.01, n_iter = 100)
model.fit(X_trainForw2vTransformed, Y_train)

Y_test = np.array(testLabels.values.tolist())
predictions = model.predict(X_testForw2vTransformed)


def CheckAccuracy(predictions, labels):
    acc = 0.0
    for i in range(len(predictions)):
        if (predictions[i] == round(labels[i])):
            acc += 1
    return acc/len(predictions)

CheckAccuracy(predictions, Y_test)


2210.0
1760.0


0.5586666666666666