In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

df = pd.read_csv("all_data_with_identities.csv")

# Find columns needed

In [2]:
df = df.loc[:, ["comment_text", "split", "na_gender", "toxicity", "male", "female", "transgender", "other_gender"]]


In [3]:
training_data = df[df['split'] == 'train']
training_data = training_data[training_data['na_gender'] == 0][:5000]

test_data = df[df['split'] == 'test'][:500]
validation_data = df[df['split'] == 'val']

training_data['comment_text'] = training_data['comment_text'].apply(lambda text: CleanText(text))
training_data['male'] = training_data['male'].apply(lambda x: round(x))
training_data['female'] = training_data['female'].apply(lambda x: round(x))
training_data['transgender'] = training_data['transgender'].apply(lambda x: round(x))
training_data['other_gender'] = training_data['transgender'].apply(lambda x: round(x))

test_data['comment_text'] = test_data['comment_text'].apply(lambda text: CleanText(text))


In [4]:
trainingComments = training_data['comment_text']
trainingLabels = training_data['toxicity']
trainingComments = trainingComments.values.tolist()

testComments = test_data['comment_text']
testComments = testComments.values.tolist()
testLabels = test_data['toxicity']

valComments = validation_data['comment_text']
valComments = valComments.values.tolist()
valLabels = validation_data['toxicity']



In [5]:
# Using count vectorizer with predefined stopwords
cv = CountVectorizer(stop_words='english')

# Creating vocabulary by fitting on trainingdata and transforming to the vocabulary vector
count_vectorTrainingSet = cv.fit_transform(trainingComments)

# Transforming test data to the vocabulary vector
count_vectorTestSet = cv.transform(testComments)

# Using the count_vector to create a matrix containing vectors of each review
def createVectorArr(count_vector):    
    vectors = []
    for i in range(len(count_vector.toarray())):
        vectors.append(count_vector[i].toarray()[0])
    vectors = np.array(vectors)
    return vectors

X_train_vectors = createVectorArr(count_vectorTrainingSet)
X_test_vectors = createVectorArr(count_vectorTestSet)

for i, ind in enumerate(training_data.index):
    training_data['comment_text'][ind] = X_train_vectors[i]
    
for i, ind in enumerate(test_data.index):
    test_data['comment_text'][ind] = X_test_vectors[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['comment_text'][ind] = X_train_vectors[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['comment_text'][ind] = X_test_vectors[i]


In [6]:
class AdalineGD():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            if (predictions[i] == round(labels[i])):
                acc += 1
        return acc/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        # Using the sign function as activation function since labels are {1, -1}
        preds = np.dot(X_test, self.w)
        
        for i in range(len(preds)):
            if preds[i] <= 0:
                preds[i] = 0
            else:
                preds[i] = 1
        return preds
        
    def fit(self, df):
        # Making sure that arrays are numpy arrays
        
        X_train = df['comment_text'].values.tolist()
        Y_train = df['toxicity'].values.tolist()
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
        
        # Updating w vector for each sample
        output = np.dot(X_train, self.w)
        error = (Y_train - output)            
        self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
        dic = {}
        for col in df.columns[4:]:
            tempSet = df[df[col] == 1]
            tempLabels = tempSet['toxicity'].values.tolist()
            tempVecs = tempSet['comment_text'].values.tolist()
            dic[col] = [tempVecs, tempLabels]
        
        # Using n epochs
        for i in range(self.n_iter):
            accuracies = []
            for col in df.columns[4:]:
                tempPredictions = self.predict(dic[col][0])
                currentAcc = self.CheckAccuracy(tempPredictions, dic[col][1])
                accuracies.append([currentAcc, col])
            
            accuracies = np.array(accuracies)
            if i == 0:
                print(accuracies)
            
            worstCol = accuracies[np.argmin(accuracies[:,0])][1]
            print(accuracies[np.argmin(accuracies[:,0])])
            X_train = dic[worstCol][0]
            Y_train = dic[worstCol][1]
            
            X_train = np.array(X_train).reshape((len(X_train), -1))
            Y_train = np.array(Y_train).reshape((len(Y_train), 1))
            
            # Updating w vector for each sample
            output = np.dot(X_train, self.w)
            error = (Y_train - output)
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        print(accuracies)
        return self
    
    

In [7]:
model = AdalineGD(learning_rate = 0.5, n_iter = 20)
model.fit(training_data)
predictions = model.predict(X_test_vectors)
diller = np.array(testLabels.values.tolist())
model.CheckAccuracy(predictions, diller)

[['0.5473083197389886' 'male']
 ['0.5381818181818182' 'female']
 ['0.6158940397350994' 'transgender']
 ['0.6158940397350994' 'other_gender']]
['0.5381818181818182' 'female']
['0.5441322314049587' 'female']
['0.5461157024793388' 'female']
['0.5484297520661157' 'female']
['0.5527272727272727' 'female']
['0.5553719008264463' 'female']
['0.5573553719008264' 'female']
['0.5606611570247934' 'female']
['0.5626446280991736' 'female']
['0.5666115702479338' 'female']
['0.5715702479338843' 'female']
['0.572892561983471' 'female']
['0.5742148760330579' 'female']
['0.5748760330578513' 'female']
['0.5778512396694215' 'female']
['0.5798347107438017' 'female']
['0.5823817292006526' 'male']
['0.5828099173553719' 'female']
['0.5874380165289256' 'female']
['0.5894214876033058' 'female']
[['0.5938009787928222' 'male']
 ['0.5894214876033058' 'female']
 ['0.6158940397350994' 'transgender']
 ['0.6158940397350994' 'other_gender']]


0.602

In [16]:
print(np.sum(np.round(diller)))
print(len(predictions))
print(np.sum(predictions))

372.0
500
351.0
