In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text

def CheckAccuracy(predictions, labels):
    acc = 0.0
    for i in range(len(predictions)):
        if (predictions[i] == labels[i]):
            acc += 1
    return acc/len(predictions)

df = pd.read_csv("all_data_with_identities.csv")

In [70]:
training_data = df[df['split'] == 'train']
test_data = df[df['split'] == 'test']
validation_data = df[df['split'] == 'val']

In [71]:
trainingComments = training_data['comment_text']
trainingLabels = training_data['toxicity']

testComments = test_data['comment_text']
testLabels = test_data['toxicity']

valComments = validation_data['comment_text']
valLabels = validation_data['toxicity']


def PrepareData(comment_text, toxicity_labels):
    texts = []
    labels = []
    for txt, label in zip(comment_text, toxicity_labels):
        if(isinstance(txt, str) and isinstance(label, float)):
            texts.append(CleanText(txt))
            labels.append(round(label))
    return texts, labels

X_train, Y_train = PrepareData(trainingComments, trainingLabels)
X_test, Y_test = PrepareData(testComments, testLabels)

In [72]:
print(len(X_train))
print(len(Y_train))
print(len(X_test))
print(len(Y_test))

print(X_train[3])


X_train = np.array(X_train)[:10000]
Y_train = np.array(Y_train)[:10000]

X_test = np.array(X_test)[:2000]
Y_test = np.array(Y_test)[:2000]

269038
269038
133782
133782
lets get the black folks and the white folks at each others throats


In [73]:
# Using count vectorizer with predefined stopwords
cv = CountVectorizer(stop_words='english')

# Creating vocabulary by fitting on trainingdata and transforming to the vocabulary vector
count_vectorTrainingSet = cv.fit_transform(X_train)

# Transforming test data to the vocabulary vector
count_vectorTestSet = cv.transform(X_test)

# Using the count_vector to create a matrix containing vectors of each review
def createVectorArr(count_vector):    
    vectors = []
    for i in range(len(count_vector.toarray())):
        vectors.append(count_vector[i].toarray()[0])
    vectors = np.array(vectors)
    return vectors

X_train_vectors = createVectorArr(count_vectorTrainingSet)
X_test_vectors = createVectorArr(count_vectorTestSet)

# Using sklearn linear Logistic regression
clf = LogisticRegression(random_state=0)
clf.fit(X_train_vectors, Y_train)
predictions = clf.predict(X_test_vectors)

print("Accuracy on TestSet: ", CheckAccuracy(predictions, Y_test))


Accuracy on TestSet:  0.7535


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
class AdalineSGD():

    def __init__(self, learning_rate = 0.01, n_iter = 20, shuffle = True):
        self.w = None
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.shuffle = shuffle
    
    def fit(self, X_train, Y_train):
        # Making sure that arrays are numpy arrays
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        # prepend a column of ones
        ones = np.ones((X_train.shape[0], 1))
        X_train = np.concatenate((ones, X_train), axis=1)
        Y_train = np.array(list(map(float, Y_train)))
        
        # Initializing w vector using random normal distribution
        self.w = np.random.normal(loc=0.0, scale=0.01, size=len(X_train[0]))
        
        # Generating a cost array for visualisation
        self.costArr_ = []
        
        # Using n epochs       
        for i in range(self.n_iter):
            
            # Random shuffling the training data since stochastic gradient descent
            if self.shuffle:
                rand = np.random.permutation(len(X_train))
                X_train = X_train[rand]
                Y_train = Y_train[rand]
            
            tempCostArr = []
            
            # Updating w vector for each sample         
            for xi, label in zip(X_train, Y_train):
                output = np.dot(xi, self.w)
                error = (label - output)
                self.w += self.learning_rate * np.dot(xi, error)                
                tempCostArr.append(0.5 * error**2)
            avg_cost = sum(tempCostArr)/len(Y_train)
            self.costArr_.append(avg_cost)
        return self
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        
        # Adding ones in columns
        ones = np.ones((X_test.shape[0], 1))
        X_test = np.concatenate((ones, X_test), axis=1)
        
        # Using the sign function as activation function since labels are {1, -1}
        return np.sign(np.matmul(X_test, self.w))

In [75]:
# Preparing reviews in list of lists format
X_trainForw2v = []
X_testForw2v = []

for sentence in X_train:
    X_trainForw2v.append(sentence.split(' '))

for sentence in X_test:
    X_testForw2v.append(sentence.split(' '))

X_trainForw2v = np.array(X_trainForw2v)
X_testForw2v = np.array(X_testForw2v)

  X_trainForw2v = np.array(X_trainForw2v)
  X_testForw2v = np.array(X_testForw2v)


In [108]:
# Do not want words with overall less than 3 appearances to count, generating vector of size 200 for each word
modelw2v = Word2Vec(X_trainForw2v, min_count=3, size=100, iter=60)

In [109]:
def TransformToEmbedding(model, data):
    transformed = np.zeros((len(data), model.vector_size))
    for i, sentence in enumerate(data):
        currentLst = []
        for word in sentence:
            if word in model.wv.vocab.keys():
                currentLst.append(model.wv[word])
        if (len(currentLst) > 0):
            currentLst = np.mean(np.array(currentLst), axis=0)
        else:
            currentLst = np.zeros(model.vector_size)
        transformed[i] = np.array(currentLst)
    return np.array(transformed)


X_trainForw2vTransformed = TransformToEmbedding(modelw2v, X_trainForw2v)
X_testForw2vTransformed = TransformToEmbedding(modelw2v, X_testForw2v)

In [110]:

# Using sklearn linear Logistic regression
clf1 = LogisticRegression(random_state=0)
clf1.fit(X_trainForw2vTransformed, Y_train)
predictions = clf1.predict(X_testForw2vTransformed)
print("Accuracy on TestSet: ", CheckAccuracy(predictions, Y_test))



Accuracy on TestSet:  0.7345
