In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from wilds.common.data_loaders import get_train_loader
import torchvision.transforms as transforms
from sklearn.linear_model import LogisticRegression




In [6]:
df = pd.read_csv("all_data_with_identitiesEmbedded.csv")

In [7]:
def CleanText(text):
    text = re.sub(r'''[\[|\]]''', "", text).split()
    text = np.array(text, dtype="float64")
    return text


In [8]:
df = pd.read_csv("all_data_with_identitiesEmbedded.csv")

df = df.loc[:, ["comment_text", "split", "toxicity"]]
df['toxicity'] = df['toxicity'].apply(lambda score: np.round(score))
df['comment_text'] = df['comment_text'].apply(lambda text: CleanText(text))


In [9]:
# Updating values for training_data
training_data = df[df['split'] == 'train']

# Getting test_data
test_data = df[df['split'] == 'test']

# Getting validation_data
validation_data = df[df['split'] == 'val']

In [12]:
X_train = training_data['comment_text'].values.tolist()
Y_train = training_data['toxicity'].values.tolist()

X_test = test_data['comment_text'].values.tolist()
Y_test = test_data['toxicity'].values.tolist()

X_val = validation_data['comment_text'].values.tolist()
Y_val = validation_data['toxicity'].values.tolist()

In [67]:
class AdalineGD():
    def __init__(self, learning_rate = 0.01, n_iter = 20, w = None):
        self.w = w
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        
    def CheckAccuracy(self, predictions, labels):
        acc = 0.0
        for i in range(len(predictions)):
            #print(predictions[i],labels[i])
            if (predictions[i] == labels[i]):
                acc += 1
        return acc/len(predictions)
    
    def predict(self, X_test):
        # Making sure that array is numpy array
        X_test = np.array(X_test).reshape((len(X_test), -1))
        output = np.dot(X_test, self.w)
        # Sigmoid giving 0-1 results
        
        preds = np.round(1/(1+np.exp(-output)))
        return preds
        
    def fit(self, X_train, Y_train, X_val, Y_val):
        # Making sure that arrays are numpy arrays
        
        X_train = np.array(X_train).reshape((len(X_train), -1))
        Y_train = np.array(Y_train).reshape((len(Y_train), 1))
        
        X_val = np.array(X_val).reshape((len(X_val), -1))
        Y_val = np.array(Y_val).reshape((len(Y_val), 1))
        
        # Initializing w vector using random normal distribution
        if self.w == None:
            self.w = np.random.normal(0, 0.1, len(X_train[0])).reshape(len(X_train[0]), 1)
        
        #badEpoch = 0
        #self.bestW = self.w
        #bestAcc = 0
        #early_stopping = 5
        
        # Using n epochs
        for i in range(self.n_iter):
            output = np.dot(X_train, self.w)
            error = (Y_train - output)
            print(np.sum(error))
            self.w += (1/len(Y_train)) * self.learning_rate * np.dot(X_train.T, error)
        
            #tmpPreds = self.predict(X_val)
            
            #acc = self.CheckAccuracy(tmpPreds, Y_val)
            
            #print(acc)
            
            # Saving the best model and also checks for Early_Stopping
            #if acc > bestAcc:
            #    bestAcc = acc
            #    badEpoch = 0
            #    self.bestW = self.w
            #else:
            #    badEpoch += 1
            #
            #if badEpoch >= early_stopping:
            #    self.w = self.bestW                
            #    print("Stopped cause of bad Epoch in iteration: ", i)
            #    break
                
        return self
    
    

In [68]:
model = AdalineGD(learning_rate = 0.0001, n_iter = 500)

model.fit(X_train, Y_train, X_val, Y_val)

predictions = model.predict(X_test)

print(predictions)
Y_test = np.array(Y_test).reshape(len(Y_test), 1)
print(Y_test)
model.CheckAccuracy(predictions, Y_test)

185051.1394533141
182474.21247528173
179933.92377477358
177429.75234595552
174961.18459184375
172527.71421894868
170128.84213341709
167764.0763386519
165432.9318343862
163134.9305171943
160869.60108241634
158636.47892747776
156435.1060565842
154265.0309867708
152125.80865528807
150017.00032830465
147938.17351090832
145888.9018583867
143868.7650887697
141877.34889661573
139914.24486802355
137979.05039685318
136071.36860213766
134190.80824666942
132336.98365674488
130509.51464304916
128708.02642266714
126932.1495422025
125181.51980199081
123455.77818138966
121754.57076513162
120077.54867072447
118424.36797688356
116794.68965298217
115188.17948950474
113604.50802948956
112043.350500946
110504.38675023295
108987.3011763845
107491.78266636965
106017.5245312725
104564.2244433804
103131.58437416605
101719.3105331521
100327.11330764464
98954.70720332424
97601.81078568133
96268.14662228449
94953.44122586994
93657.42499823995
92379.83217495945
91120.40077083865
89878.87252619112
88654.9928538561

3963.2826568586042
3958.7062490211465
3954.1889307958645
3949.729862700028
3945.3282171885307
3940.983178484037
3936.6939424097063
3932.4597162241826
3928.279718459042
3924.153178758453
3920.079337721143
3916.0574467446395
3912.086767871705
3908.166573638954
3904.2961469276133
3900.4747808165102
3896.701778436874
3892.9764528295386
3889.298126804021
3885.6661327995503
3882.079812748163
3878.5385179396576
3875.041608888601
3871.5884552031316
3868.178435455618
3864.8109370551606
3861.4853561220284
3858.201097363691
3854.9575739527877
3851.754207406635
3848.590427468671
3845.465671991412
3842.3793868212033
3839.331025684461
3836.320050075676
3833.3459291469335
3830.4081395990443
3827.50616557412
3824.639498549891
3821.8076372352643
3819.0100874675195
3816.2463621110005
3813.5159809571182
3810.8184706259094
3808.153364468908
3805.5202024733962
3802.9185311680767
3800.3479035299924
3797.807878892932
3795.2980228569177
3792.8179071992336
[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]
[[1.]
 [0.]


0.49077603863001

In [65]:
sum(predictions)

array([73727.])

In [66]:
sum(Y_test)

array([10840.])