In [53]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import csv

In [54]:
train = pd.read_csv('data//roberta//roberta.train.csv')
train_x = train.iloc[:,:-1]
train_y = train.iloc[:,-1:]

train_tfidf = pd.read_csv('data//tfidf//tfidf.train.csv')
train_x_tfidf = train_tfidf.iloc[:,:-1]
train_y_tfidf = train_tfidf.iloc[:,-1:]

test = pd.read_csv('data//roberta//roberta.test.csv')
test_x = test.iloc[:,:-1]
test_y = test.iloc[:,-1:]

test_tfidf = pd.read_csv('data//tfidf//tfidf.test.csv')
test_x_tfidf = test_tfidf.iloc[:,:-1]
test_y_tfidf = test_tfidf.iloc[:,-1:]

eval_df = pd.read_csv('data//roberta//roberta.eval.anon.csv')
eval_df = eval_df.iloc[:,:-1]

eval_df_tfidf = pd.read_csv('data//tfidf//tfidf.eval.anon.csv')
eval_df_tfidf = eval_df_tfidf.iloc[:,:-1]

In [55]:
train_y = train_y.replace(0, -1)
train_y_tfidf = train_y_tfidf.replace(0, -1)
test_y = test_y.replace(0, -1)
test_y_tfidf = test_y_tfidf.replace(0, -1)

In [56]:
for i in range(train_x.shape[1]):
    col = train_x.columns[i]
    train_x.rename(columns = {col:'roberta_'+col}, inplace = True)

for i in range(train_x_tfidf.shape[1]):
    col = train_x_tfidf.columns[i]
    train_x_tfidf.rename(columns = {col:'tfidf_'+col}, inplace = True)

for i in range(test_x.shape[1]):
    col = test_x.columns[i]
    test_x.rename(columns = {col:'roberta_'+col}, inplace = True)

for i in range(test_x_tfidf.shape[1]):
    col = test_x_tfidf.columns[i]
    test_x_tfidf.rename(columns = {col:'tfidf_'+col}, inplace = True)

for i in range(eval_df.shape[1]):
    col = eval_df.columns[i]
    eval_df.rename(columns = {col:'roberta_'+col}, inplace = True)

for i in range(eval_df_tfidf.shape[1]):
    col = eval_df_tfidf.columns[i]
    eval_df_tfidf.rename(columns = {col:'tfidf_'+col}, inplace = True)

col = train_x_tfidf[train_x_tfidf.columns]
train_x = train_x.join(col)

col = test_x_tfidf[test_x_tfidf.columns]
test_x = test_x.join(col)

col = eval_df_tfidf[eval_df_tfidf.columns]
eval_df = eval_df.join(col)

In [57]:
def prediction(x, y, w, b):
    pred = np.dot(w.transpose(), x) + b
    if (pred > 0 and y == 1) or (pred < 0 and y == -1):
        return True
    else:
        return False

def accuracy(feature_df, label_df, w, b):
    size = feature_df.shape[0]
    count = 0
    for i in range(size):
        x = feature_df.iloc[i].tolist()
        y = label_df.iloc[i].tolist()[0]
        if prediction(x, y, w, b):
            count += 1
    return (count/size)

def prediction_value(x, w, b):
    pred = np.dot(w.transpose(), x) + b
    if pred > 0:
        return 1
    else:
        return 0

def get_predicted_list(feature_df, w, b):
    pred_list = []
    size = feature_df.shape[0]
    count = 0
    for i in range(size):
        x = feature_df.iloc[i].tolist()
        pred_list.append(prediction_value(x, w, b))
    return pred_list

def generate_w_b(size):
    np.random.seed(65)
    w = np.array(np.random.normal(-0.01, 0.01, size))
    b = np.random.normal(-0.01, 0.01)
    w_a = w
    b_a = b
    return w, b, w_a, b_a

def update_w_b(x, y, w, b, w_a, b_a, lr, update_count):
    eq = y*(np.dot(w.transpose(), x) + b)
    x = np.array(x)
    if eq < 0:
        w = w + lr*y*x
        b = b + lr*y
        update_count += 1
    w_a = w_a + w
    b_a = b_a + b
    return w, b, w_a, b_a, update_count

def perceptron(features, labels, w, b, w_a, b_a, lr, epochs, dev=False, test_df_x=None, test_df_y=None):
    w_list = []
    b_list = []
    update_count = 0
    accuracies_list = []
    index_list = np.arange(features.shape[0])
    for e in range(epochs):
        print("epoch:", e)
        np.random.seed(e)
        np.random.shuffle(index_list)
        for i in index_list:
            x = features.iloc[i].tolist()
            y = labels.iloc[i].tolist()[0]
            w, b, w_a, b_a, update_count = update_w_b(x, y, w, b, w_a, b_a, lr, update_count)
        if dev:
            b_list.append(b_a)
            w_list.append(w_a.copy())
            acc = accuracy(test_df_x, test_df_y, w_a, b_a)
            accuracies_list.append(acc)
            print("Developmental dataset accuracy for epoch", e, "=", acc)
    if dev:
        return w, b, w_a, b_a, accuracies_list, update_count, w_list, b_list
    else:
        return w, b, w_a, b_a, update_count

In [58]:
#CROSS VALIDATION
k = 5
epochs = 20
lrs = [1, 0.1, 0.01]

train_length = int(train_x.shape[0]*4//5)
test_length = train_x.shape[0] - train_length

train_x_fold = train_x.head(train_length)
train_y_fold = train_y.head(train_length)

test_x_fold = train_x.tail(test_length)
test_y_fold = train_y.tail(test_length)

accuracies = {}

for lr in lrs:
    print("lr:", lr)
    w, b, w_a, b_a = generate_w_b(train_x.shape[1])
    w, b, w_a, b_a, update_count = perceptron(train_x_fold, train_y_fold, w, b, w_a, b_a, lr, epochs)
    acc = accuracy(test_x_fold, test_y_fold, w_a, b_a)
    accuracies[lr] = acc
print(accuracies)

lr: 1
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
lr: 0.1
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
lr: 0.01
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
{1: 0.8272214386459803, 0.1: 0.8265162200282088, 0.01: 0.8300423131170663}


In [59]:
best_lr = max(accuracies, key = lambda x: accuracies[x])
lr = best_lr
print("Best LR:", lr)
epochs = 20

w, b, w_a, b_a = generate_w_b(train_x.shape[1])
w, b, w_a, b_a, accuracies_list, update_count, w_list, b_list = perceptron(train_x, train_y, w, b, w_a, b_a, lr, epochs, True, test_x_fold, test_y_fold)

print()
print("Total number of updates on training set:", update_count)
print("Max accuracy in developmental dataset:", max(accuracies_list))
index = accuracies_list.index(max(accuracies_list))
print("Max accuracy in developmental dataset was found at epoch number:", index)
best_w = w_list[index]
best_b = b_list[index]
print("Train accuracy with w and b corresponding to best accuracy for dev dataset =", accuracy(train_x, train_y, best_w, best_b))
print("Test accuracy with w and b corresponding to best accuracy for dev dataset =", accuracy(test_x, test_y, best_w, best_b))
print()

Best LR: 0.01
epoch: 0
Developmental dataset accuracy for epoch 0 = 0.5437235543018336
epoch: 1
Developmental dataset accuracy for epoch 1 = 0.8067700987306065
epoch: 2
Developmental dataset accuracy for epoch 2 = 0.8166431593794076
epoch: 3
Developmental dataset accuracy for epoch 3 = 0.8201692524682651
epoch: 4
Developmental dataset accuracy for epoch 4 = 0.8229901269393513
epoch: 5
Developmental dataset accuracy for epoch 5 = 0.8279266572637518
epoch: 6
Developmental dataset accuracy for epoch 6 = 0.8321579689703809
epoch: 7
Developmental dataset accuracy for epoch 7 = 0.8363892806770099
epoch: 8
Developmental dataset accuracy for epoch 8 = 0.840620592383639
epoch: 9
Developmental dataset accuracy for epoch 9 = 0.842031029619182
epoch: 10
Developmental dataset accuracy for epoch 10 = 0.843441466854725
epoch: 11
Developmental dataset accuracy for epoch 11 = 0.847672778561354
epoch: 12
Developmental dataset accuracy for epoch 12 = 0.8519040902679831
epoch: 13
Developmental dataset acc

In [60]:
pred = get_predicted_list(eval_df, best_w, best_b)

In [61]:
file = open('1_avgPerceptron_roberta_tfidf_65s_20e.csv', 'w', newline ='')

with file:
    header = ['example_id', 'label']
    writer = csv.DictWriter(file, fieldnames = header)

    writer.writeheader()
    for i in range(len(pred)):
        writer.writerow({'example_id' : i, 'label': pred[i]})