In [20]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import csv

In [21]:
def _load_labelled(csv_path):
    """Read ``csv_path`` and split it column-wise.

    Returns a ``(frame, features, labels)`` tuple: the full frame as read,
    every column except the last, and the last column kept as a one-column
    DataFrame.
    """
    frame = pd.read_csv(csv_path)
    return frame, frame.iloc[:, :-1], frame.iloc[:, -1:]


def _load_unlabelled(csv_path):
    """Read ``csv_path`` and return it without its last column."""
    # NOTE(review): presumably the dropped column is a placeholder label in
    # the anonymised eval files -- verify against the data description.
    return pd.read_csv(csv_path).iloc[:, :-1]


# ROBERTA
train, train_x, train_y = _load_labelled('data//roberta//roberta.train.csv')
test, test_x, test_y = _load_labelled('data//roberta//roberta.test.csv')
eval_df = _load_unlabelled('data//roberta//roberta.eval.anon.csv')

#tfidf
train_tfidf, train_x_tfidf, train_y_tfidf = _load_labelled('data//tfidf//tfidf.train.csv')
test_tfidf, test_x_tfidf, test_y_tfidf = _load_labelled('data//tfidf//tfidf.test.csv')
eval_df_tfidf = _load_unlabelled('data//tfidf//tfidf.eval.anon.csv')

#SPACY
train_spacy, train_x_spacy, train_y_spacy = _load_labelled(
    'data//spacy-embeddings//spacy-embeddings.train.csv'
)
test_spacy, test_x_spacy, test_y_spacy = _load_labelled(
    'data//spacy-embeddings//spacy-embeddings.test.csv'
)
eval_df_spacy = _load_unlabelled('data//spacy-embeddings//spacy-embeddings.eval.anon.csv')

In [22]:
# Recode label 0 as -1 in every label frame: prediction() and update_w()
# compare against / multiply by labels in {-1, +1}.
train_y, train_y_tfidf, train_y_spacy = (
    labels.replace(0, -1) for labels in (train_y, train_y_tfidf, train_y_spacy)
)
test_y, test_y_tfidf, test_y_spacy = (
    labels.replace(0, -1) for labels in (test_y, test_y_tfidf, test_y_spacy)
)

In [23]:
# Prefix every feature column with the name of the representation it came
# from, so the two feature sets cannot collide when they are joined below.
# add_prefix relabels all columns in one pass; renaming them one at a time
# rewrites the whole column index per column, which is quadratic in the
# number of columns.
# Note: re-running this cell prepends the prefix a second time.

# tfidf
train_x_tfidf = train_x_tfidf.add_prefix('tfidf_')
test_x_tfidf = test_x_tfidf.add_prefix('tfidf_')
eval_df_tfidf = eval_df_tfidf.add_prefix('tfidf_')

# SPACY
train_x_spacy = train_x_spacy.add_prefix('spacy_')
test_x_spacy = test_x_spacy.add_prefix('spacy_')
eval_df_spacy = eval_df_spacy.add_prefix('spacy_')

#MERGE
# join() aligns rows on the index, so this places the spacy columns to the
# right of the tfidf columns for rows sharing the same index label.
# NOTE(review): this assumes both files list the examples in the same order
# -- confirm against the data description.
train_x = train_x_tfidf.join(train_x_spacy)
test_x = test_x_tfidf.join(test_x_spacy)
eval_df = eval_df_tfidf.join(eval_df_spacy)

In [24]:
def prediction(x, y, w):
    """Tell whether the linear model ``w`` classifies ``x`` as its label ``y``.

    The score is the dot product of ``w`` and ``x``. A strictly positive
    score counts as class ``1``; a zero or negative score counts as class
    ``-1``.

    Returns:
        ``True`` when the predicted class equals ``y``, ``False`` otherwise
        (including when the score is NaN, which matches neither class).
    """
    score = np.dot(w.T, x)
    if score > 0:
        return bool(y == 1)
    if score <= 0:
        return bool(y == -1)
    # Only reachable when the score compares false both ways (NaN).
    return False

def accuracy(feature_df, label_df, w):
    """Return the fraction of rows of ``feature_df`` that ``w`` classifies correctly.

    Args:
        feature_df: DataFrame with one example per row.
        label_df: DataFrame whose first column holds the label of the row at
            the same position in ``feature_df``.
        w: weight vector passed to ``prediction``.

    Returns:
        Number of correctly classified rows divided by the number of rows.
    """
    size = feature_df.shape[0]
    hits = sum(
        1
        for row in range(size)
        if prediction(feature_df.iloc[row].tolist(), label_df.iloc[row].tolist()[0], w)
    )
    return hits / size

def prediction_value(x, w):
    """Return the 0/1 class that the linear model ``w`` assigns to ``x``.

    Returns ``1`` when the dot product of ``w`` and ``x`` is strictly
    positive and ``0`` otherwise.
    """
    return 1 if np.dot(w.T, x) > 0 else 0

def get_predicted_list(feature_df, w):
    """Return the 0/1 prediction of ``w`` for every row of ``feature_df``.

    Args:
        feature_df: DataFrame with one example per row.
        w: weight vector passed to ``prediction_value``.

    Returns:
        A list with one entry per row, in row order, each being the value
        returned by ``prediction_value`` for that row.
    """
    return [
        prediction_value(feature_df.iloc[row].tolist(), w)
        for row in range(feature_df.shape[0])
    ]

def generate_w(size):
    """Return a reproducible random initial weight vector of the given size.

    The global NumPy random generator is re-seeded with 65 on every call, so
    repeated calls with the same ``size`` return identical values. As a side
    effect, the global random state is reset.
    """
    np.random.seed(65)
    # NOTE(review): the draws are normal with mean -0.01 and standard
    # deviation 0.01, i.e. centred on -0.01 rather than on 0 -- confirm that
    # this is the intended initialisation.
    return np.asarray(np.random.normal(loc=-0.01, scale=0.01, size=size))

def update_w(x, y, w, lr, c, update_count):
    """Apply one stochastic update step to the weight vector.

    Every step shrinks the weights by a factor ``(1 - lr)``. When the example
    is inside the margin (``y * (w . x) <= 1``) the step additionally adds
    ``lr * c * y * x``.

    Args:
        x: 1-D feature vector (ndarray, pandas Series or sequence of numbers).
        y: label of the example, expected to be ``1`` or ``-1``.
        w: current 1-D weight vector, same length as ``x``.
        lr: learning rate for this step.
        c: trade-off factor multiplying the data term.
        update_count: running number of steps taken so far.

    Returns:
        ``(new_w, update_count + 1)`` where ``new_w`` is always a NumPy array.
    """
    # Work on plain arrays. If x stays a pandas Series, the addition below
    # turns w into a Series as well, and every later step then pays for
    # label-aligned Series arithmetic. The numeric result is the same.
    x = np.asarray(x)
    w = np.asarray(w)

    margin = y * np.dot(w, x)
    new_w = w * (1 - lr)
    if margin <= 1:
        new_w = new_w + (lr * c * y * x)
    return new_w, update_count + 1

def perceptron(features, labels, w, lr, epochs, c=0, dev=False, test_df_x=None, test_df_y=None):
    """Train a linear model by visiting every example once per epoch.

    Each example is handed to ``update_w`` together with the current weights.
    The learning rate used in epoch ``e`` (0-based) is ``lr / (1 + e)``.
    Before each epoch the global NumPy generator is re-seeded with the epoch
    number and the visiting order is shuffled in place, so each epoch's order
    is a reproducible reshuffle of the previous epoch's order.

    Args:
        features: DataFrame with one training example per row.
        labels: DataFrame whose first column holds the matching labels.
        w: initial weight vector.
        lr: base learning rate.
        epochs: number of passes over the data.
        c: trade-off factor forwarded to ``update_w``; with the default ``0``
            the data term of ``update_w`` is multiplied by zero.
        dev: when true, evaluate on ``test_df_x`` / ``test_df_y`` after every
            epoch and record the weights and accuracy.
        test_df_x, test_df_y: evaluation features / labels, needed only when
            ``dev`` is true.

    Returns:
        ``(w, accuracies_list, update_count, w_list)``: the final weights,
        the per-epoch evaluation accuracies, the number of ``update_w`` calls
        and the per-epoch weight snapshots. Both lists are empty when ``dev``
        is false.
    """
    weight_history = []
    dev_accuracies = []
    n_updates = 0
    order = np.arange(features.shape[0])

    for epoch in range(epochs):
        print("Epoch:", epoch)
        step_size = lr / (1 + epoch)
        np.random.seed(epoch)
        np.random.shuffle(order)

        for row in order:
            example = features.iloc[row]
            target = labels.iloc[row].tolist()[0]
            w, n_updates = update_w(example, target, w, step_size, c, n_updates)

        if not dev:
            continue

        weight_history.append(w.copy())
        dev_acc = accuracy(test_df_x, test_df_y, w)
        dev_accuracies.append(dev_acc)
        print("Developmental dataset accuracy for epoch", epoch, "=", dev_acc)

    return w, dev_accuracies, n_updates, weight_history

In [25]:
#CROSS VALIDATION
# NOTE(review): as written this is a single hold-out split (first 4/5 of the
# rows for fitting, the remaining rows for scoring), not k-fold cross
# validation. `k` is assigned below but never read in this cell, and each
# reported accuracy comes from one run rather than an average.

train_length = int(train_x.shape[0]*4//5)
test_length = train_x.shape[0] - train_length

# Leading rows are used for fitting ...
train_x_fold = train_x.iloc[:train_length]
train_y_fold = train_y.iloc[:train_length]

# ... and the trailing `test_length` rows for scoring.
test_x_fold = train_x.iloc[train_length:]
test_y_fold = train_y.iloc[train_length:]

k = 5
lrs = [1, 0.1, 0.01, 0.001, 0.0001]
tradeoff = [10, 1, 0.1, 0.01, 0.001]

epochs = 20

# Filled in grid order (learning rate outer, trade-off inner); that order
# also decides which pair wins when several share the top accuracy.
accuracies = {}
for lr in lrs:
    for c in tradeoff:
        print("Lr:", lr, "C:", c)
        w = generate_w(train_x_fold.shape[1])
        w, accuracies_list, update_count, w_list = perceptron(
            train_x_fold, train_y_fold, w, lr, epochs, c
        )
        acc = accuracy(test_x_fold, test_y_fold, w)
        accuracies[(lr, c)] = acc


for (i, j), score in accuracies.items():
    print("Averaged test accuracy for (learning rate, tradeoff):(",i,j,"):",score)
    print()

best_lr_c = max(accuracies, key=accuracies.get)
print("(LR, m) pair with highest accuracy:", best_lr_c)
print("Accuracy with the best learning rate, tradeoff pair:",accuracies[best_lr_c])

Lr: 1 C: 10
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Lr: 1 C: 1
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Lr: 1 C: 0.1
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Lr: 1 C: 0.01
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Lr: 1 C: 0.001
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch

In [26]:
# Retrain on the full training set with the selected (learning rate,
# trade-off) pair, scoring the held-out rows after every epoch, and keep the
# weights from the epoch with the highest score.
# NOTE(review): test_x_fold / test_y_fold are the trailing rows of train_x /
# train_y, so the "developmental" accuracy here is measured on rows the model
# is also trained on -- confirm this is intended.
lr, c = best_lr_c
epochs = 20
w = generate_w(train_x.shape[1])
w, accuracies_list, update_count, w_list = perceptron(
    train_x, train_y, w, lr, epochs, c,
    dev=True, test_df_x=test_x_fold, test_df_y=test_y_fold,
)

print()
print("Total number of updates on training set:", update_count)
best_dev_accuracy = max(accuracies_list)
print("Max accuracy in developmental dataset:", best_dev_accuracy)
# list.index returns the first epoch that reached the maximum.
index = accuracies_list.index(best_dev_accuracy)
print("Max accuracy in developmental dataset was found at epoch number:", index)
best_w = w_list[index]
print("Train accuracy with w and b corresponding to best accuracy for dev dataset =", accuracy(train_x, train_y, best_w))
print("Test accuracy with w and b corresponding to best accuracy for dev dataset =", accuracy(test_x, test_y, best_w))
print()

Epoch: 0
Developmental dataset accuracy for epoch 0 = 0.731311706629055
Epoch: 1
Developmental dataset accuracy for epoch 1 = 0.7362482369534555
Epoch: 2
Developmental dataset accuracy for epoch 2 = 0.7136812411847673
Epoch: 3
Developmental dataset accuracy for epoch 3 = 0.7052186177715092
Epoch: 4
Developmental dataset accuracy for epoch 4 = 0.7581100141043724
Epoch: 5
Developmental dataset accuracy for epoch 5 = 0.7566995768688294
Epoch: 6
Developmental dataset accuracy for epoch 6 = 0.735543018335684
Epoch: 7
Developmental dataset accuracy for epoch 7 = 0.7545839210155149
Epoch: 8
Developmental dataset accuracy for epoch 8 = 0.7531734837799718
Epoch: 9
Developmental dataset accuracy for epoch 9 = 0.7595204513399154
Epoch: 10
Developmental dataset accuracy for epoch 10 = 0.7440056417489421
Epoch: 11
Developmental dataset accuracy for epoch 11 = 0.7588152327221439
Epoch: 12
Developmental dataset accuracy for epoch 12 = 0.7447108603667136
Epoch: 13
Developmental dataset accuracy for ep

In [27]:
# 0/1 predictions of the selected weights for every row of the eval features.
pred = get_predicted_list(feature_df=eval_df, w=best_w)

In [28]:
# Write one row per prediction; `example_id` is the position of the
# prediction in `pred`.
with open('5_svm-sgd-tfidf-spacy_65s_20e.csv', 'w', newline ='') as file:
    header = ['example_id', 'label']
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()

    for i, label in enumerate(pred):
        writer.writerow({'example_id': i, 'label': label})