In [34]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import csv
from math import exp

In [35]:
# Load the RoBERTa and TF-IDF tables for the train / test / eval splits.
# For train and test, the final column of each CSV is separated out as the
# label frame and every preceding column is kept as the feature frame.
# For the eval files the final column is simply dropped.
def _features_and_labels(frame):
    """Return (all columns but the last, the last column as a one-column frame)."""
    return frame.iloc[:, :-1], frame.iloc[:, -1:]


train = pd.read_csv('data//roberta//roberta.train.csv')
train_x, train_y = _features_and_labels(train)

train_tfidf = pd.read_csv('data//tfidf//tfidf.train.csv')
train_x_tfidf, train_y_tfidf = _features_and_labels(train_tfidf)

test = pd.read_csv('data//roberta//roberta.test.csv')
test_x, test_y = _features_and_labels(test)

test_tfidf = pd.read_csv('data//tfidf//tfidf.test.csv')
test_x_tfidf, test_y_tfidf = _features_and_labels(test_tfidf)

eval_df = pd.read_csv('data//roberta//roberta.eval.anon.csv').iloc[:, :-1]

eval_df_tfidf = pd.read_csv('data//tfidf//tfidf.eval.anon.csv').iloc[:, :-1]

In [36]:
# Re-encode the label frames: every 0 becomes -1, all other values are kept.
# The perceptron helpers below compare labels against 1 and -1.
train_y, train_y_tfidf, test_y, test_y_tfidf = (
    label_frame.replace(0, -1)
    for label_frame in (train_y, train_y_tfidf, test_y, test_y_tfidf)
)

In [37]:
# Tag every feature column with the representation it came from, then append
# the TF-IDF columns to the RoBERTa columns (index-aligned join).
#
# `add_prefix` renames all columns in a single pass and returns a new frame.
# The previous version called `rename(..., inplace=True)` once per column,
# which is quadratic in the number of columns and mutated frames that were
# created as `iloc` slices of the raw tables.
#
# NOTE: this cell is not idempotent. Running it twice on the same kernel
# prefixes the columns again, so re-run the loading cell first.
train_x_tfidf = train_x_tfidf.add_prefix('tfidf_')
test_x_tfidf = test_x_tfidf.add_prefix('tfidf_')
eval_df_tfidf = eval_df_tfidf.add_prefix('tfidf_')

train_x = train_x.add_prefix('roberta_').join(train_x_tfidf)
test_x = test_x.add_prefix('roberta_').join(test_x_tfidf)
eval_df = eval_df.add_prefix('roberta_').join(eval_df_tfidf)

In [38]:
def prediction(x, y, w, b):
    """Tell whether the linear model ``(w, b)`` classifies ``x`` as ``y``.

    Args:
        x: feature vector (anything ``np.dot`` accepts against ``w``).
        y: expected label, compared against 1 and -1.
        w: weight vector (NumPy array).
        b: bias term.

    Returns:
        True when the score ``w . x + b`` is strictly positive and ``y`` is 1,
        or strictly negative and ``y`` is -1. False otherwise, including when
        the score is exactly zero.
    """
    score = np.dot(w.transpose(), x) + b
    if score > 0:
        return bool(y == 1)
    if score < 0:
        return bool(y == -1)
    # A score of exactly zero never counts as a correct classification.
    return False

def accuracy(feature_df, label_df, w, b, flag=False):
    """Fraction of rows that the linear model ``(w, b)`` classifies correctly.

    Args:
        feature_df: DataFrame with one example per row.
        label_df: DataFrame whose first column holds the label of each row.
        w: weight vector.
        b: bias term.
        flag: when True, also return the per-row correctness flags.

    Returns:
        ``correct / rows`` when ``flag`` is False, otherwise the tuple
        ``(correct / rows, per_row_flags)`` where each flag is the boolean
        returned by ``prediction`` for that row.
    """
    n_rows = feature_df.shape[0]
    outcomes = [
        prediction(
            feature_df.iloc[row].tolist(),
            label_df.iloc[row].tolist()[0],
            w,
            b,
        )
        for row in range(n_rows)
    ]
    hits = sum(1 for is_correct in outcomes if is_correct)
    fraction = hits / n_rows
    if flag:
        return fraction, outcomes
    return fraction

def prediction_value(x, w, b):
    """Classify ``x`` with the linear model ``(w, b)`` using 0/1 labels.

    Returns:
        1 when ``w . x + b`` is strictly positive, otherwise 0.
    """
    score = np.dot(w.transpose(), x) + b
    return 1 if score > 0 else 0

def get_predicted_list(feature_df, w, b):
    """Return the 0/1 prediction of ``(w, b)`` for every row of ``feature_df``.

    Each row is converted to a plain list and passed to ``prediction_value``.
    The results are returned in row order.
    """
    return [
        prediction_value(feature_df.iloc[row].tolist(), w, b)
        for row in range(feature_df.shape[0])
    ]

def generate_w_b(size, seed=65):
    """Draw reproducible initial perceptron parameters.

    Args:
        size: number of weights to draw.
        seed: value passed to ``np.random.seed`` before drawing. The default
            of 65 reproduces the values this notebook has always used.

    Returns:
        ``(w, b)`` where ``w`` is an array of ``size`` samples and ``b`` is a
        single sample, all drawn from a normal distribution with mean -0.01
        and standard deviation 0.01.

    Note:
        This reseeds NumPy's *global* random generator as a side effect.
    """
    np.random.seed(seed)
    w = np.random.normal(-0.01, 0.01, size)
    b = np.random.normal(-0.01, 0.01)
    return w, b

def update_w_b(x, y, w, b, lr):
    """Apply one perceptron update step for the example ``(x, y)``.

    Args:
        x: feature vector of the example.
        y: label of the example.
        w: current weight vector.
        b: current bias.
        lr: learning rate that scales the correction.

    Returns:
        ``(w, b)``. They are unchanged unless ``y * (w . x + b)`` is strictly
        negative, in which case ``lr * y * x`` is added to the weights and
        ``lr * y`` to the bias.
    """
    margin = y * (np.dot(w.transpose(), x) + b)
    if not margin < 0:
        # Non-negative margin (zero included): no correction is applied.
        return w, b
    step = lr * y
    return w + step * np.array(x), b + step

def calculate_error(pred_list, D):
    """Sum the weights in ``D`` of the examples flagged as misclassified.

    Args:
        pred_list: per-example correctness flags; an entry equal to False
            marks a misclassified example.
        D: per-example weights, indexed like ``pred_list``.

    Returns:
        The accumulated weight of the misclassified examples (the integer 0
        when no entry is False).
    """
    weighted_error = 0
    for position, was_correct in enumerate(pred_list):
        if was_correct == False:  # equality test kept on purpose
            weighted_error += D[position]
    return weighted_error

def calculate_alpha(error, eps=1e-10):
    """Compute the vote weight ``0.5 * ln((1 - error) / error)``.

    Args:
        error: weighted error of a weak learner, expected in ``[0, 1]``.
        eps: the error is clipped into ``[eps, 1 - eps]`` before the formula
            is applied. Without this, ``error == 0`` divides by zero and
            ``error == 1`` takes the log of zero.

    Returns:
        The vote weight: positive for ``error < 0.5``, zero at 0.5, negative
        above 0.5. It is always finite for inputs in ``[0, 1]``.
    """
    clipped = min(max(error, eps), 1.0 - eps)
    return 0.5 * np.log((1 - clipped) / clipped)

def update_D(D, pred_list, alpha):
    """Re-weight the example distribution after one boosting round.

    Correctly classified examples (truthy entry in ``pred_list``) are scaled
    by ``exp(-alpha)`` and the others by ``exp(alpha)``. The weights are then
    normalised to sum to one.

    Args:
        D: current per-example weights (list or array). It is NOT modified;
            the previous implementation overwrote it in place.
        pred_list: per-example correctness flags, indexed like ``D``.
        alpha: vote weight of the round's weak learner.

    Returns:
        A new list with the normalised weights.
    """
    # The two scale factors do not depend on the example, so compute them once.
    shrink = exp(-1 * alpha)
    grow = exp(alpha)
    scaled = [
        D[i] * (shrink if pred_list[i] else grow)
        for i in range(len(D))
    ]
    Z = np.sum(scaled)
    return [value / Z for value in scaled]

def perceptron(features, labels, w, b, lr, epochs, dev=False, test_df_x=None, test_df_y=None):
    """Train a perceptron for ``epochs`` passes over ``features``.

    Before each pass the global NumPy RNG is reseeded with the epoch number
    and the row order is shuffled in place, so the order of one epoch is a
    shuffle of the previous epoch's order. Every row is then passed to
    ``update_w_b``.

    Args:
        features: DataFrame with one training example per row.
        labels: DataFrame whose first column holds the label of each row.
        w: initial weight vector.
        b: initial bias.
        lr: learning rate forwarded to ``update_w_b``.
        epochs: number of passes over the data.
        dev: when True, evaluate on ``test_df_x`` / ``test_df_y`` after every
            epoch and keep a snapshot of the parameters.
        test_df_x: evaluation features (used only when ``dev`` is True).
        test_df_y: evaluation labels (used only when ``dev`` is True).

    Returns:
        ``(w, b)`` when ``dev`` is False; otherwise
        ``(w, b, accuracies, weight_snapshots, bias_snapshots)`` with one
        entry per epoch in each list.
    """
    weight_history, bias_history, accuracy_history = [], [], []
    order = np.arange(features.shape[0])
    for epoch in range(epochs):
        print("epoch:", epoch)
        np.random.seed(epoch)
        np.random.shuffle(order)
        for row in order:
            w, b = update_w_b(
                features.iloc[row].tolist(),
                labels.iloc[row].tolist()[0],
                w,
                b,
                lr,
            )
        if not dev:
            continue
        bias_history.append(b)
        weight_history.append(w.copy())
        epoch_accuracy = accuracy(test_df_x, test_df_y, w, b)
        accuracy_history.append(epoch_accuracy)
        print("Developmental dataset accuracy for epoch", epoch, "=", epoch_accuracy)
    if dev:
        return w, b, accuracy_history, weight_history, bias_history
    return w, b

In [39]:
def boosting(train_x, train_y, lr, epochs, rounds=20, seed=65):
    """Train an AdaBoost-style ensemble of perceptrons.

    Each round trains a perceptron, measures its weighted error on the full
    training set under the current example distribution ``D``, derives its
    vote weight and re-weights ``D``.

    The weak learner has no notion of example weights, so from the second
    round on it is trained on a bootstrap sample drawn from ``D`` (boosting
    by resampling). Previously ``D`` never reached the learner, and because
    the initial parameters and shuffles are seeded deterministically, every
    round produced the same perceptron.

    Args:
        train_x: DataFrame of training features, one example per row.
        train_y: DataFrame whose first column holds the training labels.
        lr: learning rate forwarded to ``perceptron``.
        epochs: epochs per weak learner, forwarded to ``perceptron``.
        rounds: number of weak learners to train (default 20, the value that
            used to be hard-coded).
        seed: seed of the private generator used for resampling.

    Returns:
        ``(alphas, weights, bias)``: three parallel lists with the vote
        weight, weight vector and bias of every weak learner.
    """
    alphas = []
    weights = []
    bias = []
    N = train_x.shape[0]
    D = np.ones(N) / N

    # Private generator: perceptron() reseeds the global NumPy RNG on every
    # epoch, so drawing from the global stream would repeat the same sample.
    sampler = np.random.RandomState(seed)
    # Keeps the weighted error strictly inside (0, 1) so that the alpha
    # formula never divides by zero or takes log(0).
    eps = 1e-10

    for i in range(rounds):
        print("Round: ", i+1)

        if i == 0:
            # D is uniform here, so the data itself is the matching sample.
            round_x, round_y = train_x, train_y
        else:
            p = np.asarray(D, dtype=float)
            picks = sampler.choice(N, size=N, replace=True, p=p / p.sum())
            round_x, round_y = train_x.iloc[picks], train_y.iloc[picks]

        w, b = generate_w_b(train_x.shape[1])
        w, b = perceptron(round_x, round_y, w, b, lr, epochs)
        # The error is always measured on the original examples under D.
        acc, pred_list = accuracy(train_x, train_y, w, b, True)

        error = calculate_error(pred_list, D)
        error = min(max(error, eps), 1 - eps)
        alpha = calculate_alpha(error)
        D = update_D(D, pred_list, alpha)

        bias.append(b)
        weights.append(w)
        alphas.append(alpha)

    return alphas, weights, bias


In [40]:
def boosting_predict(features, alphas, weights, bias):
    """Predict -1/+1 labels with the boosted ensemble.

    Every weak learner votes with the sign of its score ``w . x + b``, and the
    votes are combined with the learners' ``alphas``. The vote weights were
    derived from the learners' -1/+1 decisions, so the ensemble combines those
    decisions. The previous version summed raw scores, which let a learner
    with a large weight norm outvote the others regardless of its alpha.

    Args:
        features: DataFrame with one example per row.
        alphas: vote weight of each weak learner.
        weights: weight vector of each weak learner.
        bias: bias of each weak learner.

    Returns:
        A list of Python ints with one entry per row: -1 when the weighted
        vote is negative, otherwise 1 (ties, and an empty ensemble, give 1).
    """
    n_rows = features.shape[0]
    n_learners = len(alphas)
    if n_learners == 0:
        # No learners: the weighted vote is 0 for every row, which maps to 1.
        return [1] * n_rows

    X = np.asarray(features, dtype=float)
    # One row per weak learner, so X @ W.T has one column per learner.
    W = np.vstack([np.asarray(w, dtype=float) for w in weights[:n_learners]])
    scores = X @ W.T + np.asarray(bias[:n_learners], dtype=float)
    # np.sign gives 0 for a score of exactly 0, i.e. that learner abstains.
    votes = np.sign(scores)
    totals = votes @ np.asarray(alphas, dtype=float)
    return np.where(totals < 0, -1, 1).tolist()

def boosting_accuracy(pred_list, labels):
    """Fraction of predictions that equal the corresponding label.

    Args:
        pred_list: predicted labels, one per example.
        labels: DataFrame whose first column holds the true label of each
            example, in the same order as ``pred_list``.

    Returns:
        ``matches / len(pred_list)``.
    """
    total = len(pred_list)
    matches = sum(
        1
        for position, predicted in enumerate(pred_list)
        if predicted == labels.iloc[position].tolist()[0]
    )
    return matches / total

In [41]:
# Learning-rate selection on a single hold-out split of the training data:
# the first 4/5 of the rows are used for fitting, the remaining rows for
# scoring. Rows are taken in file order (no shuffling).
n_examples = train_x.shape[0]
train_length = int(n_examples*4//5)
test_length = n_examples - train_length

train_x_fold, test_x_fold = train_x.head(train_length), train_x.tail(test_length)
train_y_fold, test_y_fold = train_y.head(train_length), train_y.tail(test_length)

lrs = [1, 0.1, 0.01]
accuracies = {}

for lr in lrs:
    print("lr:", lr)
    # 20 perceptron epochs per weak learner.
    alphas, weights, bias = boosting(train_x_fold, train_y_fold, lr, 20)
    pred_list = boosting_predict(test_x_fold, alphas, weights, bias)
    acc = boosting_accuracy(pred_list, test_y_fold)
    accuracies[lr] = acc

# Hold-out accuracy keyed by learning rate.
print(accuracies)

lr: 1
Round:  1
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  2
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  3
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  4
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  5
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoc

epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  2
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  3
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  4
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  5
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  6
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14


In [42]:
# Pick the learning rate with the highest hold-out accuracy (first one wins
# on ties), retrain the ensemble on the full training set and report the
# accuracy on the train and test splits.
best_lr = max(accuracies, key=accuracies.get)
lr = best_lr
print("best_lr:",lr)

alphas, weights, bias = boosting(train_x, train_y, lr, 20)

for message, split_x, split_y in (
    ("Train accuracy:", train_x, train_y),
    ("Test accuracy:", test_x, test_y),
):
    pred_list = boosting_predict(split_x, alphas, weights, bias)
    acc = boosting_accuracy(pred_list, split_y)
    print(message, acc)

best_lr: 1
Round:  1
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  2
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  3
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  4
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
Round:  5
epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18

In [43]:
# Predict the eval split and write it as a two-column CSV. The row position
# is used as `example_id`, and -1 predictions are written (and stored back
# into `pred_list`) as 0.
pred_list = boosting_predict(eval_df, alphas, weights, bias)
header = ['example_id', 'label']

with open('6_boosted_avgPerceptron_roberta_tfidf_65s_20e_20h.csv', 'w', newline ='') as file:
    writer = csv.DictWriter(file, fieldnames = header)
    writer.writeheader()
    for example_id, label in enumerate(pred_list):
        if label == -1:
            label = 0
            pred_list[example_id] = label
        writer.writerow({'example_id' : example_id, 'label': label})