## DDA3020 Homework 2
### Instructions:
- Follow the notebook and complete the code cells marked as TODO
- Ensure your code runs successfully until the end of the notebook

In [1]:
from os import path as osp
import numpy as np

# load data
def load_data(data_dir='./data'):
    """Load the train/validation and test arrays stored as ``.npy`` files.

    Args:
        data_dir: Directory that contains ``train_validation_data.npy``,
            ``train_validation_label.npy``, ``test_data.npy`` and
            ``test_label.npy``. Defaults to ``'./data'`` (the location the
            notebook originally hard-coded), so ``load_data()`` keeps working.

    Returns:
        Tuple ``(train_val_data, train_val_label, test_data, test_label)``
        holding whatever ``np.load`` returns for each file, in that order.

    Raises:
        OSError: Propagated from ``np.load`` when a file cannot be opened
            (e.g. ``FileNotFoundError`` for a missing file).
    """
    # Order matters: it defines the order of the returned tuple.
    file_names = (
        'train_validation_data.npy',
        'train_validation_label.npy',
        'test_data.npy',
        'test_label.npy',
    )
    return tuple(np.load(osp.join(data_dir, name)) for name in file_names)


# Load all four arrays once; the later cells reuse these module-level names.
(
    train_validation_data,
    train_validation_label,
    test_data,
    test_label,
) = load_data()

# Report the shape of every array between two banner lines.
print(
    '# ========== data info ============ #',
    'train validation data: {}'.format(train_validation_data.shape),
    'train validation label: {}'.format(train_validation_label.shape),
    'test data: {}'.format(test_data.shape),
    'test label: {}'.format(test_label.shape),
    '# ================================= #',
    sep='\n',
)

train validation data: (1000, 100)
train validation label: (1000,)
test data: (400, 100)
test label: (400,)


In [2]:
# data split for K-fold Cross-validation

def train_validation_split(K, train_val_data, train_val_label):
    """Build K stratified train/validation folds for binary (0/1) labelled data.

    Rows are grouped by class (order within each class is preserved). Each
    class is cut into K consecutive validation slices of ``len(class) // K``
    rows; fold ``i`` validates on the i-th slice of both classes and trains
    on every other row of both classes. Rows left over when a class size is
    not divisible by K are never used for validation and stay in the training
    part of every fold. Rows whose label is neither 0 nor 1 are ignored.

    The fold size is computed per class, so unbalanced classes still get
    labels that line up with their rows.

    Args:
        K: Number of folds; must be >= 1.
        train_val_data: Array-like whose first axis indexes samples.
        train_val_label: Array-like of labels, one per sample.

    Returns:
        Tuple ``(train_datas, train_labels, val_datas, val_labels)``; each is
        a list of length K. Within every array the class-0 rows come first,
        followed by the class-1 rows. Labels are float arrays of 0.0 / 1.0.

    Raises:
        ValueError: If ``K < 1`` or data and labels differ in length.
    """
    if K < 1:
        raise ValueError(f'K must be a positive integer, got {K!r}')

    train_val_data = np.asarray(train_val_data)
    train_val_label = np.asarray(train_val_label)
    if len(train_val_data) != len(train_val_label):
        raise ValueError(
            f'data has {len(train_val_data)} rows but label has '
            f'{len(train_val_label)} entries'
        )

    # Rows of each class, in their original relative order.
    data_0 = train_val_data[np.flatnonzero(train_val_label == 0)]
    data_1 = train_val_data[np.flatnonzero(train_val_label == 1)]

    # Per-class validation slice size (floor division, as before for class 0).
    val_num_0 = len(data_0) // K
    val_num_1 = len(data_1) // K

    train_datas = []
    train_labels = []
    val_datas = []
    val_labels = []

    for i in range(K):
        # Validation slice bounds inside each class.
        start_0, stop_0 = i * val_num_0, (i + 1) * val_num_0
        start_1, stop_1 = i * val_num_1, (i + 1) * val_num_1

        val_0, val_1 = data_0[start_0:stop_0], data_1[start_1:stop_1]

        # Training rows are the complement of the validation slice. Empty
        # head/tail slices make the first and last fold fall out naturally,
        # and any remainder rows beyond K * val_num are kept for training.
        train_0 = np.concatenate((data_0[:start_0], data_0[stop_0:]))
        train_1 = np.concatenate((data_1[:start_1], data_1[stop_1:]))

        val_datas.append(np.concatenate((val_0, val_1)))
        train_datas.append(np.concatenate((train_0, train_1)))

        # Label lengths are derived from the actual row counts so data and
        # labels can never disagree.
        val_labels.append(np.concatenate((np.zeros(len(val_0)), np.ones(len(val_1)))))
        train_labels.append(np.concatenate((np.zeros(len(train_0)), np.ones(len(train_1)))))

    return train_datas, train_labels, val_datas, val_labels

In [3]:
# evaluation metrics

def eva_precision(true_label, pred_label, _class):
    """Precision of ``_class``: TP / (number of samples predicted as ``_class``).

    Args:
        true_label: Array-like of ground-truth labels.
        pred_label: Array-like of predicted labels, same length as ``true_label``.
        _class: The label value treated as the positive class.

    Returns:
        Precision as a float in [0, 1]. Returns 0.0 when ``_class`` is never
        predicted (the ratio is undefined there), instead of raising
        ``ZeroDivisionError``.
    """
    # Convert first so that plain Python lists compare element-wise.
    true_label = np.asarray(true_label)
    pred_label = np.asarray(pred_label)
    assert len(true_label) == len(pred_label)

    predicted_as_class = pred_label == _class
    n_predicted = int(np.count_nonzero(predicted_as_class))
    if n_predicted == 0:
        return 0.0

    tp = int(np.count_nonzero(true_label[predicted_as_class] == _class))
    return tp / n_predicted

def eva_recall(true_label, pred_label, _class):
    """Recall of ``_class``: TP / (number of samples whose true label is ``_class``).

    Args:
        true_label: Array-like of ground-truth labels.
        pred_label: Array-like of predicted labels, same length as ``true_label``.
        _class: The label value treated as the positive class.

    Returns:
        Recall as a float in [0, 1]. Returns 0.0 when ``_class`` never occurs
        in ``true_label`` (the ratio is undefined there), instead of raising
        ``ZeroDivisionError``.
    """
    # Convert first so that plain Python lists compare element-wise.
    true_label = np.asarray(true_label)
    pred_label = np.asarray(pred_label)
    assert len(true_label) == len(pred_label)

    truly_class = true_label == _class
    n_actual = int(np.count_nonzero(truly_class))
    if n_actual == 0:
        return 0.0

    tp = int(np.count_nonzero(pred_label[truly_class] == _class))
    return tp / n_actual

def eva_f1(true_label, pred_label, _class):
    """F1 score of ``_class``: harmonic mean of its precision and recall.

    Args:
        true_label: Ground-truth labels.
        pred_label: Predicted labels, same length as ``true_label``.
        _class: The label value treated as the positive class.

    Returns:
        ``2 * p * r / (p + r)`` as a float. Returns 0.0 when ``p + r`` is not
        positive (no true positive at all), where the harmonic mean is
        undefined; previously this case tripped an ``assert``.
    """
    assert len(true_label) == len(pred_label)
    p = eva_precision(true_label, pred_label, _class)
    r = eva_recall(true_label, pred_label, _class)
    # Guard explicitly rather than with `assert`, which is removed under
    # `python -O` and would then let the division below fail.
    if not (p + r > 0):
        return 0.0
    return (2 * p * r) / (p + r)

def eva_accuracy(true_label, pred_label):
    """Fraction of samples whose predicted label equals the true label.

    Args:
        true_label: Array-like of ground-truth labels.
        pred_label: Array-like of predicted labels, same length as ``true_label``.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: If the inputs are empty.
    """
    # Without the conversion, comparing two Python lists with `==` yields a
    # single bool and the match count would be wrong.
    true_label = np.asarray(true_label)
    pred_label = np.asarray(pred_label)
    assert len(true_label) == len(pred_label)

    n_correct = int(np.count_nonzero(true_label == pred_label))
    return n_correct / len(true_label)

def eva_auroc(true_label, pred_label):
    """Area under the ROC curve via pairwise ranking of class 1 against class 0.

    For every (class-0 sample, class-1 sample) pair, the pair scores 1 when
    the class-1 sample has the strictly larger value in ``pred_label``, 0.5 on
    a tie and 0 otherwise; the result is the mean pair score. Samples whose
    true label is neither 0 nor 1 are ignored.

    Args:
        true_label: Array-like of ground-truth labels (0 / 1).
        pred_label: Array-like of predicted values, same length as
            ``true_label``. Hard labels and continuous scores are both
            accepted; only their ordering matters.

    Returns:
        AUROC as a float in [0, 1], or ``nan`` when either class is absent
        (no pair exists, so the value is undefined).
    """
    true_label = np.asarray(true_label).ravel()
    pred_label = np.asarray(pred_label).ravel()
    assert len(true_label) == len(pred_label)

    neg_sorted = np.sort(pred_label[true_label == 0])
    pos_values = pred_label[true_label == 1]

    n_pairs = len(neg_sorted) * len(pos_values)
    if n_pairs == 0:
        return float('nan')

    # For each class-1 value: how many class-0 values are strictly below it
    # (side='left') and how many are below-or-equal (side='right'). The
    # difference is the number of ties. This counts exactly what the nested
    # pairwise loop would, in O(n log n) instead of O(n0 * n1).
    strictly_below = np.searchsorted(neg_sorted, pos_values, side='left')
    below_or_equal = np.searchsorted(neg_sorted, pos_values, side='right')

    wins = int(strictly_below.sum())
    ties = int((below_or_equal - strictly_below).sum())
    return (wins + 0.5 * ties) / n_pairs

def evaluation(true_label, pred_label, _class, pred_score=None):
    """Compute precision, recall, F1, accuracy and AUROC in one call.

    Args:
        true_label: Ground-truth labels.
        pred_label: Predicted hard labels, same length as ``true_label``.
        _class: Label value treated as the positive class for precision,
            recall and F1. Accuracy and AUROC do not depend on it.
        pred_score: Optional continuous scores (e.g. decision values or
            class-1 probabilities) used for AUROC only. When omitted, AUROC
            is computed from ``pred_label`` exactly as before; on hard 0/1
            labels the ROC curve has a single operating point, so passing
            real scores gives a more informative value.

    Returns:
        Dict with keys ``'precision'``, ``'recall'``, ``'f1'``,
        ``'accuracy'`` and ``'auroc'``.
    """
    precision = eva_precision(true_label, pred_label, _class)
    recall = eva_recall(true_label, pred_label, _class)
    f1 = eva_f1(true_label, pred_label, _class)
    accuracy = eva_accuracy(true_label, pred_label)

    # Fall back to the hard labels so existing three-argument calls are
    # unaffected.
    auroc_input = pred_label if pred_score is None else pred_score
    auroc = eva_auroc(true_label, auroc_input)

    return {'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': accuracy, 'auroc': auroc}
    


In [24]:
# model training and hyper-parameters fine-tuning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

K = 5

# hyper-parameter for logistic regression
hyper_parameters_logistic_regression = dict(
    # TODO: please choose different values to tune the model
    penalty='l2',  # ['l1', 'l2'] 'l1', 'l2', 'l2', 'l1', 'l1'
)

# hyper-parameter for SVM
hyper_parameters_svm = dict(
    # TODO: please choose different values to tune the model
    C=1,  # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
)


def _announce_run(banner, hyper_parameters, train_data, validation_data):
    """Print the algorithm banner, its hyper-parameters and both split shapes."""
    print(banner)
    print('hyper-parameter: {}'.format(hyper_parameters))
    print('train data:{}'.format(train_data.shape))
    print('val data: {}'.format(validation_data.shape))


def _report_validation_f1(model, validation_data, validation_label):
    """Predict on one validation fold and print per-class and average F1.

    Returns ``(pred_label, F1 of class 0, F1 of class 1)`` so the caller can
    keep those values bound at cell level.
    """
    pred_label = model.predict(validation_data)
    f1_by_class = []
    for _class in (0, 1):
        # Compute then print one class at a time, in class order.
        score = eva_f1(validation_label, pred_label, _class=_class)
        print('F1 (Val set of Class-{}): {:.4f}'.format(_class, score))
        f1_by_class.append(score)
    print('Avg F1: {:.4f}'.format((f1_by_class[0] + f1_by_class[1]) / 2))
    return pred_label, f1_by_class[0], f1_by_class[1]


# obtain cross-validation set
train_datas, train_labels, validation_datas, validation_labels = train_validation_split(
    K, train_validation_data, train_validation_label
)

folds = zip(train_datas, train_labels, validation_datas, validation_labels)
for i, (train_data, train_label, validation_data, validation_label) in enumerate(folds):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression: announce, fit on the training part, score on the validation part
    _announce_run(
        'Algorithm: [logistic regression] =========================',
        hyper_parameters_logistic_regression,
        train_data,
        validation_data,
    )
    lr_model = LogisticRegression(solver='liblinear', **hyper_parameters_logistic_regression)
    lr_model = lr_model.fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    pred_label, F1_0, F1_1 = _report_validation_f1(lr_model, validation_data, validation_label)

    # SVM: same procedure with a linear-kernel SVC
    _announce_run(
        'Algorithm: [SVM] =========================================',
        hyper_parameters_svm,
        train_data,
        validation_data,
    )
    svm_model = SVC(kernel='linear', **hyper_parameters_svm)
    svm_model = svm_model.fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    pred_label, F1_0, F1_1 = _report_validation_f1(svm_model, validation_data, validation_label)


hyper-parameter: {'penalty': 'l2'}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.9223
F1 (Val set of Class-1): 0.9175
Avg F1: 0.9199
hyper-parameter: {'C': 1}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.9154
F1 (Val set of Class-1): 0.9146
Avg F1: 0.9150
hyper-parameter: {'penalty': 'l2'}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.9146
F1 (Val set of Class-1): 0.9154
Avg F1: 0.9150
hyper-parameter: {'C': 1}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.8677
F1 (Val set of Class-1): 0.8815
Avg F1: 0.8746
hyper-parameter: {'penalty': 'l2'}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.9490
F1 (Val set of Class-1): 0.9510
Avg F1: 0.9500
hyper-parameter: {'C': 1}
train data:(800, 100)
val data: (200, 100)
F1 (Val set of Class-0): 0.9246
F1 (Val set of Class-1): 0.9254
Avg F1: 0.9250
hyper-parameter: {'penalty': 'l2'}
train data:(800, 100)
val data: (200, 100)
F1 (Val

In [None]:
# performance evaluation on test set

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

K = 5

# hyper-parameter penalty for logistic regression. Hint: len(penalty) = 5
# TODO: the optimal parameter selection for each split
penalty = ['l1', 'l2', 'l2', 'l1', 'l1']

# hyper-parameter C for SVM. Hint: len(C) = 5
# TODO: the optimal parameter selection for each split
C = [1e-4, 1e-5, 1e-4, 1e-3, 1e-3]


def _report_test_metrics(model):
    """Predict on the test set and print the metric dict of class 0 and class 1.

    Returns ``(pred_label, results_0, results_1)`` so the caller can keep
    those values bound at cell level.
    """
    pred_label = model.predict(test_data)
    # NOTE(review): the AUROC inside `evaluation` is computed from these hard
    # labels rather than continuous scores, and `roc_auc_score` is imported
    # in this cell but never used -- confirm this is intended.
    per_class_results = [evaluation(test_label, pred_label, _class=label) for label in (0, 1)]
    for label, metrics in zip((0, 1), per_class_results):
        print('Result Class {} (Test set): {}'.format(label, metrics))
    return pred_label, per_class_results[0], per_class_results[1]


# obtain training data
train_datas, train_labels, _, _ = train_validation_split(K, train_validation_data, train_validation_label)

# One model pair per split, each trained with that split's selected hyper-parameter.
for i, (train_data, train_label) in enumerate(zip(train_datas, train_labels)):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression
    print('Algorithm: [logistic regression] =========================')
    lr_model = LogisticRegression(solver='liblinear', penalty=penalty[i])
    lr_model = lr_model.fit(train_data, train_label)

    # performance evaluation on test set
    pred_label, results_0, results_1 = _report_test_metrics(lr_model)

    # SVM
    print('Algorithm: [SVM] =========================================')
    svm_model = SVC(kernel='linear', C=C[i])
    svm_model = svm_model.fit(train_data, train_label)

    # performance evaluation on test set
    pred_label, results_0, results_1 = _report_test_metrics(svm_model)


hyper-parameter: {penalty: l1}
Result Class 0 (Test set): {'precision': 0.914572864321608, 'recall': 0.91, 'f1': 0.912280701754386, 'accuracy': 0.9125, 'auroc': 0.9125}
Result Class 1 (Test set): {'precision': 0.9104477611940298, 'recall': 0.915, 'f1': 0.912718204488778, 'accuracy': 0.9125, 'auroc': 0.9125}
hyper-parameter: {C: 0.0001}
Result Class 0 (Test set): {'precision': 0.93, 'recall': 0.93, 'f1': 0.93, 'accuracy': 0.93, 'auroc': 0.93}
Result Class 1 (Test set): {'precision': 0.93, 'recall': 0.93, 'f1': 0.93, 'accuracy': 0.93, 'auroc': 0.93}
hyper-parameter: {penalty: l2}
Result Class 0 (Test set): {'precision': 0.8942307692307693, 'recall': 0.93, 'f1': 0.911764705882353, 'accuracy': 0.91, 'auroc': 0.91}
Result Class 1 (Test set): {'precision': 0.9270833333333334, 'recall': 0.89, 'f1': 0.9081632653061226, 'accuracy': 0.91, 'auroc': 0.91}
hyper-parameter: {C: 1e-05}
Result Class 0 (Test set): {'precision': 0.9540816326530612, 'recall': 0.935, 'f1': 0.9444444444444445, 'accuracy': 