## DDA3020 Homework 2
### Instructions:
- Follow the notebook and complete the code cells marked as TODO
- Ensure your code runs successfully until the end of the notebook

In [3]:
from os import path as osp
import numpy as np

# load data
def load_data(data_dir='./data'):
    """Load the train/validation and test arrays stored as ``.npy`` files.

    Args:
        data_dir: Directory that contains ``train_validation_data.npy``,
            ``train_validation_label.npy``, ``test_data.npy`` and
            ``test_label.npy``. Defaults to ``'./data'`` so existing
            zero-argument calls behave exactly as before.

    Returns:
        A 4-tuple ``(train_val_data, train_val_label, test_data, test_label)``
        holding whatever ``np.load`` returns for each file, in that order.

    Raises:
        Any exception raised by ``np.load`` (e.g. for a missing file) is
        propagated unchanged.
    """
    file_names = (
        'train_validation_data.npy',
        'train_validation_label.npy',
        'test_data.npy',
        'test_label.npy',
    )

    # Files are read in the same order as they are returned.
    train_val_data, train_val_label, test_data, test_label = (
        np.load(osp.join(data_dir, file_name)) for file_name in file_names
    )
    return train_val_data, train_val_label, test_data, test_label


# Load all four arrays once; these notebook-level names are reused by the
# cross-validation and test-evaluation cells further down.
train_validation_data, train_validation_label, test_data, test_label = load_data()

# Print the array shapes as a quick sanity check that data and labels line up.
print(f'# ========== data info ============ #')
print(f'train validation data: {train_validation_data.shape}')
print(f'train validation label: {train_validation_label.shape}')
print(f'test data: {test_data.shape}')
print(f'test label: {test_label.shape}')
print(f'# ================================= #')

train validation data: (1000, 100)
train validation label: (1000,)
test data: (400, 100)
test label: (400,)


In [4]:
# data split for K-fold Cross-validation

def train_validation_split(K, train_val_data, train_val_label):
    """Build K stratified train/validation splits for binary labels (0 and 1).

    The sample indices of each class are cut into K contiguous folds (no
    shuffling). Split ``i`` uses fold ``i`` of both classes as the validation
    set and the remaining folds as the training set, so the class ratio is
    kept in every split.

    Unlike plain ``len // K`` slicing, every sample of class 0/1 is assigned
    to exactly one validation fold even when the class size is not divisible
    by K: the remainder is spread over the first folds, which are then one
    sample larger than the later ones. When the class sizes are divisible by
    K the folds are identical to equal-size contiguous slicing.

    Args:
        K: Number of folds; must be at least 2.
        train_val_data: Array-like of samples, indexed along the first axis.
        train_val_label: Array-like of labels aligned with ``train_val_data``.
            Only samples labelled 0 or 1 are used.

    Returns:
        ``(train_datas, train_labels, val_datas, val_labels)``: four lists of
        length K. Within each array the class-0 samples come first, followed
        by the class-1 samples.

    Raises:
        ValueError: If ``K`` is smaller than 2 (no training fold would remain).
    """
    if K < 2:
        raise ValueError(f'K must be at least 2 for cross-validation, got {K}')

    train_val_data = np.asarray(train_val_data)
    train_val_label = np.asarray(train_val_label)

    # One list of K index folds per class. np.array_split keeps the remainder
    # samples instead of silently discarding them.
    class_folds = [
        np.array_split(np.where(train_val_label == class_id)[0], K)
        for class_id in (0, 1)
    ]

    train_datas, train_labels, val_datas, val_labels = [], [], [], []

    for i in range(K):
        # validation set: the i-th fold of every class
        val_indices = np.concatenate([folds[i] for folds in class_folds])
        val_datas.append(train_val_data[val_indices])
        val_labels.append(train_val_label[val_indices])

        # train set: all other folds, class 0 first and then class 1
        train_indices = np.concatenate(
            [folds[j] for folds in class_folds for j in range(K) if j != i]
        )
        train_datas.append(train_val_data[train_indices])
        train_labels.append(train_val_label[train_indices])

    return train_datas, train_labels, val_datas, val_labels

In [5]:
# evaluation metrics

def eva_precision(true_label, pred_label, _class):
    """Precision of ``_class``: TP / (TP + FP).

    Args:
        true_label: Iterable of ground-truth labels.
        pred_label: Iterable of predicted labels, aligned with ``true_label``.
        _class: The label treated as the positive class.

    Returns:
        The fraction of samples predicted as ``_class`` whose true label is
        ``_class``, or 0.0 when nothing was predicted as ``_class``.
    """
    # True labels of every sample that was predicted as `_class` (TP + FP).
    claimed = [actual for actual, guess in zip(true_label, pred_label) if guess == _class]

    if len(claimed) == 0:
        return 0.0

    # Of those, the ones that really belong to `_class` are the true positives.
    true_positives = sum(1 for actual in claimed if actual == _class)

    return true_positives / len(claimed)

def eva_recall(true_label, pred_label, _class):
    """Recall of ``_class``: TP / (TP + FN).

    Args:
        true_label: Iterable of ground-truth labels.
        pred_label: Iterable of predicted labels, aligned with ``true_label``.
        _class: The label treated as the positive class.

    Returns:
        The fraction of samples whose true label is ``_class`` that were also
        predicted as ``_class``, or 0.0 when no sample truly is ``_class``.
    """
    # Predictions for every sample that truly belongs to `_class` (TP + FN).
    relevant = [guess for actual, guess in zip(true_label, pred_label) if actual == _class]

    if len(relevant) == 0:
        return 0.0

    # Of those, the ones predicted as `_class` are the true positives.
    true_positives = sum(1 for guess in relevant if guess == _class)

    return true_positives / len(relevant)

def eva_f1(true_label, pred_label, _class):
    """F1 score of ``_class``: 2 * precision * recall / (precision + recall).

    Args:
        true_label: Iterable of ground-truth labels.
        pred_label: Iterable of predicted labels, aligned with ``true_label``.
        _class: The label treated as the positive class.

    Returns:
        The harmonic mean of ``eva_precision`` and ``eva_recall`` for
        ``_class``, or 0.0 when either of them is 0.
    """
    precision = eva_precision(true_label, pred_label, _class)
    recall = eva_recall(true_label, pred_label, _class)

    # Both must be non-zero, otherwise the harmonic mean is defined as 0 here
    # (this also avoids dividing by zero when both are 0).
    if precision != 0 and recall != 0:
        return 2 * precision * recall / (precision + recall)

    return 0.0

def eva_accuracy(true_label, pred_label):
    """Accuracy: the fraction of samples whose prediction equals the truth.

    Args:
        true_label: Sized iterable of ground-truth labels.
        pred_label: Iterable of predicted labels, aligned with ``true_label``.

    Returns:
        ``correct / len(true_label)`` as a float. For empty input the metric
        is undefined; 0.0 is returned, mirroring how ``eva_precision`` and
        ``eva_recall`` treat a zero denominator, instead of raising
        ``ZeroDivisionError``.
    """
    total = len(true_label)
    if total == 0:
        return 0.0

    corr_pred = sum(1 for true, pred in zip(true_label, pred_label) if true == pred)

    return corr_pred / total

def eva_auroc(true_label, pred_label):
    """Area under the ROC curve via pairwise comparison.

    Every (positive, negative) pair of samples contributes 1 when the positive
    sample has the strictly larger prediction, 0.5 when the two predictions
    tie, and 0 otherwise; the AUROC is the mean contribution over all pairs.

    Args:
        true_label: Iterable of ground-truth labels; 1 marks positives and
            0 marks negatives (other values are ignored).
        pred_label: Indexable predictions aligned with ``true_label``. Any
            comparable values work, e.g. scores or hard 0/1 labels.

    Returns:
        The AUROC as a float in [0, 1]. If ``true_label`` contains no
        positive or no negative sample there are no pairs to compare and the
        metric is undefined; ``nan`` is returned in that case instead of
        raising ``ZeroDivisionError`` (0.0 is not used because it is a
        meaningful AUROC value).
    """
    # get indices
    pos_indices = [i for i, label in enumerate(true_label) if label == 1]
    neg_indices = [i for i, label in enumerate(true_label) if label == 0]

    m_plus = len(pos_indices)
    m_minus = len(neg_indices)
    if m_plus == 0 or m_minus == 0:
        return float('nan')

    # count correctly ordered (positive, negative) pairs; ties count half
    count = 0
    for i in pos_indices:
        for j in neg_indices:
            if pred_label[i] > pred_label[j]:
                count += 1
            elif pred_label[i] == pred_label[j]:
                count += 0.5

    # auc
    auroc = count / (m_plus * m_minus)

    return auroc
    
def evaluation(true_label, pred_label, _class):
    """Compute every metric defined in this cell and bundle them in a dict.

    Args:
        true_label: Ground-truth labels.
        pred_label: Predicted labels, aligned with ``true_label``.
        _class: Positive class used for precision, recall and F1. Accuracy
            and AUROC do not depend on it.

    Returns:
        A dict with the keys ``'precision'``, ``'recall'``, ``'f1'``,
        ``'accuracy'`` and ``'auroc'``.
    """
    # Dict entries are evaluated top to bottom, i.e. in the listed order.
    return {
        'precision': eva_precision(true_label, pred_label, _class),
        'recall': eva_recall(true_label, pred_label, _class),
        'f1': eva_f1(true_label, pred_label, _class),
        'accuracy': eva_accuracy(true_label, pred_label),
        'auroc': eva_auroc(true_label, pred_label),
    }

In [6]:
# model training and hyper-parameters fine-tuning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Number of cross-validation folds.
K = 5

# hyper-parameter for logistic regression
# (forwarded as keyword arguments to LogisticRegression in the loop below)
hyper_parameters_logistic_regression = {

    # TODO: please choose different values to tune the model
    'penalty': 'l2', # ['l1', 'l2']
}

# hyper-parameter for SVM
# (forwarded as keyword arguments to SVC in the loop below)
hyper_parameters_svm = {

    # TODO: please choose different values to tune the model
    'C': 1, # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

}

# obtain cross-validation set
train_datas, train_labels, validation_datas, validation_labels = train_validation_split(K, train_validation_data, train_validation_label)


# For every split: fit both models on the training folds with the current
# hyper-parameters and report the per-class F1 on the held-out fold. Re-run
# this cell with other hyper-parameter values and compare the printed scores
# to pick the best setting for each split.
for i, (train_data, train_label, validation_data, validation_label) in enumerate(zip(train_datas, train_labels, validation_datas, validation_labels)):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression

    print(f'Algorithm: [logistic regression] =========================')
    print(f'hyper-parameter: {hyper_parameters_logistic_regression}')
    lr_model = LogisticRegression(solver='liblinear', **hyper_parameters_logistic_regression).fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    pred_label = lr_model.predict(validation_data)

    F1_0 = eva_f1(validation_label, pred_label, _class=0)
    print(f'F1 (Val set of Class-0): {F1_0:.4f}')
    F1_1 = eva_f1(validation_label, pred_label, _class=1)
    print(f'F1 (Val set of Class-1): {F1_1:.4f}')

    # SVM

    print(f'Algorithm: [SVM] =========================================')
    print(f'hyper-parameter: {hyper_parameters_svm}')
    svm_model = SVC(kernel='linear', **hyper_parameters_svm).fit(train_data, train_label)

    # performance evaluation on validation set for tuning hyper-parameters
    # (pred_label, F1_0 and F1_1 are reused, so the logistic-regression
    # values from above are overwritten here)
    pred_label = svm_model.predict(validation_data)
    F1_0 = eva_f1(validation_label, pred_label, _class=0)
    print(f'F1 (Val set of Class-0): {F1_0:.4f}')
    F1_1 = eva_f1(validation_label, pred_label, _class=1)
    print(f'F1 (Val set of Class-1): {F1_1:.4f}')


hyper-parameter: {'penalty': 'l2'}
F1 (Val set of Class-0): 0.9223
F1 (Val set of Class-1): 0.9175
hyper-parameter: {'C': 1}
F1 (Val set of Class-0): 0.9154
F1 (Val set of Class-1): 0.9146
hyper-parameter: {'penalty': 'l2'}
F1 (Val set of Class-0): 0.9146
F1 (Val set of Class-1): 0.9154
hyper-parameter: {'C': 1}
F1 (Val set of Class-0): 0.8677
F1 (Val set of Class-1): 0.8815
hyper-parameter: {'penalty': 'l2'}
F1 (Val set of Class-0): 0.9490
F1 (Val set of Class-1): 0.9510
hyper-parameter: {'C': 1}
F1 (Val set of Class-0): 0.9246
F1 (Val set of Class-1): 0.9254
hyper-parameter: {'penalty': 'l2'}
F1 (Val set of Class-0): 0.9378
F1 (Val set of Class-1): 0.9319
hyper-parameter: {'C': 1}
F1 (Val set of Class-0): 0.9223
F1 (Val set of Class-1): 0.9175
hyper-parameter: {'penalty': 'l2'}
F1 (Val set of Class-0): 0.9282
F1 (Val set of Class-1): 0.9215
hyper-parameter: {'C': 1}
F1 (Val set of Class-0): 0.9423
F1 (Val set of Class-1): 0.9375


In [14]:
# performance evaluation on test set

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Number of cross-validation folds; must match the length of `penalty` and `C`
# below, because both lists are indexed by the split number.
K = 5

# hyper-parameter penalty for logistic regression. Hint: len(penalty) = 5
penalty = [
    # TODO: the optimal parameter selection for each split
    'l1', 'l2', 'l2', 'l1', 'l1'
]

# hyper-parameter C for SVM. Hint: len(C) = 5
C = [
    # TODO: the optimal parameter selection for each split
    1e-4, 1e-5, 1e-4, 1e-3, 1e-3
]

    
# obtain training data
# (the validation folds are discarded; only the training part of each split is used)
train_datas, train_labels, _, _ = train_validation_split(K, train_validation_data, train_validation_label)


# For every split: refit both models on that split's training folds using the
# hyper-parameter chosen for the split, then report all metrics on the test set.
for i, (train_data, train_label) in enumerate(zip(train_datas, train_labels)):

    print(f'# ======================= {i + 1}-th time validation ======================= #')

    # logistic regression

    print(f'Algorithm: [logistic regression] =========================')
    print(f'hyper-parameter: {penalty[i]}')
    lr_model = LogisticRegression(solver='liblinear', penalty=penalty[i]).fit(train_data, train_label)


    # performance evaluation on test set
    # NOTE: `evaluation` receives the output of .predict(), so its 'auroc'
    # entry is computed from those predictions rather than from decision scores.
    pred_label = lr_model.predict(test_data)
    results_0 = evaluation(test_label, pred_label, _class=0)
    results_1 = evaluation(test_label, pred_label, _class=1)
    print(f'Result Class 0 (Test set): {results_0}')
    print(f'Result Class 1 (Test set): {results_1}')

    # SVM

    print(f'Algorithm: [SVM] =========================================')
    print(f'hyper-parameter: {C[i]}')
    svm_model = SVC(kernel='linear', C=C[i]).fit(train_data, train_label)

    # performance evaluation on test set
    # (pred_label, results_0 and results_1 are reused from the block above)
    pred_label = svm_model.predict(test_data)
    results_0 = evaluation(test_label, pred_label, _class=0)
    results_1 = evaluation(test_label, pred_label, _class=1)
    print(f'Result Class 0 (Test set): {results_0}')
    print(f'Result Class 1 (Test set): {results_1}')


hyper-parameter: l1
Result Class 0 (Test set): {'precision': 0.914572864321608, 'recall': 0.91, 'f1': 0.912280701754386, 'accuracy': 0.9125, 'auroc': 0.9125}
Result Class 1 (Test set): {'precision': 0.9104477611940298, 'recall': 0.915, 'f1': 0.912718204488778, 'accuracy': 0.9125, 'auroc': 0.9125}
hyper-parameter: 0.0001
Result Class 0 (Test set): {'precision': 0.93, 'recall': 0.93, 'f1': 0.93, 'accuracy': 0.93, 'auroc': 0.93}
Result Class 1 (Test set): {'precision': 0.93, 'recall': 0.93, 'f1': 0.93, 'accuracy': 0.93, 'auroc': 0.93}
hyper-parameter: l2
Result Class 0 (Test set): {'precision': 0.8942307692307693, 'recall': 0.93, 'f1': 0.911764705882353, 'accuracy': 0.91, 'auroc': 0.91}
Result Class 1 (Test set): {'precision': 0.9270833333333334, 'recall': 0.89, 'f1': 0.9081632653061226, 'accuracy': 0.91, 'auroc': 0.91}
hyper-parameter: 1e-05
Result Class 0 (Test set): {'precision': 0.9540816326530612, 'recall': 0.935, 'f1': 0.9444444444444445, 'accuracy': 0.945, 'auroc': 0.945}
Result Cl