# B06702064 會計五 林聖硯
# Logistic regression


In [62]:
import numpy as np
import pandas as pd
import math
import os
import csv

In [63]:
path_train = "train.csv"
path_test = "test.csv"

In [64]:
def load_data(path_train, path_test):
    data_train = pd.read_csv(path_train, skipinitialspace = True)
    data_test = pd.read_csv(path_test, skipinitialspace = True)
    return data_train, data_test

# Data Preprocessing

In [67]:
class DataPreprocessor:
    def __init__(self):
        self.train_mean = None
        self.train_std = None
        self.num_cols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
        self.cat_cols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
        self.all_native_countries = None

    def transform_label(self, data_train):
        label_dict = {'<=50K': 0, '>50K': 1}
        data_train["income"] = data_train["income"].apply(lambda x: label_dict[x])
        return data_train 
        
    def do_one_hot_encoding(self, data_cat: pd.DataFrame, isTraining = False):
        if isTraining:
            self.all_native_countries = data_cat["native_country"].value_counts().index.sort_values().to_list()
            data_one_hot = pd.get_dummies(data_cat)
        else:
            #fix missing columns in testing dataset
            data_cat["native_country"] = data_cat["native_country"].astype(pd.CategoricalDtype(categories=self.all_native_countries))
            data_one_hot = pd.get_dummies(data_cat)
        return data_one_hot
        
    def normalize_data(self, X_data: pd.DataFrame, isTraining = False):
        if isTraining:
            self.train_mean = X_data.mean(axis = 0)
            self.train_std = X_data.std(axis = 0)
        normalized_data = (X_data - self.train_mean) / self.train_std
        return normalized_data

    def standardize_data(self, X_data: pd.DataFrame, isTraining = False):
        if isTraining:
            self.train_min = X_data.min(axis = 0)
            self.train_range = X_data.max(axis = 0) - X_data.min(axis = 0)
        standardized_data = (X_data - self.train_min) / self.train_range
        return standardized_data

    def preprocess_train_data(self, data_train: pd.DataFrame):
        data_train = self.transform_label(data_train)
        #split data into numerical columns and categorical columns
        data_train_num = data_train[self.num_cols]
        data_train_cat = data_train[self.cat_cols]
        y_train = np.array(data_train["income"])

        #preprocessing - numerical
        #data_train_num = self.normalize_data(data_train_num, isTraining=True)
        data_train_num = self.standardize_data(data_train_num, isTraining=True)
        
        #preprocessing - categorical
        data_train_cat = self.do_one_hot_encoding(data_train_cat, isTraining=True)

        #combine
        data_train = pd.concat([data_train_num, data_train_cat], axis = 1)
        X_train = np.array(data_train)
        
        return X_train, y_train

    def preprocess_test_data(self, data_test: pd.DataFrame):
        #split data into numerical columns and categorical columns
        data_test_num = data_test[self.num_cols]
        data_test_cat = data_test[self.cat_cols]

        #preprocessing - numerical
        #data_test_num = self.normalize_data(data_test_num, isTraining=False)
        data_test_num = self.standardize_data(data_test_num, isTraining=False)

        #preprocessing - categorical
        data_test_cat = self.do_one_hot_encoding(data_test_cat, isTraining=False)

        #combine
        data_test = pd.concat([data_test_num, data_test_cat], axis = 1)
        X_test = np.array(data_test)
        return X_test

In [68]:
data_train, data_test = load_data(path_train, path_test)
DP = DataPreprocessor()
X_train, y_train= DP.preprocess_train_data(data_train)
X_test= DP.preprocess_test_data(data_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Modeling

In [70]:
class LogisticRegression:
    def __init__(self):
        self.train_valid_ratio = 0.7
        self.train_acc_list = list()
        self.train_loss_list = list()
        self.valid_acc_list = list()
        self.valid_loss_list = list()
        #Best weights
        self.best_w = None
        self.best_b = None
        #Best results
        self.best_epoch = None
        self.best_valid_loss = None
        self.best_valid_acc = None

    def initialize_params(self, x):
        w = np.random.rand(X_train.shape[1])
        bias = np.random.rand()
        return w, bias

    def train(self, X, y, batch_size, epoch_size, learning_rate, verbose = True):
        w, b = self.initialize_params(X)
        #adagrad params
        eps = 1e-12
        g_b = 0
        g_w = np.ones(X.shape[1])

        #other hyperparams
        best_valid_loss = 99999
        patience = 10 #for early stopping
        
        for num_epoch in range(1, epoch_size+1):
            #Shuffle when each epoch begin
            index = np.arange(X.shape[0])
            np.random.shuffle(index)
            X = X[index]
            y = y[index]
            split_point_x = math.floor(X.shape[0] * self.train_valid_ratio)
            split_point_y = math.floor(y.shape[0] * self.train_valid_ratio)
            X_train = X[:split_point_x, :]
            y_train = y[:split_point_y]
            X_valid = X[split_point_x:, :] 
            y_valid = y[split_point_y:]

            for num_batch in range(int(X_train.shape[0] / batch_size)):
                #print("start")
                x_batch = X_train[num_batch * batch_size:(num_batch + 1) * batch_size]
                y_batch = y_train[num_batch * batch_size:(num_batch + 1) * batch_size]

                #implement adagrad
                w_grad, b_grad = self.compute_gradient(x_batch, y_batch, w, b)
                g_w += w_grad ** 2
                g_b += b_grad ** 2

                w = w - learning_rate * w_grad / np.sqrt(g_w + eps)
                b = b - learning_rate * b_grad / np.sqrt(g_b + eps)
                
            #compute loss 
            y_train_pred = np.round(self.compute_logistic_value(X_train, w, b))
            train_acc = self.compute_accuracy(y_train_pred, y_train)
            train_loss = self.compute_cross_entropy_loss(y_train_pred, y_train) / X_train.shape[0]
            self.train_acc_list.append(train_acc)
            self.train_loss_list.append(train_loss)

            y_valid_pred = np.round(self.compute_logistic_value(X_valid, w, b))
            valid_acc = self.compute_accuracy(y_valid_pred, y_valid)
            valid_loss = self.compute_cross_entropy_loss(y_valid_pred, y_valid) / X_valid.shape[0]
            self.valid_acc_list.append(valid_acc)
            self.valid_loss_list.append(valid_loss)

            if verbose:
                print(f"Epoch {num_epoch}, train loss = {round(train_loss, 4)} (Accuracy: {round(train_acc*100, 3)}%), valid loss = {round(valid_loss, 4)} (Accuracy: {round(valid_acc*100, 3)}%)")
        
            #save best result
            if valid_loss < best_valid_loss:
                self.best_w = w
                self.best_b = b
                self.best_epoch = num_epoch
                best_valid_loss = valid_loss
                best_valid_acc = valid_acc
                self.best_valid_loss = best_valid_loss
                self.best_valid_acc = best_valid_acc

            #early stopping
            if valid_loss > best_valid_loss and num_epoch >= self.best_epoch + patience:
                self.stop_epoch = self.best_epoch + patience
                if verbose:
                    print("Early Stopping!")
                    print("="*10 + "validation result" + "="*10)
                    print(f"Best epoch is {self.best_epoch} with minimum valid loss = {round(best_valid_loss, 4)} (Accuracy: {round(best_valid_acc*100, 3)}%)")
                return

        self.stop_epoch = num_epoch
        if verbose:
            print("Finish model tuning")
            print("="*10 + "Model result" + "="*10)
            print(f"Best epoch is {self.best_epoch} with minimum valid loss = {round(best_valid_loss, 4)} (Accuracy: {round(best_valid_acc*100, 3)}%)")

    def predict(self, X_test):
        y_pred = self.compute_logistic_value(X_test, self.best_w, self.best_b)
        y_pred = np.round(y_pred)
        return y_pred

    def compute_gradient(self, X, y_true, w, b):
        #print(w.shape)
        y_pred = self.compute_logistic_value(X, w, b).flatten() #dim = (batch_size, )
        pred_error = y_true - y_pred
        w_grad = -np.dot(X.T, pred_error) #dim = (feature_size, )
        b_grad = -pred_error.sum(axis = 0)
        return w_grad, b_grad

    def compute_logistic_value(self, X, w, b):
        return self.sigmoid(np.matmul(X, w) + b)

    def compute_cross_entropy_loss(self, y_pred, y_true):
        eps = 1e-12
        y_pred = np.clip(y_pred, eps, 1-eps)
        cross_entropy = -np.dot(y_true, np.log(y_pred )) - np.dot((1-y_true), np.log(1 - y_pred))
        return cross_entropy

    def compute_accuracy(self, y_pred, y_true):
        accuracy = 1 - np.mean(np.abs(y_pred - y_true))
        return accuracy

    def sigmoid(self, z):
        res = 1 / (1.0 + np.exp(-z))
        return np.clip(res, 1e-6, 1 - (1e-6))

In [71]:
batch_size_list = [128, 256, 512, 1024, 2048]
epoch_size_list = [20, 30, 50, 100]
learning_rate_list = [0.01, 0.05, 0.1, 0.5, 1]
loss_list = []
acc_list = []

#Model saving
best_model = None
global_best_loss = 99999
global_best_acc = 0

#Hyperparameter saving
best_batch_size = 0
best_epoch_size = 0
best_learning_rate = 0

for batch_size in batch_size_list:
    for epoch_size in epoch_size_list:
        for learning_rate in learning_rate_list:
            print("=" * 10,"Start training model", "="*10)
            print(f"batch size = {batch_size}, epoch_size = {epoch_size}, learningrate = {learning_rate}")
            LR = LogisticRegression()
            LR.train(X_train, y_train, batch_size, epoch_size, learning_rate)
            loss_list.append(LR.best_valid_loss)
            acc_list.append(LR.best_valid_acc)
            if LR.best_valid_loss < global_best_loss:
                best_model = LR
                best_batch_size = batch_size
                best_stop_epoch_size = LR.best_epoch
                best_epoch_size = epoch_size
                best_learning_rate = learning_rate
                global_best_loss = LR.best_valid_loss
                global_best_acc = LR.best_valid_acc

batch size = 128, epoch_size = 20, learningrate = 0.01
Epoch 1, train loss = 20.9706 (Accuracy: 24.105%), valid loss = 20.9927 (Accuracy: 24.025%)
Epoch 2, train loss = 20.7463 (Accuracy: 24.917%), valid loss = 20.7834 (Accuracy: 24.782%)
Epoch 3, train loss = 19.2346 (Accuracy: 30.388%), valid loss = 19.3324 (Accuracy: 30.034%)
Epoch 4, train loss = 15.9953 (Accuracy: 42.111%), valid loss = 15.9043 (Accuracy: 42.44%)
Epoch 5, train loss = 12.6056 (Accuracy: 54.379%), valid loss = 12.5215 (Accuracy: 54.683%)
Epoch 6, train loss = 9.7434 (Accuracy: 64.738%), valid loss = 9.6506 (Accuracy: 65.073%)
Epoch 7, train loss = 7.9904 (Accuracy: 71.082%), valid loss = 7.8631 (Accuracy: 71.543%)
Epoch 8, train loss = 6.6859 (Accuracy: 75.803%), valid loss = 6.7628 (Accuracy: 75.525%)
Epoch 9, train loss = 6.1949 (Accuracy: 77.58%), valid loss = 5.9765 (Accuracy: 78.37%)
Epoch 10, train loss = 5.8094 (Accuracy: 78.975%), valid loss = 5.9765 (Accuracy: 78.37%)
Epoch 11, train loss = 5.6373 (Accurac

In [76]:
print("=" * 10, " Best Model result ", "=" * 10)
print(f"Batch Size = {best_batch_size}, Epoch Size = {best_epoch_size} (Actually running {best_stop_epoch_size} epoch), Learning rate = {best_learning_rate}")
print(f"Validation loss = {round(global_best_loss, 3)} (Accuracy: {round(global_best_acc*100, 3)}%)")

Batch Size = 512, Epoch Size = 50 (Actually running 37 epoch), Learning rate = 0.5
Validation loss = 3.847 (Accuracy: 86.078%)


In [77]:
y_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

# Write to csv

In [74]:
def write_to_csv(y_pred, file_path):
    with open(file_path, 'w', newline='') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(['id','label'])
        for i in range(int(y_pred.shape[0])):
            writer.writerow([i + 1, int(y_pred[i])])

In [75]:
file_name = 'prediction_logistic.csv'
write_to_csv(y_pred, file_name)