## ML HW2 手把手教學 


In [24]:
import math
import os
import csv
import numpy as np
import pandas as pd

In [2]:
data_path = "./data/"
path_train = os.path.join(data_path, "train.csv")
path_test = os.path.join(data_path, "test.csv")

In [3]:
def load_data(path_train, path_test):
    data_train = pd.read_csv(path_train, skipinitialspace = True)
    data_test = pd.read_csv(path_test, skipinitialspace = True)
    # x_train = pd.read_csv(path_x_train)
    # x_test = pd.read_csv(path_x_test)

    # x_train = x_train.values
    # x_test = x_test.values

    # y_train = pd.read_csv(path_y_train, header = None)
    # y_train = y_train.values
    # y_train = y_train.reshape(-1)
    return data_train, data_test

# Data preprocessing



In [4]:
class DataPreprocessor:
    def __init__(self):
        self.train_mean = None
        self.train_std = None
        self.num_cols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
        self.cat_cols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
        self.all_native_countries = None

    def transform_label(self, data_train):
        label_dict = {'<=50K': 0, '>50K': 1}
        data_train["income"] = data_train["income"].apply(lambda x: label_dict[x])
        return data_train 
        
    def do_one_hot_encoding(self, data_cat: pd.DataFrame, isTraining = False):
        if isTraining:
            self.all_native_countries = data_cat["native_country"].value_counts().index.sort_values().to_list()
            data_one_hot = pd.get_dummies(data_cat)
        else:
            #fix missing columns in testing dataset
            data_cat["native_country"] = data_cat["native_country"].astype(pd.CategoricalDtype(categories=self.all_native_countries))
            data_one_hot = pd.get_dummies(data_cat)
        return data_one_hot
        
    def normalize_data(self, X_data: pd.DataFrame, isTraining = False):
        if isTraining:
            self.train_mean = X_data.mean(axis = 0)
            self.train_std = X_data.std(axis = 0)
        normalized_data = (X_data - self.train_mean) / self.train_std
        return normalized_data

    def create_idx(self, data_train):
        idx_class1 = data_train[data_train["income"] == 0].index.to_numpy()
        idx_class2 = data_train[data_train["income"] == 1].index.to_numpy()
        return idx_class1, idx_class2

    def preprocess_train_data(self, data_train: pd.DataFrame):
        data_train = self.transform_label(data_train)
        #split data into numerical columns and categorical columns
        data_train_num = data_train[self.num_cols]
        data_train_cat = data_train[self.cat_cols]
        y_train = np.array(data_train["income"])

        #preprocessing - numerical
        data_train_num = self.normalize_data(data_train_num, isTraining=True)

        #preprocessing - categorical
        data_train_cat = self.do_one_hot_encoding(data_train_cat, isTraining=True)

        #combine
        data_train_preprocessed = pd.concat([data_train_num, data_train_cat], axis = 1)
        X_train = np.array(data_train_preprocessed)
        
        #create observation idx for class1 and class2
        idx_class1, idx_class2 = self.create_idx(data_train)
        return X_train, y_train, idx_class1, idx_class2

    def preprocess_test_data(self, data_test: pd.DataFrame):
        #split data into numerical columns and categorical columns
        data_test_num = data_test[self.num_cols]
        data_test_cat = data_test[self.cat_cols]

        #preprocessing - numerical
        data_test_num = self.normalize_data(data_test_num, isTraining=False)

        #preprocessing - categorical
        data_test_cat = self.do_one_hot_encoding(data_test_cat, isTraining=False)

        #combine
        data_test_preprocessed = pd.concat([data_test_num, data_test_cat], axis = 1)
        X_test = np.array(data_test_preprocessed)
        return X_test

In [5]:
data_train, data_test = load_data(path_train, path_test)
DP = DataPreprocessor()
X_train, y_train, idx_class1, idx_class2 = DP.preprocess_train_data(data_train)
X_test= DP.preprocess_test_data(data_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
np.cov(X_train.T)[:2, :2]

array([[ 1.        , -0.07664587],
       [-0.07664587,  1.        ]])

參考 [上課投影片](https://drive.google.com/file/d/1WKjqkJVPIxYh1REbzy6HeoGfZj-mj6NJ/view) P18 and P23



In [49]:
class GenerativeModel:
    def __init__(self):
        self.mu1 = None
        self.mu2 = None
        self.sigma1 = None
        self.sigma2 = None
        self.sigma_share = None
        self.num_class1 = None
        self.num_class2 = None

    def _compute_params(self, X_train, idx_class1, idx_class2):
        X_class1 = X_train[idx_class1]
        X_class2 = X_train[idx_class2]
        self.mu1 = X_class1.mean(axis = 0)
        self.mu2 = X_class2.mean(axis = 0)
        self.sigma1 = np.cov(X_class1.T)
        self.sigma2 = np.cov(X_class2.T)
        self.num_class1 = X_class1.shape[0]
        self.num_class2 = X_class2.shape[0]
        num_observation = X_train.shape[0]
        self.sigma_share = (self.num_class1 * self.sigma1 + self.num_class2 * self.sigma2) / num_observation

    def _compute_posterior_prob(self, X):
        sigma_inverse = np.linalg.inv(self.sigma_share)

        w = np.dot( (self.mu1-self.mu2), sigma_inverse)
        b = (-0.5) * np.dot(np.dot(self.mu1.T, sigma_inverse), self.mu1) + (0.5) * np.dot(np.dot(self.mu2.T, sigma_inverse), self.mu2) + np.log(float(self.num_class1)/self.num_class2)

        z = np.dot(X, w) + b
        pred = self._sigmoid(z)
        return pred 

    def _sigmoid(self, z):
        res = 1 / (1.0 + np.exp(-z))
        return np.clip(res, 1e-6, 1 - (1e-6))

    def train(self, X_train, y_train, idx_class1, idx_class2):
        self._compute_params(X_train, idx_class1, idx_class2)
        y_pred_train = self._compute_posterior_prob(X_train)
        y_pred_train = np.round(y_pred_train)
        result = (y_pred_train == y_train)
        acc = float(result.sum()) / result.shape[0]
        print(f"Training accuracy = {round(acc*100, 3)}%")
        return 

    def test(self, X_test):
        y_pred_test = self._compute_posterior_prob(X_test)
        y_pred_test = np.round(y_pred_test)
        return y_pred_test

# Predict results


In [50]:
GM = GenerativeModel()
GM.train(X_train, y_train, idx_class1, idx_class2)
y_pred = GM.test(X_test)

Training accuracy = 24.081%


In [46]:
def write_to_csv(y_pred, file_name):
    path = os.path.join("./submission/", file_name)
    with open(path, 'w', newline='') as csvf:
        # 建立 CSV 檔寫入器
        writer = csv.writer(csvf)
        writer.writerow(['id','label'])
        for i in range(int(y_pred.shape[0])):
            writer.writerow([i + 1, int(y_pred[i])])

In [47]:
#記得改名字
file_name = 'submission_1023_1.csv'
write_to_csv(y_pred, file_name)

### Tip for math problem
[p1](https://people.eecs.berkeley.edu/~jrs/189/exam/mids14.pdf)  
[p2&3](https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/other-readings/chapter13.pdf)  
[p3](https://stats.stackexchange.com/questions/351549/maximum-likelihood-estimators-multivariate-gaussian)