# B06702064 會計五 林聖硯
# Best Model

In [1]:
import os
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler

# Load data

In [2]:
# Locations of the training and test CSV files (relative paths).
path_train, path_test = "train.csv", "test.csv"

In [11]:
def load_data(path_train, path_test):
    """Read the training and test CSV files into pandas DataFrames.

    Both files are parsed with ``skipinitialspace=True``, so whitespace
    that follows a delimiter is not kept as part of the field value.

    Args:
        path_train: Path of the training CSV file.
        path_test: Path of the test CSV file.

    Returns:
        A ``(data_train, data_test)`` tuple of DataFrames.
    """
    frames = tuple(
        pd.read_csv(csv_path, skipinitialspace=True)
        for csv_path in (path_train, path_test)
    )
    return frames


def write_to_csv(y_pred, file_path):
    """Write predictions to ``file_path`` as a two-column CSV.

    The file starts with the header row ``id,label``; each following row
    holds a 1-based running id and the prediction converted to ``int``.

    Args:
        y_pred: Iterable of predicted labels (NumPy array, pandas Series,
            list, ...). Values are consumed in iteration order, so a Series
            with a non-default index is written positionally.
        file_path: Destination path; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvf:
        writer = csv.writer(csvf)
        writer.writerow(['id', 'label'])
        writer.writerows(
            [row_id, int(label)] for row_id, label in enumerate(y_pred, start=1)
        )

# Data preprocessing

In [4]:
class DataPreprocessor:
    """Convert the raw income DataFrames into numeric feature matrices.

    The training pass fits a ``RobustScaler`` on the numeric columns and
    learns, for every categorical column, the mean of ``income`` per
    category (target encoding). The test pass reuses that fitted state.

    Feature columns are emitted in the order
    ``order_cat_cols + num_cols + cat_cols``.
    """

    def __init__(self):
        self.num_cols = ["age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week"]
        self.order_cat_cols = ["education_num"]
        self.cat_cols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
        # column name -> DataFrame indexed by category with an "income" column
        # holding the mean training label of that category.
        self.target_ratio = dict()
        # Overall mean of "income" in the training data; used as the encoding
        # of categories that were not present during training.
        self.target_prior = None
        self.robust_scaler = None

    def transform_label(self, data_train):
        """Return a copy of ``data_train`` with ``income`` mapped to 0/1.

        ``'<=50K'`` becomes 0 and ``'>50K'`` becomes 1. Any other label
        raises ``KeyError``. The input frame is left untouched.
        """
        label_dict = {'<=50K': 0, '>50K': 1}
        data_train_ = data_train.copy()
        data_train_["income"] = data_train_["income"].apply(lambda x: label_dict[x])
        return data_train_

    def do_target_encoding(self, data, isTraining=False):
        """Replace each categorical value by its mean training ``income``.

        Args:
            data: DataFrame containing every column in ``self.cat_cols``
                (and a numeric ``income`` column when ``isTraining``).
            isTraining: When True, (re)compute the per-category means and
                the overall prior from ``data`` before encoding.

        Returns:
            A new DataFrame with the categorical columns encoded as floats.
            ``data`` itself is not modified, so the same frame can be
            encoded repeatedly.

        Raises:
            KeyError: If called with ``isTraining=False`` before any
                training pass has populated ``self.target_ratio``.
        """
        if isTraining:
            self.target_prior = float(data["income"].mean())
            for col in self.cat_cols:
                self.target_ratio[col] = data[[col, "income"]].groupby(col).mean()

        # Work on a copy: overwriting the caller's frame would make a second
        # call see floats instead of category names.
        encoded = data.copy()
        for col in self.cat_cols:
            category_means = self.target_ratio[col]["income"]
            # Categories absent from the training data (or missing values)
            # map to NaN and fall back to the overall training mean.
            encoded[col] = (
                data[col].map(category_means).fillna(self.target_prior).astype(float)
            )
        return encoded

    def do_robust_scaling(self, data, isTraining=False):
        """Scale ``data`` with a ``RobustScaler``.

        When ``isTraining`` is True a new scaler is fitted on ``data`` and
        stored on the instance; otherwise the previously fitted scaler is
        applied.
        """
        if isTraining:
            self.robust_scaler = RobustScaler()
            self.robust_scaler.fit(data)
        data_scaled = self.robust_scaler.transform(data)
        return data_scaled

    def preprocess_train_data(self, data_train):
        """Fit the preprocessing state on ``data_train`` and transform it.

        Returns:
            ``(X_train, y_train)`` where ``X_train`` is a 2-D array of the
            ordinal, scaled numeric and target-encoded categorical columns
            (in that order) and ``y_train`` is the 0/1 ``income`` Series.
        """
        data_train = self.transform_label(data_train)
        X_train_order = np.array(data_train[self.order_cat_cols])

        X_train_num = np.array(data_train[self.num_cols])
        X_train_num_scaled = self.do_robust_scaling(X_train_num, isTraining=True)

        X_train_encoded = self.do_target_encoding(data_train, isTraining=True)
        X_train_cat = np.array(X_train_encoded[self.cat_cols])

        # Combine the three feature groups column-wise.
        X_train = np.concatenate([X_train_order, X_train_num_scaled, X_train_cat], axis=1)
        y_train = data_train["income"]
        return X_train, y_train

    def preprocess_test_data(self, data_test):
        """Transform ``data_test`` with the state fitted on the training data.

        Returns:
            A 2-D array with the same column layout as the training matrix.
        """
        X_test_order = np.array(data_test[self.order_cat_cols])

        X_test_num = np.array(data_test[self.num_cols])
        X_test_num_scaled = self.do_robust_scaling(X_test_num, isTraining=False)

        X_test_encoded = self.do_target_encoding(data_test, isTraining=False)
        X_test_cat = np.array(X_test_encoded[self.cat_cols])

        # Combine the three feature groups column-wise.
        X_test = np.concatenate([X_test_order, X_test_num_scaled, X_test_cat], axis=1)
        return X_test

In [5]:
# Fit the preprocessor on the training frame, then apply the fitted state
# to the test frame.
DP = DataPreprocessor()
data_train, data_test = load_data(path_train=path_train, path_test=path_test)
X_train, y_train = DP.preprocess_train_data(data_train)
X_test = DP.preprocess_test_data(data_test)

# Modeling

In [7]:
# Hyper-parameter grid explored by the random-forest search.
# Tree counts: 5 evenly spaced values from 100 to 250, truncated to int.
n_estimators = list(map(int, np.linspace(start=100, stop=250, num=5)))
# Maximum tree depth: 6 evenly spaced values from 5 to 10.
max_depth = list(map(int, np.linspace(5, 10, num=6)))
# Minimum number of samples needed to split an internal node.
min_samples_split = [2, 5]
# Minimum number of samples required in a leaf.
min_samples_leaf = [1, 2, 4]
params_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    oob_score=[True],
)

In [8]:
# 5-fold cross-validated grid search over params_grid; n_jobs=-1 requests
# all available workers and verbose=2 prints progress while fitting.
RFC = RandomForestClassifier(random_state=201)
RFC_CV = GridSearchCV(
    estimator=RFC,
    param_grid=params_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
RFC_CV.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 56.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=201,
                                 

In [9]:
# Score of the fitted search on the same data it was fitted on, so this is
# a training-set figure rather than a held-out estimate.
RFC_CV.score(X_train, y_train)

0.8772150732471361

# Write to csv

In [12]:
# Predict labels for the test features and store them in a CSV file.
file_name = 'prediction.csv'
y_pred = RFC_CV.predict(X_test)
write_to_csv(y_pred, file_name)