In [1]:
import math
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load data

In [2]:
data_path = "./data/"
path_train = os.path.join(data_path, "train.csv")
#path_x_test = os.path.join(data_path, "X_test")
path_test = os.path.join(data_path, "test.csv")

In [3]:
def load_data(path_train, path_test):
    data_train = pd.read_csv(path_train, skipinitialspace = True)
    data_test = pd.read_csv(path_test, skipinitialspace = True)
    return data_train, data_test

# Data preprocessing

In [4]:
from sklearn.feature_selection import chi2
from sklearn.pipeline import make_pipeline

In [4]:
class DataPreprocessor:
    def __init__(self):
        self.train_mean = None
        self.train_std = None
        self.num_cols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
        self.cat_cols = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
        self.all_native_countries = None
        #used for remove less frequent categorical columns
        self.keep_cols = None

    def _transform_label(self, data_train):
        label_dict = {'<=50K': 0, '>50K': 1}
        data_train_ = data_train.copy()
        data_train_["income"] = data_train_["income"].apply(lambda x: label_dict[x])
        return data_train_ 
        
    def _do_one_hot_encoding(self, data_cat: pd.DataFrame, isTraining = False):
        if isTraining:
            self.all_native_countries = data_cat["native_country"].value_counts().index.sort_values().to_list()
            data_one_hot = pd.get_dummies(data_cat)
        else:
            #fix missing columns in testing dataset
            data_cat["native_country"] = data_cat["native_country"].astype(pd.CategoricalDtype(categories=self.all_native_countries))
            data_one_hot = pd.get_dummies(data_cat)
        return data_one_hot
        
    def _normalize_data(self, X_data: pd.DataFrame, isTraining = False):
        if isTraining:
            self.train_mean = X_data.mean(axis = 0)
            self.train_std = X_data.std(axis = 0)
        normalized_data = (X_data - self.train_mean) / self.train_std
        return normalized_data

    def _remove_less_frequent_cat_features(self, X_data_cat: pd.DataFrame, lower_bound_ratio = 0.5, isTraining = False):
        if isTraining:
            cols = X_data_cat.columns
            keep_cols_bool = (X_data_cat.sum(axis = 0) / X_data_cat.shape[0]) > lower_bound_ratio
            self.keep_cols = cols[keep_cols_bool]
        return X_data_cat[self.keep_cols]

    def preprocess_train_data(self, data_train: pd.DataFrame):
        #avoid changes in original dataset
        data_train_ = self._transform_label(data_train)
        #split data into numerical columns and categorical columns
        data_train_num = data_train_[self.num_cols]
        data_train_cat = data_train_[self.cat_cols]
        y_train = np.array(data_train_["income"])

        #preprocessing - numerical
        data_train_num = self._normalize_data(data_train_num, isTraining=True)

        #preprocessing - categorical
        data_train_cat = self._do_one_hot_encoding(data_train_cat, isTraining=True)
        #data_train_cat = self._remove_less_frequent_cat_features(data_train_cat, isTraining=True)
        #combine
        data_train_preprocessed = pd.concat([data_train_num, data_train_cat], axis = 1)
        #X_train = np.array(data_train_preprocessed)
        return data_train_preprocessed, y_train

    def preprocess_test_data(self, data_test: pd.DataFrame):
        #split data into numerical columns and categorical columns
        data_test_num = data_test[self.num_cols]
        data_test_cat = data_test[self.cat_cols]

        #preprocessing - numerical
        data_test_num = self._normalize_data(data_test_num, isTraining=False)

        #preprocessing - categorical
        data_test_cat = self._do_one_hot_encoding(data_test_cat, isTraining=False)
        #data_test_cat = self._remove_less_frequent_cat_features(data_test_cat, isTraining=False)

        #combine
        data_test_preprocessed = pd.concat([data_test_num, data_test_cat], axis = 1)
        #X_test = np.array(data_test_preprocessed)
        return data_test_preprocessed

In [5]:
data_train, data_test = load_data(path_train, path_test)
DP = DataPreprocessor()
df_train, y_train = DP.preprocess_train_data(data_train)
df_test = DP.preprocess_test_data(data_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
all_cols_train = df_train.columns
X_train = np.array(df_train)
X_test = np.array(df_test)

In [7]:
print(X_train.shape)
print(X_test.shape)
print(X_train)

(32561, 108)
(16281, 108)
[[ 0.03067009 -1.06359441  1.13472134 ...  1.          0.
   0.        ]
 [ 0.83709613 -1.00869151  1.13472134 ...  1.          0.
   0.        ]
 [-0.04264137  0.24507474 -0.42005317 ...  1.          0.
   0.        ]
 ...
 [ 1.42358779 -0.3587719  -0.42005317 ...  1.          0.
   0.        ]
 [-1.2156247   0.11095818 -0.42005317 ...  1.          0.
   0.        ]
 [ 0.98371904  0.9298783  -0.42005317 ...  1.          0.
   0.        ]]


# Feature selection

## Embedded methods
- LassoCV
- RidgeCV
- ElasticNetCV

In [8]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import SelectFromModel

In [9]:
clf_l1 = LogisticRegressionCV(max_iter = 100, cv = 5, penalty='l1', solver='saga')
clf_l1.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l1', random_state=None, refit=True, scoring=None,
                     solver='saga', tol=0.0001, verbose=0)

In [10]:
coef_l1=pd.Series(clf_l1.coef_.flatten(),index=all_cols_train)
coef_l1_value_sort = abs(coef_l1).sort_values()

In [11]:
clf_l2 = LogisticRegressionCV(max_iter = 100, cv = 5, penalty='l2', solver='saga')
clf_l2.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='saga', tol=0.0001, verbose=0)

In [12]:
coef_l2=pd.Series(clf_l2.coef_.flatten(),index=all_cols_train)
coef_l2_value_sort = abs(coef_l2).sort_values()

In [13]:
l1_ratio_grid = np.linspace(0, 1, num = 10)
clf_EN = LogisticRegressionCV(max_iter = 100, refit = True
                    , penalty='elasticnet', cv=5
                    , l1_ratios = l1_ratio_grid, solver = 'saga')
clf_EN.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='elasticnet', random_state=None, refit=True,
                     scoring=None, solver='saga', tol=0.0001, verbose=0)

In [14]:
coef_EN = pd.Series(clf_EN.coef_.flatten(),index=all_cols_train)
coef_EN_value_sort = abs(coef_EN).sort_values()

In [15]:
#select half features in each model
selector_l1 = SelectFromModel(clf_l1, prefit=True, threshold="median")
selector_l2 = SelectFromModel(clf_l2, prefit=True, threshold="median")
selector_EN = SelectFromModel(clf_EN, prefit=True, threshold="median")

In [16]:
features_select_by_l1 = set(all_cols_train[selector_l1.get_support()])
features_select_by_l2 = set(all_cols_train[selector_l2.get_support()])
features_select_by_EN = set(all_cols_train[selector_EN.get_support()])
features_intersect = set.intersection(features_select_by_l1, features_select_by_l2, features_select_by_EN)
features_union = set.union(features_select_by_l1, features_select_by_l2, features_select_by_EN)

In [17]:
print(f"# of features selected by l1-logistic regression = {len(features_select_by_l1)}")
print(f"# of features selected by l2-logistic regression = {len(features_select_by_l2)}")
print(f"# of features selected by EN-logistic regression = {len(features_select_by_EN)}")
print(f"# of features selected by intersecting the results above = {len(features_intersect)}")
print(f"# of features selected by unioning the results above = {len(features_union)}")

# of features selected by l1-logistic regression = 54
# of features selected by l2-logistic regression = 54
# of features selected by EN-logistic regression = 54
# of features selected by intersecting the results above = 47
# of features selected by unioning the results above = 60


## Feature selection result
- by l1, l2, elastic net logistic regression
- select 60 features finally

## Embedded methods
- Random forest

# handcraft logistic regression with gradient descent

In [53]:
def write_to_csv(y_pred, file_name):
    path = os.path.join("./submission/", file_name)
    with open(path, 'w', newline='') as csvf:
        # 建立 CSV 檔寫入器
        writer = csv.writer(csvf)
        writer.writerow(['id','label'])
        for i in range(int(y_pred.shape[0])):
            writer.writerow([i + 1, int(y_pred[i])])

# Training with all data

## Modeling
- baseline: 75.92%
- logistic regression: 85.9%(85.38%)

## Logistic regression CV (sklearn)
- 可以自動選取最好的params grid
    - Cs
    - l1_ratios
- 其他參數
    - penalty: {‘l1’, ‘l2’, ‘elasticnet’, ‘none’}, default=’l2’
    - max_iter: int, default=100
    - n_jobs: default=None, **請設置為-1**
    - warm_start: default=False, **請設置為True**, 讓模型可以不用重新訓練
    - l1_ratio: float, default=None, Setting l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'. For 0 < l1_ratio <1, the penalty is a combination of L1 and L2.
    - Cs: int or list of floats, default=10, describes the inverse of regularization strength
    - cv: int or cross-validation generator, default=None, The default cross-validation generator used is **Stratified K-Folds**.

## Attributes
- Cs_: Array of C i.e. inverse of regularization parameter values used for cross-validation.
- coefs_paths_: 不同參數組合下的coef
- scores_: values as the grid of scores obtained during cross-validating each fold

In [91]:
#refit: 用最好的超參數重新訓練一遍
#penalty='elasticnet': combine l1 and l2
#5-fold cross validation
#自動tune Cs, regularization的強度
l1_ratio_grid = np.linspace(0, 1, num = 10)
clf = LogisticRegressionCV(max_iter = 200, refit = True
                    , penalty='elasticnet', cv=5
                    , l1_ratios = l1_ratio_grid, solver = 'saga')
clf.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
                     max_iter=200, multi_class='auto', n_jobs=None,
                     penalty='elasticnet', random_state=None, refit=True,
                     scoring=None, solver='saga', tol=0.0001, verbose=0)

In [93]:
#shape = (n_folds, n_cs)
clf.scores_

{'>50K': array([[[0.7673883 , 0.76262859, 0.75909719, 0.75909719, 0.75909719,
          0.75909719, 0.75909719, 0.75909719, 0.75909719, 0.75909719],
         [0.8011669 , 0.8011669 , 0.79932443, 0.79579303, 0.79348994,
          0.78980501, 0.78765546, 0.7842776 , 0.78182097, 0.77706126],
         [0.81621373, 0.81575311, 0.81513895, 0.81421772, 0.81467834,
          0.81498541, 0.81606019, 0.81621373, 0.81590665, 0.81559957],
         [0.81943805, 0.81928451, 0.81913097, 0.81928451, 0.81959159,
          0.81974513, 0.81989866, 0.82035928, 0.82051282, 0.82066636],
         [0.82128052, 0.82143405, 0.82143405, 0.82158759, 0.82143405,
          0.82143405, 0.82112698, 0.82112698, 0.82097344, 0.82097344],
         [0.82143405, 0.82143405, 0.82143405, 0.82143405, 0.82143405,
          0.82143405, 0.82143405, 0.82158759, 0.82158759, 0.82158759],
         [0.82143405, 0.82143405, 0.82143405, 0.82143405, 0.82143405,
          0.82143405, 0.82143405, 0.82143405, 0.82143405, 0.82143405],
     

In [88]:
print(f"Training Accuracy = {round(clf.score(X_train, y_train) * 100, 4)}%")

Testing Accuracy = 82.3715%


# Training with intersect features (47 features)

In [20]:
features_intersect_list = list(features_intersect)
df_intersect = df_train[features_intersect_list]
X_train_intersect = np.array(df_intersect)

## Logistic regression (sklearn)

In [198]:
#refit: 用最好的超參數重新訓練一遍
#penalty='elasticnet': combine l1 and l2
#5-fold cross validation
#自動tune Cs, regularization的強度
l1_ratio_grid = np.linspace(0, 1, num = 10)
clf = LogisticRegressionCV(max_iter = 200, refit = True
                    , penalty='elasticnet', cv=5
                    , l1_ratios = l1_ratio_grid, solver = 'saga')
clf.fit(X_train_intersect, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
                     max_iter=200, multi_class='auto', n_jobs=None,
                     penalty='elasticnet', random_state=None, refit=True,
                     scoring=None, solver='saga', tol=0.0001, verbose=0)

In [200]:
print(f"Training Accuracy = {round(clf.score(X_train_intersect, y_train) * 100, 4)}%")

Training Accuracy = 84.936%


## handcraft logistic regression 

In [23]:
from handcraft_logistic_regression import LogisticRegression

In [24]:
batch_size_list = [128, 256, 512, 1024, 2048]
epoch_size_list = [20, 30, 50, 100]
learning_rate_list = [0.01, 0.05, 0.1, 0.5, 1]
loss_list = []
acc_list = []

#Model saving
best_model = None
global_best_loss = 99999
global_best_acc = 0

#Hyperparameter saving
best_batch_size = 0
best_epoch_size = 0
best_learning_rate = 0

for batch_size in batch_size_list:
    for epoch_size in epoch_size_list:
        for learning_rate in learning_rate_list:
            print("=" * 10,"Start training model", "="*10)
            print(f"batch size = {batch_size}, epoch_size = {epoch_size}, learningrate = {learning_rate}")
            LR = LogisticRegression()
            LR.train(X_train_intersect, y_train, batch_size, epoch_size, learning_rate)
            loss_list.append(LR.best_valid_loss)
            acc_list.append(LR.best_valid_acc)
            if LR.best_valid_loss < global_best_loss:
                best_model = LR
                best_batch_size = batch_size
                best_stop_epoch_size = LR.best_epoch
                best_epoch_size = epoch_size
                best_learning_rate = learning_rate
                global_best_loss = LR.best_valid_loss
                global_best_acc = LR.best_valid_acc

batch size = 128, epoch_size = 20, learningrate = 0.01
Epoch 1, train loss = 15.0266 (Accuracy: 45.617%), valid loss = 15.1491 (Accuracy: 45.174%)
Epoch 2, train loss = 13.4615 (Accuracy: 51.281%), valid loss = 13.2371 (Accuracy: 52.093%)
Epoch 3, train loss = 12.1134 (Accuracy: 56.16%), valid loss = 11.9784 (Accuracy: 56.649%)
Epoch 4, train loss = 11.0563 (Accuracy: 59.986%), valid loss = 11.2911 (Accuracy: 59.136%)
Epoch 5, train loss = 10.124 (Accuracy: 63.36%), valid loss = 10.5699 (Accuracy: 61.746%)
Epoch 6, train loss = 9.5033 (Accuracy: 65.606%), valid loss = 9.3933 (Accuracy: 66.005%)
Epoch 7, train loss = 8.8087 (Accuracy: 68.12%), valid loss = 8.7314 (Accuracy: 68.4%)
Epoch 8, train loss = 8.2268 (Accuracy: 70.226%), valid loss = 8.3807 (Accuracy: 69.669%)
Epoch 9, train loss = 7.8073 (Accuracy: 71.744%), valid loss = 7.6707 (Accuracy: 72.239%)
Epoch 10, train loss = 7.3321 (Accuracy: 73.464%), valid loss = 7.419 (Accuracy: 73.15%)
Epoch 11, train loss = 7.0472 (Accuracy: 7

In [26]:
print("=" * 10, " Best Model result ", "=" * 10)
print(f"Batch Size = {best_batch_size}, Epoch Size = {best_epoch_size} (Actually running {best_stop_epoch_size} epoch), Learning rate = {best_learning_rate}")
print(f"Validation loss = {round(global_best_loss, 3)} (Accuracy: {round(global_best_acc*100, 3)}%)")

Batch Size = 512, Epoch Size = 30 (Actually running 21 epoch), Learning rate = 0.5
Validation loss = 3.9 (Accuracy: 85.884%)


In [27]:
LR_final = LogisticRegression()
LR_final.train_with_full_data(X_train_intersect, y_train, best_batch_size, best_stop_epoch_size, best_learning_rate, verbose=True)

Epoch 1, train loss = 4.2099 (Accuracy: 84.764%)
Epoch 2, train loss = 4.2048 (Accuracy: 84.782%)
Epoch 3, train loss = 4.2039 (Accuracy: 84.785%)
Epoch 4, train loss = 4.1912 (Accuracy: 84.832%)
Epoch 5, train loss = 4.2031 (Accuracy: 84.789%)
Epoch 6, train loss = 4.187 (Accuracy: 84.847%)
Epoch 7, train loss = 4.2005 (Accuracy: 84.798%)
Epoch 8, train loss = 4.1683 (Accuracy: 84.914%)
Epoch 9, train loss = 4.1844 (Accuracy: 84.856%)
Epoch 10, train loss = 4.1793 (Accuracy: 84.875%)
Epoch 11, train loss = 4.1904 (Accuracy: 84.835%)
Epoch 12, train loss = 4.1785 (Accuracy: 84.878%)
Epoch 13, train loss = 4.1691 (Accuracy: 84.911%)
Epoch 14, train loss = 4.1844 (Accuracy: 84.856%)
Epoch 15, train loss = 4.1657 (Accuracy: 84.924%)
Epoch 16, train loss = 4.164 (Accuracy: 84.93%)
Epoch 17, train loss = 4.1734 (Accuracy: 84.896%)
Epoch 18, train loss = 4.1717 (Accuracy: 84.902%)
Epoch 19, train loss = 4.164 (Accuracy: 84.93%)
Epoch 20, train loss = 4.1615 (Accuracy: 84.939%)
Epoch 21, trai

In [None]:
df_test_intersect = df_test[features_intersect_list]
X_test_intersect = np.array(df_test_intersect)
y_pred = LR_final.predict(X_test_intersect)

In [None]:
#記得改名字
file_name = 'submission_1023_3.csv'
write_to_csv(y_pred, file_name)

## Random forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [196]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(X_train_intersect, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [197]:
RFC.score(X_train_intersect, y_train)

0.9546696968766315

# Training with union features (60 features)

In [37]:
features_union = ['native_country_Nicaragua',
 'occupation_Prof-specialty',
 'workclass_Self-emp-not-inc',
 'occupation_Priv-house-serv',
 'native_country_Canada',
 'native_country_China',
 'workclass_Federal-gov',
 'education_Preschool',
 'marital_status_Never-married',
 'hours_per_week',
 'native_country_Mexico',
 'native_country_Dominican-Republic',
 'capital_gain',
 'occupation_Other-service',
 'workclass_Self-emp-inc',
 'native_country_Greece',
 'marital_status_Married-AF-spouse',
 'native_country_Columbia',
 'occupation_Sales',
 'education_1st-4th',
 'occupation_Exec-managerial',
 'sex_Male',
 'native_country_Outlying-US(Guam-USVI-etc)',
 'education_Assoc-acdm',
 'marital_status_Married-civ-spouse',
 'sex_Female',
 'marital_status_Married-spouse-absent',
 'occupation_Farming-fishing',
 'occupation_Tech-support',
 'education_num',
 'education_Prof-school',
 'native_country_Peru',
 'education_5th-6th',
 'native_country_Yugoslavia',
 'native_country_Philippines',
 'marital_status_Divorced',
 'native_country_Germany',
 'relationship_Wife',
 'native_country_Japan',
 'marital_status_Separated',
 'native_country_France',
 'native_country_Cambodia',
 'workclass_Without-pay',
 'relationship_Own-child',
 'native_country_Italy',
 'native_country_South',
 'relationship_Other-relative',
 'native_country_Cuba',
 'workclass_Private',
 'occupation_Handlers-cleaners',
 'native_country_England',
 'occupation_Protective-serv',
 'race_Asian-Pac-Islander',
 'age',
 'capital_loss',
 'native_country_El-Salvador',
 'native_country_Ireland',
 'native_country_United-States',
 'native_country_Vietnam',
 'race_Amer-Indian-Eskimo']

## handcraft logistic regression 

In [15]:
batch_size_list = [128, 256, 512, 1024, 2048]
epoch_size_list = [20, 30, 50, 100]
learning_rate_list = [0.01, 0.05, 0.1, 0.5, 1]
loss_list = []
acc_list = []

#Model saving
best_model = None
global_best_loss = 99999
global_best_acc = 0

#Hyperparameter saving
best_batch_size = 0
best_epoch_size = 0
best_learning_rate = 0

for batch_size in batch_size_list:
    for epoch_size in epoch_size_list:
        for learning_rate in learning_rate_list:
            print("=" * 10,"Start training model", "="*10)
            print(f"batch size = {batch_size}, epoch_size = {epoch_size}, learningrate = {learning_rate}")
            LR = LogisticRegression()
            LR.train(X_train_union, y_train, batch_size, epoch_size, learning_rate)
            loss_list.append(LR.best_valid_loss)
            acc_list.append(LR.best_valid_acc)
            if LR.best_valid_loss < global_best_loss:
                best_model = LR
                best_batch_size = batch_size
                best_stop_epoch_size = LR.best_epoch
                best_epoch_size = epoch_size
                best_learning_rate = learning_rate
                global_best_loss = LR.best_valid_loss
                global_best_acc = LR.best_valid_acc

batch size = 128, epoch_size = 20, learningrate = 0.01
Epoch 1, train loss = 18.1107 (Accuracy: 34.455%), valid loss = 17.822 (Accuracy: 35.5%)
Epoch 2, train loss = 14.9878 (Accuracy: 45.757%), valid loss = 15.2708 (Accuracy: 44.733%)
Epoch 3, train loss = 12.8129 (Accuracy: 53.628%), valid loss = 12.6007 (Accuracy: 54.397%)
Epoch 4, train loss = 10.9423 (Accuracy: 60.398%), valid loss = 10.9942 (Accuracy: 60.211%)
Epoch 5, train loss = 9.6827 (Accuracy: 64.957%), valid loss = 9.4894 (Accuracy: 65.657%)
Epoch 6, train loss = 8.7529 (Accuracy: 68.322%), valid loss = 8.5702 (Accuracy: 68.984%)
Epoch 7, train loss = 7.9006 (Accuracy: 71.407%), valid loss = 8.0158 (Accuracy: 70.99%)
Epoch 8, train loss = 7.4218 (Accuracy: 73.14%), valid loss = 7.2493 (Accuracy: 73.764%)
Epoch 9, train loss = 6.9259 (Accuracy: 74.934%), valid loss = 7.04 (Accuracy: 74.521%)
Epoch 10, train loss = 6.6495 (Accuracy: 75.935%), valid loss = 6.4884 (Accuracy: 76.518%)
Epoch 11, train loss = 6.3756 (Accuracy: 76

In [29]:
print("=" * 10, " Best Model result ", "=" * 10)
print(f"Batch Size = {best_batch_size}, Epoch Size = {best_epoch_size} (Actually running {best_stop_epoch_size} epoch), Learning rate = {best_learning_rate}")
print(f"Validation loss = {round(global_best_loss, 3)} (Accuracy: {round(global_best_acc*100, 3)}%)")

Batch Size = 512, Epoch Size = 30 (Actually running 21 epoch), Learning rate = 0.5
Validation loss = 3.9 (Accuracy: 85.884%)


In [17]:
count = 0
for batch_size in batch_size_list:
    for epoch_size in epoch_size_list:
        for learning_rate in learning_rate_list:
            loss = loss_list[count]
            acc = acc_list[count]
            print(f"When BS = {batch_size}, ES = {epoch_size}, LR = {learning_rate}, loss = {round(loss, 4)} (Accuracy = {round(acc*100, 3)}%)")
            count+=1

When BS = 128, ES = 20, LR = 0.01, loss = 5.0488 (Accuracy = 81.728%)
When BS = 128, ES = 20, LR = 0.05, loss = 4.0532 (Accuracy = 85.331%)
When BS = 128, ES = 20, LR = 0.1, loss = 4.0362 (Accuracy = 85.393%)
When BS = 128, ES = 20, LR = 0.5, loss = 4.0079 (Accuracy = 85.495%)
When BS = 128, ES = 20, LR = 1, loss = 4.0673 (Accuracy = 85.28%)
When BS = 128, ES = 30, LR = 0.01, loss = 4.7376 (Accuracy = 82.854%)
When BS = 128, ES = 30, LR = 0.05, loss = 4.0843 (Accuracy = 85.219%)
When BS = 128, ES = 30, LR = 0.1, loss = 4.0645 (Accuracy = 85.29%)
When BS = 128, ES = 30, LR = 0.5, loss = 3.923 (Accuracy = 85.802%)
When BS = 128, ES = 30, LR = 1, loss = 3.9174 (Accuracy = 85.822%)
When BS = 128, ES = 50, LR = 0.01, loss = 4.1917 (Accuracy = 84.83%)
When BS = 128, ES = 50, LR = 0.05, loss = 4.0249 (Accuracy = 85.434%)
When BS = 128, ES = 50, LR = 0.1, loss = 3.9372 (Accuracy = 85.751%)
When BS = 128, ES = 50, LR = 0.5, loss = 3.9796 (Accuracy = 85.597%)
When BS = 128, ES = 50, LR = 1, loss

In [49]:
LR_final = LogisticRegression()
LR_final.train_with_full_data(X_train_intersect, y_train, best_batch_size, best_stop_epoch_size, best_learning_rate, verbose=True)

Epoch 1, train loss = 4.1623 (Accuracy: 84.936%)
Epoch 2, train loss = 4.1106 (Accuracy: 85.123%)
Epoch 3, train loss = 4.1191 (Accuracy: 85.093%)
Epoch 4, train loss = 4.1216 (Accuracy: 85.083%)
Epoch 5, train loss = 4.1165 (Accuracy: 85.102%)
Epoch 6, train loss = 4.1309 (Accuracy: 85.05%)
Epoch 7, train loss = 4.0699 (Accuracy: 85.271%)
Epoch 8, train loss = 4.1072 (Accuracy: 85.136%)
Epoch 9, train loss = 4.0945 (Accuracy: 85.182%)


In [50]:
df_test_union = df_test[features_union]
X_test_union = np.array(df_test_union)

In [51]:
y_pred = LR_final.predict(X_test_union)

In [None]:
#記得改名字
file_name = 'submission_1023_2.csv'
write_to_csv(y_pred, file_name)

## Random forest

In [31]:
from sklearn.model_selection import RandomizedSearchCV

In [32]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [37]:
RFC = RandomForestClassifier()
RFC_CV = RandomizedSearchCV(estimator = RFC, param_distributions = random_grid, 
n_iter = 100, cv = 5, verbose=2, random_state=42)
RFC_CV.fit(X_train_intersect, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=  12.8s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.7s remaining:    0.0s


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=  12.0s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=  14.2s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=  12.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=  11.7s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=2000, min_sample

MemoryError: could not allocate 458752 bytes



[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False, total=  49.5s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 


numpy.core._exceptions._ArrayMemoryError: Unable to allocate 204. KiB for an array with shape (26049,) and data type float64



[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False, total=  27.9s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False, total=  57.5s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=50, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=50, bootstrap=False, total= 1.0min
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=50, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=50, bootstrap=False, total=  56.5s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=50, bootstrap=False 
[CV]  n_estimat

MemoryError: could not allocate 917504 bytes



[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=  32.2s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=  46.2s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=  37.7s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=  40.1s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=1

In [1]:
RFC_CV.best_params_

NameError: name 'RFC_CV' is not defined

In [None]:
RFC_CV.best_params_

In [None]:
import pickle
with open('model/RFC_CV_1023.pickle', 'wb') as f:
    pickle.dump(RFC_CV, f)

## Neural network

## Ensembling