In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.experimental import enable_hist_gradient_boosting
import sklearn.ensemble
import sklearn.metrics
import joblib
# Notebook-wide display setting: print NumPy floats with 2 decimals and
# suppress scientific notation so printed arrays stay compact.
np.set_printoptions(precision=2, suppress=True)

In [2]:
class G:
    """Bare namespace; notebook-wide variables are attached as class attributes."""

In [3]:
# Load the feature tables (all columns parsed as int32) into the shared namespace.
G.df_train_full = pd.read_csv('X_train', dtype=np.int32)
G.df_test_full = pd.read_csv('X_test', dtype=np.int32)

# Y_train is parsed as one int8 label per line. Reading it inside a context
# manager closes the file handle deterministically instead of leaking it.
with open('Y_train') as label_file:
    G.y_train = np.array(label_file.read().strip('\n').split('\n'),
                         dtype=np.int8)

# Fail early with a clear message if labels and training rows are misaligned.
assert len(G.y_train) == len(G.df_train_full), \
    'Y_train and X_train must contain the same number of rows'

In [4]:
def normalize(df, means, stds, cols=None):
    """Standardize selected columns of ``df`` as ``(x - mean) / std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in ``cols``. It is not
        modified; a copy is returned.
    means, stds : pandas.Series
        Per-column statistics indexed by column name; only the entries for
        ``cols`` are used.
    cols : sequence of str, optional
        Columns to standardize. Defaults to the five columns
        ``age``, ``fnlwgt``, ``hours_per_week``, ``capital_gain`` and
        ``capital_loss``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns standardized; all other
        columns are left untouched.
    """
    if cols is None:
        cols = ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss']
    else:
        cols = list(cols)

    scale = stds[cols]
    # A constant column has std 0 (and a single-row frame has std NaN).
    # Dividing by that would fill the feature with NaN/inf, so such columns
    # are only centred (divided by 1) instead.
    scale = scale.where(scale > 0, 1.0)

    df = df.copy()
    df[cols] = (df[cols] - means[cols]) / scale
    return df

def extract(df):
    """Build the design matrix for ``df`` with polynomial expansions.

    Column layout of the returned float64 array, where ``c`` is the number
    of columns in ``df``:

    * ``0``                 - constant 1 (bias)
    * ``1 .. c``            - the raw values of ``df``
    * next ``5*5`` columns  - all pairwise products (squares and both
      orderings of each cross term) of five selected feature columns
    * last ``5`` columns    - cubes of those five columns
    """
    n_rows, n_cols = len(df), len(df.columns)
    n_poly = 5
    quad_start = 1 + n_cols
    cube_start = quad_start + n_poly * n_poly
    width = cube_start + n_poly

    X = np.zeros((n_rows, width), dtype=np.float64)
    X[:, 0] = 1  # bias
    X[:, 1:quad_start] = df.values

    # Positions in X (i.e. df column position + 1) of the five features that
    # get expanded. The original note names them as age, fnlwgt,
    # hours_per_week, capital_gain, capital_loss -- presumably these are the
    # continuous columns; the exact order depends on the input file (TODO confirm).
    idx = np.array([1, 2, 4, 5, 6])

    # Second-order terms: per-row outer product of the selected columns.
    selected = X[:, idx]
    pairwise = selected[:, :, None] * selected[:, None, :]
    X[:, quad_start:cube_start] = pairwise.reshape((n_rows, n_poly * n_poly))

    # Third-order terms: element-wise cubes (no cross terms).
    X[:, cube_start:width] = X[:, idx] ** 3

    return X

def preprocess(df_train, df_test):
    """Turn the raw train/test frames into model-ready feature matrices.

    Column means and standard deviations are computed over the concatenation
    of both frames, applied to each frame via ``normalize``, and the result
    is expanded with ``extract``. The shape of the training matrix is printed
    as a quick sanity check.

    NOTE(review): the statistics include the test rows -- confirm this is
    intended for the task at hand.

    Returns
    -------
    (X_train, X_test) : tuple of numpy.ndarray
    """
    combined = pd.concat((df_train, df_test))
    means, stds = combined.mean(), combined.std()

    X_train, X_test = (
        extract(normalize(part, means, stds))
        for part in (df_train, df_test)
    )

    print('n,d', X_train.shape)

    return X_train, X_test

# Build the train/test feature matrices once and keep them on the shared namespace.
G.X_train, G.X_test = preprocess(G.df_train_full, G.df_test_full)
# Debug aid (disabled): dump the training matrix to CSV for manual inspection.
# np.savetxt('a.csv',G.X_train,fmt='%.2f',delimiter=',')

n,d (32561, 137)


In [5]:
# Exhaustive search over 4 learning rates x 4 L2 penalties (16 candidates),
# each scored by accuracy with 5-fold cross-validation on the training set.
clf = sklearn.ensemble.HistGradientBoostingClassifier()
grid = sklearn.model_selection.GridSearchCV(
    estimator=clf,
    param_grid={
        'learning_rate': [0.001, 0.01, 0.1, 1],
        # single-value entry: pins the seed for every candidate
        'random_state': [0],
        'l2_regularization': [0.001, 0.01, 0.1, 1],
    },
    cv=5,
    scoring='accuracy',
    # refit the best candidate on all training data so `grid` can predict
    refit=True,
    return_train_score=True,
    n_jobs=4,
)
grid.fit(G.X_train, G.y_train)
print(grid.cv_results_)

{'mean_fit_time': array([3.99, 4.03, 3.28, 0.91, 4.42, 4.13, 2.85, 0.72, 3.87, 3.99, 2.99,
       0.78, 3.88, 3.96, 3.15, 0.83]), 'std_fit_time': array([0.07, 0.09, 0.34, 0.09, 0.16, 0.16, 0.25, 0.02, 0.07, 0.05, 0.24,
       0.03, 0.03, 0.05, 0.3 , 0.08]), 'mean_score_time': array([0.07, 0.08, 0.07, 0.01, 0.08, 0.08, 0.06, 0.01, 0.07, 0.07, 0.07,
       0.01, 0.08, 0.07, 0.07, 0.02]), 'std_score_time': array([0.  , 0.01, 0.01, 0.  , 0.01, 0.01, 0.01, 0.  , 0.  , 0.01, 0.01,
       0.  , 0.02, 0.  , 0.  , 0.02]), 'param_l2_regularization': masked_array(data=[0.001, 0.001, 0.001, 0.001, 0.01, 0.01, 0.01, 0.01,
                   0.1, 0.1, 0.1, 0.1, 1, 1, 1, 1],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=[0.001, 0.01, 0.1, 1, 0.001, 0.01, 0.1, 1, 0.001, 0.01,
                   0.1, 1, 0.001,

In [6]:
# Predict test labels with the fitted search object and write the submission
# file: a 1-based row id next to the predicted label.
G.y_test = grid.predict(G.X_test)
df_pred = pd.DataFrame(
    {
        'id': np.arange(len(G.X_test)) + 1,
        'label': G.y_test,
    }
)
df_pred.to_csv('submission.csv', index=False)
# Show only the first 100 predictions as a quick sanity check.
print(df_pred['label'].head(100).values)

[0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0]


In [8]:
# Persisting the fitted search object is currently disabled. The recorded cell
# output (['boost/2.pkl']) suggests it was run once before being commented out,
# presumably to avoid overwriting the saved model on re-runs.
# joblib.dump(grid, 'boost/2.pkl')

['boost/2.pkl']