In [294]:
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

In [295]:
df = pd.read_csv("train.csv", index_col="ID")
test_df = pd.read_csv("test.csv", index_col="ID")

In [296]:
df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25150,360000,2,2,2,25,-1,-1,-1,-1,-1,...,3435,8870,2020,12590,3479,3446,8870,2020,27043,0
13470,240000,2,1,2,46,2,2,-2,-1,0,...,2240,1681,2267,0,0,2240,0,2267,3074,0
3092,320000,2,2,1,41,0,0,0,0,0,...,35409,35036,35939,5000,2000,2000,3000,3000,1500,0
13973,50000,2,2,2,24,-1,-1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,1
10567,80000,1,3,1,52,-1,0,0,0,0,...,20561,21261,21457,1500,1500,1500,1028,876,1449,0


In [297]:
test_df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10178,60000,2,2,1,30,0,0,0,0,0,...,38117,38874,38667,39544,1628,1618,1375,1403,1520,1205
5304,200000,2,1,2,29,0,0,0,0,2,...,76593,93988,50292,10510,4160,14593,20150,0,5739,12050
5187,230000,1,1,2,39,0,0,0,0,-2,...,20650,0,0,0,2500,2000,0,0,0,0
14495,150000,1,1,2,40,0,0,0,0,0,...,145978,139791,142745,146872,5800,5245,5200,5200,6500,7200
20444,140000,1,2,2,27,0,0,0,0,0,...,76230,77954,78953,81848,10000,10000,3393,3260,5000,3000


In [298]:
original_df = df.copy()
original_test_df = test_df.copy()
df.head()
df.drop("default_payment_next_month", axis=1, inplace=True)

In [299]:
def data_normalize(df, test_df):
    scaling_cols = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
            'PAY_AMT5', 'PAY_AMT6']
    for col in scaling_cols:
        scaler = MinMaxScaler()
        df[col] = scaler.fit_transform(df[[col]])
        test_df[col] = scaler.fit_transform(test_df[[col]])
    return df, test_df

In [300]:
df, test_df = data_normalize(df, test_df)

In [301]:
def label_encode(df):
    onehotlabels = pd.get_dummies(df, prefix=None, prefix_sep='_', 
                       dummy_na=False, columns=["SEX","EDUCATION","MARRIAGE"], sparse=False, 
                       drop_first=False, dtype=None)
    return onehotlabels

In [302]:
df.set_index(original_df.index, append=False, inplace=True)

In [303]:
df = label_encode(df)
test_df = label_encode(test_df)

In [304]:
df.head()

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25150,0.443038,0.074074,0.1,0.1,0.1,0.1,0.1,0.1,0.046424,0.101219,...,0,1,0,0,0,0,0,0,1,0
13470,0.291139,0.462963,0.4,0.4,0.0,0.1,0.2,0.1,0.019498,0.085748,...,1,0,0,0,0,0,0,0,1,0
3092,0.392405,0.37037,0.2,0.2,0.2,0.2,0.2,0.2,0.084383,0.151875,...,0,1,0,0,0,0,0,1,0,0
13973,0.050633,0.055556,0.1,0.1,0.0,0.0,0.0,0.0,0.022413,0.085748,...,0,1,0,0,0,0,0,0,1,0
10567,0.088608,0.574074,0.1,0.2,0.2,0.2,0.2,0.2,0.06601,0.108508,...,0,0,1,0,0,0,0,1,0,0


In [305]:
test_df.head()

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10178,0.050505,0.155172,0.2,0.222222,0.2,0.222222,0.222222,0.222222,0.178448,0.06928,...,0,1,0,0,0,0,0,1,0,0
5304,0.191919,0.137931,0.2,0.222222,0.2,0.222222,0.444444,0.222222,0.200585,0.095854,...,1,0,0,0,0,0,0,0,1,0
5187,0.222222,0.310345,0.2,0.222222,0.2,0.222222,0.0,0.0,0.180494,0.072048,...,1,0,0,0,0,0,0,0,1,0
14495,0.141414,0.327586,0.2,0.222222,0.2,0.222222,0.222222,0.222222,0.281176,0.179127,...,1,0,0,0,0,0,0,0,1,0
20444,0.131313,0.103448,0.2,0.222222,0.2,0.222222,0.222222,0.222222,0.217496,0.099379,...,0,1,0,0,0,0,0,0,1,0


In [306]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX_1,SEX_2,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
LIMIT_BAL,1.0,0.143206,-0.272059,-0.297323,-0.290059,-0.269652,-0.253399,-0.235601,0.290539,0.283654,0.288947,0.297446,0.297377,0.291518,0.194995,0.176589,0.206638,0.199472,0.221325,0.215073,-0.029683,0.029683,0.004703,0.256705,-0.145455,-0.139861,0.026093,0.002521,-0.0051,-0.011155,0.10592,-0.093295,-0.056485
AGE,0.143206,1.0,-0.043313,-0.05509,-0.058891,-0.057324,-0.061018,-0.050668,0.054186,0.053819,0.053439,0.049968,0.048377,0.047465,0.025439,0.022501,0.025842,0.026427,0.024296,0.016846,0.092907,-0.092907,0.007441,-0.098169,-0.081018,0.233977,-0.008655,-0.005982,0.035676,0.00955,0.448011,-0.464014,0.078864
PAY_0,-0.272059,-0.043313,1.0,0.669158,0.573421,0.540895,0.506958,0.472545,0.18312,0.184325,0.173635,0.175132,0.176809,0.172138,-0.08534,-0.070403,-0.079509,-0.06583,-0.061725,-0.065042,0.061232,-0.061232,-0.009499,-0.147169,0.106791,0.053991,-0.026075,-0.007753,-0.009089,-0.01701,-0.022476,0.018367,0.026898
PAY_2,-0.297323,-0.05509,0.669158,1.0,0.768036,0.662055,0.621775,0.573462,0.230246,0.229404,0.21719,0.218108,0.217404,0.215069,-0.083156,-0.060032,-0.064262,-0.051836,-0.04199,-0.042252,0.072602,-0.072602,-0.01953,-0.169327,0.125203,0.06217,-0.037024,-0.015357,-0.008569,-0.009044,-0.028271,0.024839,0.020278
PAY_3,-0.290059,-0.058891,0.573421,0.768036,1.0,0.780378,0.692843,0.635996,0.205603,0.233152,0.222866,0.224166,0.222599,0.220174,-0.004458,-0.063249,-0.058944,-0.050562,-0.034179,-0.03905,0.064406,-0.064406,-0.015208,-0.15932,0.119126,0.057203,-0.033782,-0.017192,-0.009284,-0.010627,-0.038916,0.03438,0.026243
PAY_4,-0.269652,-0.057324,0.540895,0.662055,0.780378,1.0,0.826037,0.720236,0.199066,0.2226,0.240683,0.242503,0.240128,0.236392,-0.01407,-0.001616,-0.073125,-0.04696,-0.032112,-0.031699,0.061242,-0.061242,-0.020144,-0.14995,0.113754,0.051916,-0.035481,-0.013954,-0.007661,-0.009963,-0.037515,0.03324,0.024707
PAY_5,-0.253399,-0.061018,0.506958,0.621775,0.692843,0.826037,1.0,0.817923,0.198019,0.219242,0.235586,0.267266,0.265722,0.258212,-0.009026,-0.0032,0.005019,-0.060878,-0.032454,-0.027342,0.05592,-0.05592,-0.017957,-0.138446,0.105579,0.047501,-0.032078,-0.013234,-0.010509,-0.008679,-0.041806,0.038839,0.01772
PAY_6,-0.235601,-0.050668,0.472545,0.573462,0.635996,0.720236,0.817923,1.0,0.202287,0.222233,0.235882,0.262535,0.287272,0.281304,-0.002155,-0.004127,0.003651,0.019439,-0.045257,-0.028347,0.045045,-0.045045,-0.02296,-0.12559,0.101276,0.03885,-0.027656,-0.021402,-0.01459,-0.005606,-0.039281,0.034764,0.024105
BILL_AMT1,0.290539,0.054186,0.18312,0.230246,0.205603,0.199066,0.198019,0.202287,1.0,0.951243,0.889651,0.85647,0.826714,0.798465,0.138656,0.097091,0.131645,0.153106,0.166235,0.1847,0.031595,-0.031595,-0.012471,-0.030248,0.037641,-0.024931,0.00051,0.046123,0.017282,-0.01555,0.027516,-0.023764,-0.011843
BILL_AMT2,0.283654,0.053819,0.184325,0.229404,0.233152,0.2226,0.219242,0.222233,0.951243,1.0,0.926553,0.892248,0.860917,0.829886,0.277851,0.106219,0.131176,0.147061,0.160733,0.179634,0.030287,-0.030287,-0.012772,-0.026876,0.036513,-0.025367,-0.006064,0.041022,0.018089,-0.012944,0.024625,-0.020909,-0.012755


In [307]:
X_train = df

In [308]:
X_train

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25150,0.443038,0.074074,0.1,0.1,0.1,0.1,0.1,0.1,0.046424,0.101219,...,0,1,0,0,0,0,0,0,1,0
13470,0.291139,0.462963,0.4,0.4,0.0,0.1,0.2,0.1,0.019498,0.085748,...,1,0,0,0,0,0,0,0,1,0
3092,0.392405,0.370370,0.2,0.2,0.2,0.2,0.2,0.2,0.084383,0.151875,...,0,1,0,0,0,0,0,1,0,0
13973,0.050633,0.055556,0.1,0.1,0.0,0.0,0.0,0.0,0.022413,0.085748,...,0,1,0,0,0,0,0,0,1,0
10567,0.088608,0.574074,0.1,0.2,0.2,0.2,0.2,0.2,0.066010,0.108508,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283,0.050633,0.185185,0.3,0.0,0.0,0.0,0.0,0.0,0.018899,0.085748,...,0,1,0,0,0,0,0,0,1,0
25589,0.088608,0.296296,0.2,0.2,0.2,0.2,0.2,0.2,0.110553,0.175698,...,0,1,0,0,0,0,0,1,0,0
4378,0.113924,0.574074,0.3,0.4,0.4,0.4,0.2,0.2,0.047156,0.111403,...,1,0,0,0,0,0,0,1,0,0
8604,0.101266,0.481481,0.2,0.2,0.2,0.2,0.2,0.2,0.056310,0.121039,...,0,1,0,0,0,0,0,1,0,0


In [309]:
y_train = original_df["default_payment_next_month"]

In [310]:
y_train

ID
25150    0
13470    0
3092     0
13973    1
10567    0
        ..
6283     0
25589    0
4378     1
8604     0
18049    1
Name: default_payment_next_month, Length: 21000, dtype: int64

In [None]:
random_grid = {
        'booster' : ['gbtree','dart'],
        'n_estimators' : [300, 350, 500, 650, 800],
        'max_depth': [3, 4, 5, 6, 8, 10],
        'learning_rate' : [0.005, 0.01, 0.02, 0.03, 0.05, 0.07],
        'tree_method' : ['auto', 'exact', 'approx']
        }

In [None]:
xgb = XGBClassifier()

xgb_grid = GridSearchCV(estimator = xgb, param_grid = random_grid,
                                n_jobs=-1)
# Fit the random search model
xgb_grid.fit(X_train, y_train)

In [None]:
learning_rate = xgb_grid.best_params_["learning_rate"]
n_estimators = xgb_grid.best_params_["n_estimators"]
max_depth = xgb_grid.best_params_["max_depth"]
tree_method = xgb_grid.best_params_["tree_method"]
booster = xgb_grid.best_params_["booster"]

In [311]:
# learning_rate = 0.03
# n_estimators = 1000
# max_depth = 5
#instantiate model and train
model = XGBClassifier(learning_rate = learning_rate, n_estimators=n_estimators, max_depth=max_depth, booster = booster,
                      tree_method = tree_method, n_jobs=-1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [312]:
predictions = model.predict(test_df)

In [313]:
output_df = pd.DataFrame({'ID': test_df.index, 'default_payment_next_month': predictions})

In [314]:
output_df.to_csv("solution.csv", index=False)