In [1]:
import gc
import datetime
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgbm
from lightgbm import early_stopping

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below (pandas, scikit-learn, lightgbm) - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Number of cross-validation folds; passed to StratifiedKFold as n_splits.
NUM_FOLDS = 5

In [3]:
# Columns kept by features_process(); the order is significant because the
# models are later trained on positional (ndarray) inputs.
FEATURES = [
    # raw columns taken from the input frame
    'P_2', 'B_1', 'B_2', 'R_1', 'S_3',
    'B_3', 'D_44', 'B_4', 'D_45', 'B_5',
    'D_47', 'D_48', 'B_6', 'B_7', 'B_8',
    'B_9', 'D_52', 'P_3', 'B_11', 'S_7',
    'D_55', 'D_61', 'B_18', 'B_23', 'D_75',
    'B_33', 'S_23', 'S_25', 'B_37', 'R_27',
    'B_40',
    # hand-crafted "c_*" columns created inside features_process()
    'c_PD_239', 'c_PB_29', 'c_PR_21', 'c_BBBB2',
    'c_RRR0', 'c_PD_348', 'c_PB_49', 'c_PR_41',
]

In [4]:
def features_process(df, label=None, test=False):
    """Collapse the input to one row per customer and build the model features.

    Steps, in order:

    1. keep the last row of every ``customer_ID`` group and index by it;
    2. (train only) drop columns with fewer than 80% non-null values;
    3. add the hand-crafted ``c_*`` columns;
    4. left-merge ``label`` on ``customer_ID`` when it is given;
    5. keep only the module-level ``FEATURES`` columns (plus ``'target'``
       when ``test`` is False);
    6. append the degree-2 polynomial expansion of the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID`` and every raw column referenced below.
        NOTE(review): ``tail(1)`` takes the last row per customer in the
        frame's current order - presumably the most recent statement;
        confirm the input is sorted accordingly.
    label : pandas.DataFrame, optional
        Frame merged on ``customer_ID``. In train mode the result must
        contain a ``'target'`` column, otherwise the column selection raises
        ``KeyError``.
    test : bool, default False
        When True the sparse-column drop is skipped and ``'target'`` is not
        selected.

    Returns
    -------
    pandas.DataFrame
        Selected columns followed by polynomial columns named ``'x0'``,
        ``'x0^2'``, ``'x0 x1'``, ... (positional names).
    """

    def feature_select(df):
        """Pick features with a random-forest importance filter (currently unused).

        Rebinds the module-level ``FEATURES`` list to the selected columns.
        """
        # The assignment below happens in THIS scope, so the declaration must
        # live here; a ``global`` in the enclosing function does not apply.
        global FEATURES
        X = df.drop(['customer_ID', 'S_2', 'target'], axis=1).fillna(-127)
        y = df['target']
        select = SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1, verbose=1),
            threshold='1.2*mean',
        )
        select.fit(X, y)
        FEATURES = list(X.columns[select.get_support()])

        print(FEATURES)
        return df[FEATURES + ['target']]

    def polynomial_feature(df):
        """Return ``df`` with degree-2 polynomial columns appended."""
        categorical_features = [
            'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
            'D_126', 'D_63', 'D_64', 'D_66', 'D_68'
        ]
        excluded = {'customer_ID', 'S_2', 'target', *categorical_features}
        num_features = [col for col in df.columns if col not in excluded]

        poly = PolynomialFeatures(degree=2, include_bias=False)
        # Missing values are replaced by the sentinel -127 before expansion.
        # Fitting on a bare ndarray keeps the generated names positional
        # ('x0', 'x0 x1', ...), so they cannot collide with the real column
        # names already present in ``df``.
        expanded = poly.fit_transform(df[num_features].fillna(-127).to_numpy())

        # ``get_feature_names`` was removed in scikit-learn 1.2; fall back to
        # it only on versions that predate ``get_feature_names_out``.
        if hasattr(poly, 'get_feature_names_out'):
            poly_names = list(poly.get_feature_names_out())
        else:
            poly_names = list(poly.get_feature_names())

        # One concat instead of hundreds of single-column inserts into a slice.
        poly_frame = pd.DataFrame(expanded, columns=poly_names, index=df.index)
        return pd.concat([df, poly_frame], axis=1)

    print('groupby customer_ID tail(1)')
    df = df.groupby('customer_ID').tail(1).set_index('customer_ID')
    print('shape:', df.shape)

    if not test:
        print('dropna nan >= 80%')
        # ``thresh`` is the minimum count of non-null values a column needs
        # in order to be kept.
        df = df.dropna(axis=1, thresh=int(0.8 * len(df)))
        print('shape:', df.shape)

    print('add features')
    # Ratio / sum combinations of raw columns. The small constants are
    # presumably there to avoid division by exactly zero - TODO confirm.
    df["c_PD_239"] = df["D_39"] / (df["P_2"] * (-1) + 0.0001)
    df["c_PB_29"] = df["P_2"] * (-1) / (df["B_9"] * (1) + 0.0001)
    df["c_PR_21"] = df["P_2"] * (-1) / (df["R_1"] + 0.0001)

    df["c_BBBB"] = (df["B_9"] + 0.001) / (df["B_23"] + df["B_3"] + 0.0001)
    df["c_BBBB1"] = (df["B_33"] * (-1)) + (df["B_18"] * (-1) + df["S_25"] * (1) + 0.0001)
    df["c_BBBB2"] = (df["B_19"] + df["B_20"] + df["B_4"] + 0.0001)

    df["c_RRR0"] = (df["R_3"] + 0.001) / (df["R_2"] + df["R_4"] + 0.0001)
    df["c_RRR1"] = (df["D_62"] + 0.001) / (df["D_112"] + df["R_27"] + 0.0001)

    df["c_PD_348"] = df["D_48"] / (df["P_3"] + 0.0001)
    df["c_PD_355"] = df["D_55"] / (df["P_3"] + 0.0001)

    df["c_PD_439"] = df["D_39"] / (df["P_4"] + 0.0001)
    df["c_PB_49"] = df["B_9"] / (df["P_4"] + 0.0001)
    df["c_PR_41"] = df["R_1"] / (df["P_4"] + 0.0001)
    print('shape:', df.shape)

    if label is not None:
        print('merge with label')
        df = df.merge(label, how='left', on='customer_ID')
        print('shape:', df.shape)

    print('feature select')
    if not test:
        # df = feature_select(df)
        df = df[FEATURES + ['target']]
    else:
        df = df[FEATURES]
    print('shape:', df.shape)

    print('polynomial features')
    df = polynomial_feature(df)
    print('shape:', df.shape)

    return df

In [5]:
%%time

# Load the raw training rows and the label table, then build the training frame.
df_train = pd.read_parquet('./data/train.parquet')
df_train_label = pd.read_csv('./data/train_labels.csv')
# Rebinding df_train drops this cell's reference to the raw frame.
df_train = features_process(df_train, label=df_train_label)

# The label table is no longer needed once merged; free it before training.
del df_train_label
gc.collect()

groupby customer_ID tail(1)
shape: (458913, 189)
dropna nan >= 80%
shape: (458913, 168)
add features
shape: (458913, 181)
merge with label
shape: (458913, 183)
feature select
shape: (458913, 40)
polynomial features
shape: (458913, 859)
Wall time: 1min 58s


0

In [6]:
# Display the processed training frame (truncated repr).
df_train

Unnamed: 0,P_2,B_1,B_2,R_1,S_3,B_3,D_44,B_4,D_45,B_5,...,x35^2,x35 x36,x35 x37,x35 x38,x36^2,x36 x37,x36 x38,x37^2,x37 x38,x38^2
0,0.934745,0.009382,1.007647,0.006104,0.135021,0.007174,0,5,0.740102,0.231717,...,1.000000e+02,3.056054e+00,9.534851e+02,6.103754e+02,0.093395,29.139024,18.653404,9.091339e+03,5819.838580,3725.581342
1,0.880519,0.034684,1.004028,0.006911,0.165509,0.005068,0,1,0.266275,0.027000,...,1.002001e+08,2.576277e+02,1.293924e+06,6.918065e+05,0.000662,3.326849,1.778726,1.670895e+04,8933.571582,4776.405203
2,0.880875,0.004284,0.812649,0.006450,,0.007196,0,2,0.251598,0.001557,...,1.000000e+02,1.277662e+00,9.392085e+02,6.449954e+02,0.016324,11.999909,8.240861,8.821125e+03,6057.851511,4160.190949
3,0.621776,0.012564,1.006183,0.007829,0.287766,0.009937,0,0,0.085103,0.118818,...,4.004001e+08,-2.541270e+06,4.107289e+06,1.566539e+06,16129.000000,-26068.253784,-9942.549126,4.213242e+04,16069.495559,6128.977811
4,0.871900,0.007679,0.815746,0.001247,,0.005528,0,21,0.069952,0.004855,...,1.000000e+02,5.698698e+00,4.027262e+02,1.247313e+02,0.324752,22.950149,7.108059,1.621884e+03,502.325571,155.578930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458908,0.844229,0.028515,1.009866,0.001928,0.128707,0.005893,0,4,0.073120,0.040532,...,4.901400e+09,7.728978e+03,1.191870e+08,1.349820e+06,0.012188,187.944949,2.128520,2.898260e+06,32823.464700,371.733357
458909,0.831279,0.292360,0.055656,0.006953,,0.233078,1,19,0.618023,0.018681,...,1.002001e+08,3.048453e+03,1.433499e+07,6.960385e+05,0.092745,436.122880,21.176033,2.050817e+06,99577.827386,4835.021243
458910,0.800522,0.020563,1.007023,0.000957,0.066648,0.006314,0,6,0.133731,0.019537,...,1.000000e+02,1.856966e+00,4.645642e+03,9.573463e+01,0.034483,86.267975,1.777759,2.158199e+05,4447.488178,91.651202
458911,0.754129,0.015838,0.714486,0.000993,0.408849,0.050048,1,18,0.070383,0.020531,...,1.000000e+02,7.178788e+00,1.649133e+03,9.926709e+01,0.515350,118.387732,7.126174,2.719638e+04,1637.045909,98.539555


In [8]:
# Summarise dtypes and memory usage of the processed training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 0 to 458912
Columns: 859 entries, P_2 to x38^2
dtypes: float32(33), float64(821), int16(1), int64(1), int8(3)
memory usage: 2.9 GB


In [9]:
# Known categorical columns of the raw data set.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68'
]

# The indices are handed to LightGBM together with the feature matrix
# (df_train without 'target'), so positions must be computed on the feature
# columns only; counting 'target' would shift every later index by one.
feature_columns = [col for col in df_train.columns if col != 'target']

# Positions and names of the categorical columns that survived selection.
cat_col = [pos for pos, col in enumerate(feature_columns) if col in categorical_cols]
categorical_cols_ = [feature_columns[pos] for pos in cat_col]
print(cat_col)
print(categorical_cols_)

[]
[]


In [7]:
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793/notebook
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric: mean of normalised Gini and top-4% capture.

    Rows with label 0 carry weight 20, rows with label 1 carry weight 1.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 / 1).
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more
        likely positive.

    Returns
    -------
    tuple
        ``(score, None)``. The second element is a placeholder that keeps the
        two-element shape existing callers index with ``[0]``; the previous
        code returned the undefined name ``_`` there, which only resolves
        inside an interactive session.
    """
    # Stack into (n, 2): column 0 = label, column 1 = prediction.
    labels = np.transpose(np.array([y_true, y_pred]))

    # Top-4% capture: share of positives found within the highest-scored rows
    # whose cumulative weight stays within 4% of the total weight.
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:, 0] == 0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(labels[:, 0])

    # gini[1]: ordering by prediction; gini[0]: ordering by the true label
    # (the best achievable ordering), used as the normaliser.
    gini = [0, 0]
    for i in [1, 0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] * weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1] / gini[0] + top_four), None

In [10]:
%%time
# Model hyper-parameters; identical for every fold, so defined once.
params = {
    "num_iterations": 10000,
    'learning_rate': 0.05,
}
feature_importances = []  # per-fold feature importances
scores = []               # per-fold scores (declared here; nothing is appended in this cell)
models = []               # fitted model of each fold
oof_pred = []             # per-fold validation predictions
oof_true = []             # per-fold validation labels, same order as oof_pred

X = df_train.drop(['target'], axis=1)
y = df_train['target']

# Cross-validation splitter
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)

# Train one model per fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print('FOLD:', fold)

    # split() yields positional indices, so select by position (.iloc);
    # label-based .loc only works while the index happens to be 0..n-1.
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values
    X_val = X.iloc[val_idx].values
    y_val = y.iloc[val_idx].values

    print("y_train t=0 count:", len(y_train[y_train == 0]))
    print("y_train t=1 count:", len(y_train[y_train == 1]))
    print("y_val t=0 count:", len(y_val[y_val == 0]))
    print("y_val t=1 count:", len(y_val[y_val == 1]))

    # LightGBM classifier with early stopping after 100 rounds.
    # NOTE(review): newer LightGBM releases replace the ``verbose`` fit
    # argument with the ``log_evaluation`` callback - confirm against the
    # installed version before upgrading.
    model = lgbm.LGBMClassifier(**params).fit(
        X_train, y_train,
        eval_set=[(X_val, y_val), (X_train, y_train)],
        verbose=100,
        callbacks=[early_stopping(100)],
        categorical_feature=cat_col
    )

    # Keep the per-fold artefacts
    feature_importances.append(model.feature_importances_)
    models.append(model)
    oof_pred.append(model.predict_proba(X_val)[:, 1])
    oof_true.append(y_val)

    del X_train, y_train, X_val, y_val, model
    gc.collect()

# Out-of-fold predictions and labels, concatenated in fold order.
pred_val = np.concatenate(oof_pred)
yval = np.concatenate(oof_true)

score = amex_metric_mod(yval, pred_val)[0]
print('score:', score)
with open('score_lightgbm_poly.txt', 'w') as f:
    f.write(str(score))

FOLD: 0
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.234048	valid_0's binary_logloss: 0.241011
[200]	training's binary_logloss: 0.224241	valid_0's binary_logloss: 0.237228
[300]	training's binary_logloss: 0.217793	valid_0's binary_logloss: 0.236783
[400]	training's binary_logloss: 0.212369	valid_0's binary_logloss: 0.236533
[500]	training's binary_logloss: 0.20724	valid_0's binary_logloss: 0.236447
[600]	training's binary_logloss: 0.202521	valid_0's binary_logloss: 0.23644
Early stopping, best iteration is:
[547]	training's binary_logloss: 0.204953	valid_0's binary_logloss: 0.236404
FOLD: 1
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.234124	valid_0's binary_logloss: 0.241183
[200]	training's 

In [11]:
# Release the training data and cross-validation intermediates; the fitted
# `models` and `feature_importances` lists are kept for the cells below.
del df_train, train_idx, val_idx, yval, pred_val, X, y
gc.collect()

20

In [None]:
# Each importance vector has one entry per model input column. The models were
# trained on the FEATURES columns followed by their degree-2 polynomial
# expansion, so indexing the frame by FEATURES alone raises a length mismatch.
n_inputs = len(feature_importances[0])
n_base = len(FEATURES)

# Polynomial column names as produced during feature processing:
# 'x0'..'x{n-1}', then 'xi^2' / 'xi xj' for every pair i <= j.
poly_names = [f'x{i}' for i in range(n_base)] + [
    f'x{i}^2' if i == j else f'x{i} x{j}'
    for i in range(n_base)
    for j in range(i, n_base)
]

if n_inputs == n_base:
    feature_names = list(FEATURES)
elif n_inputs == n_base + len(poly_names):
    feature_names = list(FEATURES) + poly_names
else:
    # Unknown layout: fall back to positional labels rather than failing.
    feature_names = [f'Column_{i}' for i in range(n_inputs)]

# One column per fold ("imp0", "imp1", ...), then the mean across folds.
df_feat_imp = pd.DataFrame(
    {f'imp{fold}': importance for fold, importance in enumerate(feature_importances)},
    index=feature_names,
)
df_feat_imp["mean_imp"] = df_feat_imp.mean(axis=1).values

df_feat_imp = df_feat_imp.sort_values(by="mean_imp", ascending=False)

df_feat_imp.to_csv("feat_imp_poly.csv")

fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(x=df_feat_imp.index, y=df_feat_imp["mean_imp"], ax=ax)
ax.set_xticks([])  # too many features for readable tick labels
print(df_feat_imp)

In [12]:
# Load the raw test rows.
df_test = pd.read_parquet('./data/test.parquet')

# test=True: no label merge, no sparse-column drop, no 'target' column.
df_test = features_process(df_test, test=True)
gc.collect()

groupby customer_ID tail(1)
shape: (924621, 189)
add features
shape: (924621, 202)
feature select
shape: (924621, 39)
polynomial features
shape: (924621, 858)


0

In [13]:
# Summarise dtypes and memory usage of the processed test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 924621 entries, 00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7 to fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61cceb803ea8ec37634d
Columns: 858 entries, P_2 to x38^2
dtypes: float32(33), float64(821), int16(1), int8(3)
memory usage: 5.8+ GB


In [14]:
print("prediction")
if not models:
    raise ValueError("no fitted models available for prediction")

# Average the positive-class probability over every fold model. The count
# comes from `models` itself instead of a hard-coded 5.
pred = np.zeros(len(df_test))
for fold, fold_model in enumerate(models):
    print('FOLD:', fold)
    pred += fold_model.predict_proba(df_test)[:, 1]

pred = pred / len(models)

prediction
FOLD: 0
FOLD: 1
FOLD: 2
FOLD: 3
FOLD: 4


In [15]:
# Fill the sample submission with the averaged predictions and save it.
# NOTE(review): the assignment is positional - it assumes the rows of
# sample_submission.csv are in the same customer order as df_test; verify.
subm = pd.read_csv('./data/sample_submission.csv')
subm["prediction"] = pred
subm.to_csv("submission_lightgbm_poly.csv", index=False)