In [None]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from pandas.api.types import is_numeric_dtype

In [None]:
%%time
train = pd.read_feather('../input/amexfeather/train_data.ftr')
train_df = train.groupby('customer_ID').tail(1).set_index('customer_ID')

### Creating Aggregated features

In [None]:
# Numeric columns to aggregate per customer. The label is excluded: it is
# already present on the last-row frame, and a second copy would collide with it.
num_features = [col for col in train.columns
                if col != 'target' and is_numeric_dtype(train[col])]
agg_feature_names = [f'{feat}_mean' for feat in num_features]

# Per-customer mean of every numeric feature (indexed by customer_ID).
num_feats_agg_mean = train.groupby('customer_ID')[num_features].agg('mean')

# Give the aggregates their own names. Merging them under the raw column names
# makes pandas suffix every shared column with _x/_y (including `target`),
# which breaks the prefix renaming and the `train_df['target']` lookup later on.
num_feats_agg_mean.columns = agg_feature_names

# Both frames are indexed by customer_ID. `join` raises on any remaining name
# overlap instead of silently suffixing, so a clash cannot go unnoticed.
train_df = train_df.join(num_feats_agg_mean, how='inner')

In [None]:
# Sanity check: (rows, columns) of the per-customer frame after adding the aggregates.
train_df.shape

### Renaming Features

In [None]:
# Single-letter feature prefix -> readable family name.
var_naming_dict = {'D':'Delinquency','S':'Spend','P':'Payment','B':'Balance','R':'Risk'}
def rename_columns(data):
    """Expand single-letter feature prefixes into readable names, in place.

    A column is renamed only when it is a string of the form
    ``<letter>_<rest>`` and ``<letter>`` is a key of ``var_naming_dict``
    (``D_39`` -> ``Delinquency_39``, ``P_2_mean`` -> ``Payment_2_mean``).
    Every other column -- ``customer_ID``, ``target``, non-string labels,
    names with an unknown prefix, or names that were already expanded -- is
    left untouched. Calling the function again on its own output therefore
    changes nothing, which keeps the calling cell safe to re-run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are renamed. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    rename_dict = {}
    for col in data.columns:
        if not isinstance(col, str) or col in ('customer_ID', 'target'):
            continue
        # Split on the first underscore only: 'P_2_mean' -> ('P', '_', '2_mean').
        prefix, sep, rest = col.partition('_')
        if sep and prefix in var_naming_dict:
            rename_dict[col] = f'{var_naming_dict[prefix]}_{rest}'
    data.rename(columns=rename_dict, inplace=True)
    return data

In [None]:
# rename_columns edits the frame it is given and returns that same object.
train_df = rename_columns(train_df)

### Dropping features with high missing rate

In [None]:
%%time
missing_vals = train_df.isnull().sum().reset_index()
missing_vals.columns = ['Column','Count']
missing_vals['Missing %'] = np.round(missing_vals['Count'] / train_df.shape[0] * 100,2)
missing_vals.sort_values(by='Missing %', ascending=False)

In [None]:
# Names of the columns whose missing rate is strictly above 95 %.
high_miss_cols = missing_vals.loc[missing_vals['Missing %'] > 95, 'Column'].tolist()

In [None]:
# Remove the mostly-empty columns from the training frame.
train_df = train_df.drop(columns=high_miss_cols)

In [None]:
# Record which columns were removed for exceeding the missing-rate threshold.
print(f"Dropped columns: {high_miss_cols}")

### Converting object columns to category dtype for LightGBM

In [None]:
def object_to_cat(data):
    """Cast every ``object``-dtype column of ``data`` to ``category``, in place.

    Columns of any other dtype are left as they are. The frame passed in is
    modified directly and the same object is returned.
    """
    object_cols = [name for name in data.columns
                   if str(data[name].dtype) == 'object']
    for name in object_cols:
        data[name] = data[name].astype('category')
    return data

In [None]:
%%time
train_df = object_to_cat(train_df)
train_df = train_df.drop('Spend_2', axis=1)

In [None]:
# Separate the feature columns (X) from the label column (y).
X = train_df.drop(columns=['target'])
y = train_df['target']

## Loading and transforming test

In [None]:
# Raw test table (every row of every customer).
test = pd.read_feather('../input/amexfeather/test_data.ftr')

# Last row of each customer, indexed and sorted by customer_ID.
test_df = test.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()

# The training frame is extended with the per-customer mean of every numeric
# column, so the same aggregates (with the `_mean` suffix declared in
# `agg_feature_names`) have to exist here; otherwise the fitted model is asked
# to score a frame with a different set of features.
test_num_features = [col for col in test.columns if is_numeric_dtype(test[col])]
test_agg_mean = test.groupby('customer_ID')[test_num_features].agg('mean').add_suffix('_mean')
test_df = test_df.join(test_agg_mean)

# The raw table and the intermediate aggregate are no longer needed.
del test, test_agg_mean
_ = gc.collect()

# Same post-processing steps as the training frame.
test_df = rename_columns(test_df)
test_df = test_df.drop(high_miss_cols, axis=1)
test_df = object_to_cat(test_df)
test_df = test_df.drop('Spend_2', axis=1)

### K-Fold Model Training

In [None]:
# One array of test-set probabilities per fold; averaged after the loop cell.
gbm_test_preds = []
sk_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The hyper-parameters are the same for every fold, so build them once.
params = {'boosting_type': 'gbdt',
          'n_estimators': 2000,
          'num_leaves': 80,
          'learning_rate': 0.05,
          'feature_fraction': 1,
          'bagging_fraction': 1,
          'max_depth':7,
          'is_unbalance':True,
          'objective': 'binary',
          'random_state': 42}

for fold, (train_idx, val_idx) in enumerate(sk_fold.split(X, y)):

    print("\nFold {}".format(fold+1))
    # split() yields positional row numbers, while y is labelled by
    # customer_ID. Plain y[...] with integers only works through pandas'
    # deprecated positional fallback, so slice both X and y with .iloc.
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]
    print("Train shape: {}, {}, Valid shape: {}, {}\n".format(
        X_train.shape, y_train.shape, X_val.shape, y_val.shape))

    # Both the training and the validation split are evaluated each round;
    # early_stopping is given 50 rounds and log_evaluation a period of 500.
    gbm = LGBMClassifier(**params).fit(X_train, y_train,
                                       eval_set=[(X_train, y_train), (X_val, y_val)],
                                       callbacks=[early_stopping(50), log_evaluation(500)],
                                       eval_metric=['auc','binary_logloss'])
    # Probability of the positive class for every test row, from this fold's model.
    gbm_test_preds.append(gbm.predict_proba(test_df)[:,1])

    # Release the fold's slices before the next iteration.
    del X_train, y_train, X_val, y_val
    _ = gc.collect()

# The training matrices are not used after this point.
del X,y
gc.collect()

In [None]:
sub = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

# Average the per-fold probabilities and key them by the test frame's
# customer_ID index, then fill each submission row by looking its ID up.
# Assigning the raw array instead would pair values with rows purely by
# position and silently depend on both files sharing the same row order.
# NOTE(review): assumes the sample submission has a `customer_ID` column -- confirm.
mean_preds = pd.Series(np.mean(gbm_test_preds, axis=0), index=test_df.index)
sub['prediction'] = sub['customer_ID'].map(mean_preds)

# An ID that is absent from the test frame would come back as NaN; stop here
# rather than write an incomplete submission.
if sub['prediction'].isna().any():
    raise ValueError("sample_submission contains customer_IDs with no prediction")

sub.to_csv('submission.csv', index=False)

In [None]:
# Show the submission frame as the cell output for a final visual check.
sub