In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline
import gc
from sklearn.linear_model import Lasso

# Directory that holds the input CSV files; the read_csv calls in this notebook
# build their paths relative to it.
input_path = Path('/kaggle/input/amex-default-prediction/')

In [None]:
def amex_metric(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose target equals 0 carry weight 20 and every other row weight 1
    in both components.

    Args:
        y_true: Binary labels (1 = positive). When it is a Series it is
            aligned with ``y_pred`` on the index; its ``name`` is irrelevant.
        y_pred: Predicted scores, higher meaning "more likely positive".

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``d`` is the
        fraction of positives found among the highest-scored rows whose
        cumulative weight stays within 4% of the total weight.
    """

    def row_weights(df) -> np.ndarray:
        # Vectorised replacement for a row-wise apply: 20 where target == 0, else 1.
        return np.where(df['target'] == 0, 20, 1)

    def top_four_percent_captured(df) -> float:
        # `df` must already be sorted by descending prediction.
        df['weight'] = row_weights(df)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(df) -> float:
        # `df` must already be sorted by descending prediction.
        df['weight'] = row_weights(df)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true, df) -> float:
        # Name the columns explicitly. Concatenating the Series would reuse
        # y_true.name as the label column, so any Series not called 'target'
        # made weighted_gini fail with KeyError: 'target'.
        true_df = pd.DataFrame({'target': y_true, 'prediction': y_true})
        true_df = true_df.sort_values('prediction', ascending=False)
        return weighted_gini(df) / weighted_gini(true_df)

    df = pd.DataFrame({'target': y_true, 'prediction': y_pred}).sort_values('prediction', ascending=False)
    # Each helper adds scratch columns, so hand each one its own copy.
    g = normalized_weighted_gini(y_true, df.copy())
    d = top_four_percent_captured(df.copy())

    return 0.5 * (g + d)

In [None]:
# Read at most the first 1,000,000 rows of the feature table and of the label
# table, each indexed by customer_ID.
train_data = pd.read_csv(input_path / 'train_data.csv', index_col='customer_ID', nrows=1_000_000)

train_labels = pd.read_csv(
    input_path / 'train_labels.csv',
    index_col='customer_ID',
    nrows=1_000_000,
)

In [None]:
# Preview both tables; between them print a row of 50 asterisks surrounded by
# blank lines (one print call that emits the same characters as three).
display(train_data.head())
print('\n', '*'*50, '\n', sep='\n')
display(train_labels.head())

In [None]:
# Keep only the labels whose customer_ID occurs in the index of the loaded feature rows.
train_labels = train_labels.loc[train_labels.index.isin(train_data.index)]

In [None]:
# One row per customer: the last row of each customer_ID group, in file order.
# NOTE(review): presumably rows are chronological within a customer, making
# this the most recent statement; confirm against the data.
df_last_month = train_data.groupby(by='customer_ID').tail(n=1)

In [None]:
# Announce and preview the first five per-customer last rows.
print('Last Month Data of each customer')
display(df_last_month.head(n=5))

In [None]:
# Attach the label to each customer's last row. validate='one_to_one' makes the
# merge raise if customer_ID is duplicated on either side.
final_data = df_last_month.merge(
    train_labels,
    how='inner',
    on='customer_ID',
    validate='one_to_one',
)
# Message typo fixed ("mergingx" -> "merging").
print('Final Data after merging')
display(final_data.head())

In [None]:
# Ten columns with the largest absolute correlation to the target (the target
# itself ranks first with 1.0).
# Restrict to numeric/bool columns explicitly: from pandas 2.0 on, corr()
# no longer drops non-numeric columns by default and raises on them instead.
# NOTE(review): whether final_data holds non-numeric columns is not visible
# here; the selection is a no-op if it does not.
display(
    final_data
    .select_dtypes(include=['number', 'bool'])
    .corr()['target']
    .abs()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Density of P_2 for each target class on shared axes. The curves are labelled
# and a legend added so the two classes can be told apart.
fig, ax = plt.subplots()
sns.kdeplot(final_data.loc[final_data['target'].eq(0), 'P_2'], ax=ax, label='target = 0')
sns.kdeplot(final_data.loc[final_data['target'].eq(1), 'P_2'], ax=ax, label='target = 1')
ax.set(title='P_2 density by target', xlabel='P_2')
ax.legend();

In [None]:
# Density of D_48 for each target class on shared axes. The curves are labelled
# and a legend added so the two classes can be told apart.
fig, ax = plt.subplots()
sns.kdeplot(final_data.loc[final_data['target'].eq(0), 'D_48'], ax=ax, label='target = 0')
sns.kdeplot(final_data.loc[final_data['target'].eq(1), 'D_48'], ax=ax, label='target = 1')
ax.set(title='D_48 density by target', xlabel='D_48')
ax.legend();

In [None]:
# Re-read the tables without a row limit: the feature table restricted to the
# chosen columns plus the customer key, the label table indexed by customer_ID.
use_cols = [
    'P_2', 'D_48', 'B_2', 'D_61', 'B_18', 'D_55', 'B_9', 'D_44', 'B_33',
    'customer_ID',
]
train_data = pd.read_csv(input_path.joinpath('train_data.csv'), usecols=use_cols)
train_labels = pd.read_csv(input_path.joinpath('train_labels.csv'), index_col='customer_ID')

In [None]:
# Show the reloaded feature frame as this cell's output.
train_data

In [None]:
# Take the last row of every customer_ID group and replace missing values with
# the sentinel -999.
last_month_df = (
    train_data
    .groupby('customer_ID')
    .tail(1)
    .fillna(-999)
)

In [None]:
# Join each customer's last row with its label. validate='one_to_one' mirrors
# the guard on the earlier sample merge: the join raises if customer_ID is
# duplicated on either side instead of silently multiplying rows.
last_month_train_df = last_month_df.merge(
    train_labels,
    how='inner',
    on='customer_ID',
    validate='one_to_one',
)

In [None]:
# Load the same columns from the test file and keep each customer's last row.
test_data = pd.read_csv(input_path / 'test_data.csv', usecols=use_cols)
last_month_test_df = test_data.groupby('customer_ID').tail(1)
# Model features are the loaded columns minus the customer key. Deriving the
# list from use_cols (same names, same order as the former hand-written copy)
# keeps the two from drifting apart.
traincols = [col for col in use_cols if col != 'customer_ID']

In [None]:
import xgboost

# Fit a gradient-boosted classifier on the selected feature columns of the
# labelled last-row frame; the labels are passed as a flat 1-D array.
xgb_model = xgboost.XGBClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.15,
    subsample=0.5,
)
xgb_model.fit(
    last_month_train_df[traincols],
    last_month_train_df['target'].to_numpy(),
)

In [None]:
# Score the test rows: fill missing feature values with -999 (the same sentinel
# applied to the training rows) and keep column 1 of the probability matrix.
y_pred_rfc = xgb_model.predict_proba(
    last_month_test_df.loc[:, traincols].fillna(-999)
)[:, 1]

In [None]:
# last_month_test_df comes from groupby(...).tail(1), i.e. a selection from
# test_data. Setting a column on it in place can raise SettingWithCopyWarning,
# so build a new frame with .assign and rebind the name instead.
last_month_test_df = last_month_test_df.assign(prediction=y_pred_rfc)
# Sanity check: average predicted probability over the test customers.
last_month_test_df['prediction'].mean()

In [None]:
# Write the submission file: customer key and prediction only, no index column.
last_month_test_df.to_csv(
    'submission.csv',
    columns=['customer_ID', 'prediction'],
    index=False,
)