In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline
import gc
from sklearn.linear_model import Lasso

# Directory that the competition CSV files (train data, labels, test data) are read from below.
input_path = Path('/kaggle/input/amex-default-prediction/')

In [2]:
def amex_metric(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows with ``target == 0`` get weight 20 and all other rows weight 1.

    Parameters
    ----------
    y_true : pd.Series or array-like
        Ground-truth labels (0 or 1). The Series name is irrelevant; the
        values are always stored under the ``'target'`` column internally.
    y_pred : pd.Series or array-like
        Predicted scores; higher means more likely to be a positive.
        If both inputs are Series they are aligned on their index by the
        DataFrame constructor, so they should share the same index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the Gini of the predicted ordering
        divided by the Gini of the ideal ordering, and ``D`` is the share of
        positives found within the first 4% of the total weight.
    """

    def add_weights(df):
        # Vectorised equivalent of a per-row "20 if target == 0 else 1".
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(df) -> float:
        df = add_weights(df)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(df) -> float:
        df = add_weights(df)
        weighted_pos = df['target'] * df['weight']
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = weighted_pos.sum()
        df['cum_pos_found'] = weighted_pos.cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(df) -> float:
        # The ideal ranking orders rows by the label itself. Build it from the
        # already-normalised 'target' column so the result does not depend on
        # the name (or type) of the caller's y_true.
        true_df = (
            df[['target']]
            .assign(prediction=df['target'])
            .sort_values('prediction', ascending=False)
        )
        return weighted_gini(df) / weighted_gini(true_df)

    df = pd.DataFrame({'target': y_true, 'prediction': y_pred}).sort_values('prediction', ascending=False)
    # Each helper adds scratch columns, so hand every one its own copy.
    g = normalized_weighted_gini(df.copy())
    d = top_four_percent_captured(df.copy())

    return 0.5 * (g + d)

In [4]:
# Load at most the first million rows of each file, indexed by customer.
# NOTE(review): a row cap on the statement file can cut the last customer's
# history short — confirm this is acceptable for the exploration below.
train_data = pd.read_csv(input_path.joinpath('train_data.csv'), index_col='customer_ID', nrows=1_000_000)

train_labels = pd.read_csv(
    input_path.joinpath('train_labels.csv'),
    index_col='customer_ID',
    nrows=1_000_000,
)

In [5]:
# Preview the first rows of both frames with a starred divider in between.
display(train_data.head())
print('\n', '*'*50, '\n', sep='\n')
display(train_labels.head())

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827




**************************************************




Unnamed: 0_level_0,target
customer_ID,Unnamed: 1_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0


In [None]:
# Keep only the labels whose customer_ID appears in the loaded statement rows.
train_labels = train_labels.loc[train_labels.index.isin(train_data.index)]

In [None]:
# Last row per customer (customer_ID is the index level here).
# Presumably rows are in chronological order per customer — verify against the data.
df_last_month = train_data.groupby(level='customer_ID').tail(n=1)

In [None]:
# Caption, then the first five rows of the per-customer last-row frame.
print('Last Month Data of each customer')
display(df_last_month.head(n=5))

In [None]:
# Attach each customer's label to their last statement row. validate makes
# the merge fail loudly if either side has a duplicated customer_ID.
final_data = df_last_month.merge(
    train_labels,
    how='inner',
    on='customer_ID',
    validate='one_to_one',
)
print('Final Data after merging')
display(final_data.head())

In [None]:
# Ten columns with the largest absolute correlation to the target.
# Non-numeric columns are dropped first: DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently skipping them.
# Note that 'target' itself is part of the ranking.
display(
    final_data.select_dtypes(include='number')
    .corr()['target']
    .abs()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Density of P_2 for each target value, drawn on one labeled axis so the
# two curves can be told apart.
fig, ax = plt.subplots()
for target_value in (0, 1):
    sns.kdeplot(
        final_data.loc[final_data['target'].eq(target_value), 'P_2'],
        label=f'target = {target_value}',
        ax=ax,
    )
ax.set_title('P_2 density by target')
ax.legend()
plt.show()

In [None]:
# Density of D_48 for each target value, drawn on one labeled axis so the
# two curves can be told apart.
fig, ax = plt.subplots()
for target_value in (0, 1):
    sns.kdeplot(
        final_data.loc[final_data['target'].eq(target_value), 'D_48'],
        label=f'target = {target_value}',
        ax=ax,
    )
ax.set_title('D_48 density by target')
ax.legend()
plt.show()

In [None]:
# Re-read the full files, restricted to the chosen feature columns plus the
# customer key. This rebinds train_data / train_labels, replacing the
# row-capped frames loaded earlier.
use_cols = [
    'P_2', 'D_48', 'B_2', 'D_61', 'B_18', 'D_55', 'B_9', 'D_44', 'B_33',
    'customer_ID',
]
train_data = pd.read_csv(input_path.joinpath('train_data.csv'), usecols=use_cols)
train_labels = pd.read_csv(input_path.joinpath('train_labels.csv'), index_col='customer_ID')

In [None]:
# Show the reloaded, column-restricted training frame as the cell output.
train_data

In [None]:
# Last row per customer, with missing values replaced by the sentinel -999.
last_month_df = (
    train_data
    .groupby('customer_ID')
    .tail(n=1)
    .fillna(value=-999)
)

In [None]:
# Join the labels (indexed by customer_ID) onto the last-row features.
last_month_train_df = pd.merge(
    last_month_df,
    train_labels,
    how='inner',
    on='customer_ID',
)

In [None]:
# Read the same columns from the test file and keep the last row per customer.
test_data = pd.read_csv(input_path / 'test_data.csv', usecols=use_cols)
last_month_test_df = test_data.groupby('customer_ID').tail(1)
# Model features are every loaded column except the customer key; deriving
# the list from use_cols keeps the two from drifting apart.
traincols = [col for col in use_cols if col != 'customer_ID']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest (and every prediction that
# follows) reproducible across runs; n_jobs=-1 trains trees on all cores.
rfclassifier = RandomForestClassifier(random_state=42, n_jobs=-1)
rfclassifier.fit(last_month_train_df[traincols], last_month_train_df['target'])

In [None]:
# Score the test rows using the same -999 fill applied to the training rows.
# Column 1 of predict_proba is kept — presumably the positive class given
# 0/1 labels; confirm via rfclassifier.classes_.
y_pred_rfc = rfclassifier.predict_proba(
    last_month_test_df.loc[:, traincols].fillna(value=-999)
)[:, 1]

In [None]:
# assign() builds a new frame rather than writing into the result of
# groupby().tail(), which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
last_month_test_df = last_month_test_df.assign(prediction=y_pred_rfc)
# Mean predicted score across test customers, shown as the cell output.
last_month_test_df['prediction'].mean()

In [None]:
# Show the customer key next to its predicted score.
last_month_test_df.loc[:, ['customer_ID', 'prediction']]