# Initial notebook with some simple ideas

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline

# Root directory of the competition CSV files; the read_csv calls in this
# notebook join a file name onto this path.
input_path = Path('/kaggle/input/amex-default-prediction/')

In [None]:
# reference https://www.kaggle.com/competitions/amex-default-prediction/discussion/327162
def amex_metric(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Score predictions with the metric referenced above.

    The score is ``0.5 * (G + D)``, where ``G`` is the normalized weighted
    Gini coefficient and ``D`` is the fraction of positives captured within
    the top 4% of total weight. Both are computed after ranking rows by
    ``y_pred`` in descending order. Rows with ``target == 0`` carry a weight
    of 20; every other row carries a weight of 1.

    Args:
        y_true: Ground-truth 0/1 labels. The Series name is irrelevant.
        y_pred: Scores where a higher value means "more likely 1". Only the
            ordering matters, so booleans or unbounded scores work too.

    Returns:
        The metric value as a float.

    Note:
        When both inputs are Series they are aligned on their index while
        being combined into one frame, so they should share the same index.
    """

    def _row_weights(target: pd.Series) -> pd.Series:
        # Vectorised form of "20 if target == 0 else 1", keeping the row index.
        return pd.Series(np.where(target.eq(0), 20, 1), index=target.index)

    def top_four_percent_captured(df) -> float:
        # Expects df already sorted by prediction, best-ranked rows first.
        weight = _row_weights(df['target'])
        four_pct_cutoff = int(0.04 * weight.sum())
        within_cutoff = weight.cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(df) -> float:
        # Expects df already sorted by prediction, best-ranked rows first.
        weight = _row_weights(df['target'])
        random = (weight / weight.sum()).cumsum()
        weighted_pos = df['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(y_true, df) -> float:
        # Reference ranking: the labels scored by themselves. The columns are
        # named explicitly so this does not depend on y_true.name being
        # 'target' (an unnamed or differently named Series used to raise
        # KeyError here).
        true_df = (pd.DataFrame({'target': y_true, 'prediction': y_true})
                   .sort_values('prediction', ascending=False))
        return weighted_gini(df) / weighted_gini(true_df)

    df = (pd.DataFrame({'target': y_true, 'prediction': y_pred})
          .sort_values('prediction', ascending=False))
    g = normalized_weighted_gini(y_true, df)
    d = top_four_percent_captured(df)

    return 0.5 * (g + d)

In [None]:
# Load only the first 1M rows of the training file, indexed by customer_ID.
train_data = pd.read_csv(
    input_path / 'train_data.csv',
    index_col='customer_ID',
    nrows=1_000_000)

# Read every label. The row cap above counts rows of train_data.csv, so
# applying the same cap to the labels file would not select the matching
# customers; labels are matched to the loaded customers by customer_ID instead.
train_labels = pd.read_csv(input_path / 'train_labels.csv', index_col='customer_ID')

In [None]:
# Keep only the labels whose customer_ID is present in the loaded train_data
# slice (train_data holds just the first 1M rows of the file).
train_labels = train_labels.loc[lambda labels: labels.index.isin(train_data.index)]

## What is the single best predictor?

In [None]:
# We are going to use only the last month
# tail(1) keeps the last row of each customer_ID group in file order.
# assumes each customer's rows are stored in chronological order, so the last
# row is the most recent one — TODO confirm.
# NOTE(review): train_data was cut at 1M rows, so the final customer's history
# may be truncated and its "last" row may not be its true last month.
last_month_train_data = train_data.groupby('customer_ID').tail(1)

In [None]:
# Attach the target to each customer's last row. Both frames are indexed by
# customer_ID; validate='one_to_one' makes pandas raise if a customer_ID
# repeats on either side.
final_data = last_month_train_data.merge(train_labels, on='customer_ID', how='inner', validate='one_to_one')

In [None]:
# Rank features by absolute Pearson correlation with the target.
# Only numeric columns are kept: DataFrame.corr raises on string columns in
# pandas >= 2.0, and select_dtypes works on older versions as well.
# 'target' itself is dropped because its self-correlation of 1 says nothing
# about which feature predicts best.
(final_data
 .select_dtypes('number')
 .corr()['target']
 .drop('target')
 .abs()
 .sort_values(ascending=False))

## Analyzing P_2

In [None]:
# Density of P_2 for each class. The curves are drawn in the order target 0,
# target 1, which is the order the legend labels rely on.
sns.kdeplot(final_data.loc[final_data['target'].eq(0), 'P_2'])
sns.kdeplot(final_data.loc[final_data['target'].eq(1), 'P_2'])

plt.legend(['target 0', 'target 1'])

We can use a simple heuristic: predict target = 1 for every customer whose P_2 is below 0.5, and target = 0 otherwise. How well does this rule score on the competition metric?

In [None]:
# Heuristic prediction: True (i.e. predicted target = 1) when P_2 is below 0.5.
# Rows with a missing P_2 compare as False.
final_data['target_test_P_2'] = final_data['P_2'].lt(0.5)

In [None]:
# Score the boolean P_2 heuristic. amex_metric ranks rows by prediction, so a
# True/False score only separates the rows into two groups.
amex_metric(final_data['target'],
            final_data['target_test_P_2'])

## Analyzing D_48

In [None]:
# Density of D_48 for each class. The curves are drawn in the order target 0,
# target 1, which is the order the legend labels below rely on.
sns.kdeplot(final_data.loc[final_data['target'].eq(0), 'D_48'])
sns.kdeplot(final_data.loc[final_data['target'].eq(1), 'D_48'])

plt.legend(['target 0', 'target 1'])

In [None]:
# Only a few values are strictly greater than 1, which looks unusual.
# Counting with gt(1) matches the non-missing rows that a D_48 <= 1 filter drops.
final_data['D_48'].gt(1).sum()

In [None]:
# Redo the chart without outliers: keep only rows with D_48 <= 1 (rows with a
# missing D_48 are dropped by the comparison as well).
tmp = final_data.loc[final_data['D_48'].le(1)]
sns.kdeplot(tmp.loc[tmp['target'].eq(0), 'D_48'])
sns.kdeplot(tmp.loc[tmp['target'].eq(1), 'D_48'])

# Curves are drawn in the order target 0, target 1.
plt.legend(['target 0', 'target 1'])

Let's try another simple heuristic: if a customer has D_48 < 0.45, predict target = 0; otherwise predict target = 1.

In [None]:
# Heuristic prediction: True (i.e. predicted target = 1) when D_48 >= 0.45.
# Rows with a missing D_48 compare as False.
final_data['target_test_D_48'] = final_data['D_48'].ge(0.45)

In [None]:
# Score the boolean D_48 heuristic. amex_metric ranks rows by prediction, so a
# True/False score only separates the rows into two groups.
amex_metric(final_data['target'],
            final_data['target_test_D_48'])

## Combining P_2 and D_48

In [None]:
# Joint density of P_2 and D_48 per class, restricted to rows with D_48 <= 1.
tmp = final_data[final_data['D_48'] <= 1]
sns.kdeplot(data=tmp, x='P_2', y='D_48', hue='target')

We can see there is a clear separation in the data. Let's try a simple linear model (the code below fits `LinearRegression`, not logistic regression) using just P_2 and D_48 from the last month.

## Initial model

In [None]:
# Reload the training file without a row cap, reading only the three columns
# used by the model. customer_ID stays a regular column here.
# NOTE: this rebinds train_data and train_labels, replacing the 1M-row sample
# loaded earlier in the notebook.
train_data = pd.read_csv(
    input_path / 'train_data.csv',
    usecols=['P_2', 'D_48', 'customer_ID'])

# Labels are indexed by customer_ID.
train_labels = pd.read_csv(input_path / 'train_labels.csv', index_col='customer_ID')

In [None]:
# Keep one row per customer: the last row of each customer_ID group in file order.
# assumes each customer's rows are stored in chronological order, so the last
# row is the most recent one — TODO confirm.
last_month_train_data = train_data.groupby('customer_ID').tail(1)

In [None]:
# Join the labels onto the per-customer rows. customer_ID is a column on the
# left and the index on the right; validate='one_to_one' makes pandas raise if
# a customer_ID repeats on either side.
last_month_train_data = last_month_train_data.merge(train_labels, on='customer_ID', how='inner',
                                                    validate='one_to_one')

In [None]:
# Disabled experiment (kept for reference, not executed): random oversampling.
# Each target class would be topped up, by sampling with replacement, to the
# size of the largest class before fitting.
# # over sample
# max_size = last_month_train_data['target'].value_counts().max()
# lst = [last_month_train_data]
# for class_index, group in last_month_train_data.groupby('target'):
#     lst.append(group.sample(max_size-len(group), replace=True))
# last_month_train_data_over_sampled = pd.concat(lst)

In [None]:
# Plain linear regression on the 0/1 target: its output is a real-valued
# score, not a probability.
# NOTE(review): LogisticRegression is imported but unused in the visible
# cells — confirm that a linear model is what is intended here.
lr = LinearRegression()

In [None]:
# Fit on the two features only; missing values are replaced by -999 first.
# NOTE(review): in a linear model a -999 sentinel behaves as an extreme value
# rather than a "missing" flag — consider another imputation, applied
# identically wherever this model is used for prediction.
lr.fit(last_month_train_data[['P_2', 'D_48']].fillna(-999), 
       last_month_train_data['target'])

In [None]:
# Load the test file, reading only the two model features and the customer key.
test_data = pd.read_csv(
    input_path / 'test_data.csv',
    usecols=['P_2', 'D_48', 'customer_ID'])

In [None]:
# One row per test customer: the last row of each customer_ID group in file order.
# .copy() detaches the result from test_data so a 'prediction' column can be
# assigned on it later without pandas emitting SettingWithCopyWarning.
last_month_test_data = test_data.groupby('customer_ID').tail(1).copy()

In [None]:
# Score the test customers with the same two features and the same -999 fill
# that was used when fitting.
y_pred = lr.predict(last_month_test_data[['P_2', 'D_48']].fillna(-999))

In [None]:
# Store the linear-regression scores next to each test customer's row.
last_month_test_data['prediction'] = y_pred

In [None]:
# Sanity check: mean predicted score across the test customers.
last_month_test_data['prediction'].mean()

In [None]:
# Write the two-column file (customer_ID, prediction) for the linear model,
# without the DataFrame index.
last_month_test_data[['customer_ID', 'prediction']].to_csv('inicial_submission.csv', index=False)

In [None]:
# Random forest regressor as a non-linear alternative to the linear model.
# A fixed seed makes the bootstrap samples, and therefore the submission,
# reproducible across kernel restarts; n_jobs=-1 only parallelises tree
# building and does not change the fitted model.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
# Fit the forest on the same two features, with missing values replaced by -999.
rf.fit(last_month_train_data[['P_2', 'D_48']].fillna(-999), 
       last_month_train_data['target'])

In [None]:
# Score the test customers with the forest; this rebinds y_pred, replacing the
# linear-regression scores.
y_pred = rf.predict(last_month_test_data[['P_2', 'D_48']].fillna(-999))

In [None]:
# Overwrite the 'prediction' column with the random-forest scores.
last_month_test_data['prediction'] = y_pred

In [None]:
# Sanity check: mean predicted score of the random forest across test customers.
last_month_test_data['prediction'].mean()

In [None]:
# Write the two-column file (customer_ID, prediction) for the random forest,
# without the DataFrame index.
last_month_test_data[['customer_ID', 'prediction']].to_csv('submission.csv', index=False)