In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<p style="font-family: Arial; font-size: 20px; text-align: center; font-style: normal; line-height: 1.3">The objective of this competition is to predict the probability that a customer will not pay back their credit card balance in the future, based on their monthly customer profile.</p>

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

- `D_*` = Delinquency variables
- `S_*` = Spend variables
- `P_*` = Payment variables
- `B_*` = Balance variables
- `R_*` = Risk variables

The following features are categorical: `B_30`, `B_38`, `D_114`, `D_116`, `D_117`, `D_120`, `D_126`, `D_63`, `D_64`, `D_66`, `D_68`


Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Thanks to: https://www.kaggle.com/code/ripcurl/amex-eda-default-prediction/edit

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from itertools import cycle

import warnings, gc
warnings.filterwarnings('ignore')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score,confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Read the training labels (the `target` column is used further down) and preview them.
df_tr_lab = pd.read_csv(
    '../input/amex-default-prediction/train_labels.csv'
)
df_tr_lab.head(n=5)

In [None]:
# Only the first 30000 data rows of the training file are loaded here.
df_train = pd.read_csv(
    '../input/amex-default-prediction/train_data.csv',
    nrows=30000,
)

In [None]:
# Preview the first five rows of the loaded training slice.
df_train.head(n=5)

In [None]:
# Show the string-typed (object) columns. The `np.object` alias was removed from
# NumPy, so the dtype is selected by its name instead.
df_train.select_dtypes(include='object').head()

In [None]:
# Frequency of each D_63 category in the training slice.
df_train['D_63'].value_counts()

In [None]:
# The training data holds one row per customer *statement*, while the labels are
# per customer, so pairing rows with labels by position mislabels them.
# Look every row's label up through its customer_ID instead.
row_target = (
    df_train['customer_ID']
    .map(df_tr_lab.set_index('customer_ID')['target'])
    .rename('target')
)
pd.crosstab(df_train['D_63'], row_target)

In [None]:
# Build the D_63 x target table once (labels looked up by customer_ID, because
# the data rows are per statement and the labels per customer), then draw both
# views of the same table.
d63_vs_target = pd.crosstab(
    df_train['D_63'],
    df_train['customer_ID']
    .map(df_tr_lab.set_index('customer_ID')['target'])
    .rename('target'),
)
d63_vs_target.plot(kind='bar')
d63_vs_target.plot(kind='kde')

## convert categorical variable to "dummies"
* first fill null values

In [None]:
# Missing D_63 entries are replaced by the category 'CQ'.
df_train['D_63'] = df_train['D_63'].fillna(value='CQ')

In [None]:
# One-hot encode D_63; the original column is replaced by D_63_* indicator columns.
df_train = pd.get_dummies(data=df_train, columns=['D_63'])
df_train.head(n=5)

In [None]:
# Frequency of each D_64 category before cleaning.
df_train['D_64'].value_counts()

* replace the value `-1` with `O` (the most common value)
* fill null values with `O` as well

In [None]:
# Fold the '-1' category and the missing entries of D_64 into 'O', then re-count.
df_train['D_64'] = (
    df_train['D_64']
    .mask(df_train['D_64'] == '-1', 'O')
    .fillna('O')
)
df_train['D_64'].value_counts()

Get dummies for categorical value

In [None]:
# One-hot encode D_64; the original column is replaced by D_64_* indicator columns.
df_train = pd.get_dummies(data=df_train, columns=['D_64'])
df_train.head(n=5)

Drop the date column (`S_2`) for now.

In [None]:
# Remove the S_2 column from the feature frame.
df_train = df_train.drop(columns=['S_2'])

### fill missing values

In [None]:
# Per-column means, used below to fill missing values.
# numeric_only=True is required because the frame still contains the string
# customer_ID column; without it pandas >= 2.0 raises instead of skipping it.
df_mean = df_train.mean(numeric_only=True)
df_mean['P_2']

In [None]:
# Preview the frame after encoding and dropping S_2.
df_train.head(n=5)

In [None]:
# Fill gaps in every column except the first one with that column's mean.
for col in df_train.columns[1:]:
    col_mean = df_mean[col]
    df_train[col] = df_train[col].fillna(col_mean)

## build a model

In [None]:
# Features: every column except the first one (customer_ID).
X_train = df_train.values[:, 1:]
# Labels: looked up per row through customer_ID. Slicing the first 30000 labels
# by position would pair statement-level rows with unrelated customers.
Y_train = df_train['customer_ID'].map(
    df_tr_lab.set_index('customer_ID')['target']
).values
X_train.shape, Y_train.shape

In [None]:
# Logistic regression baseline: fit on the training arrays and predict them back.
logreg = LogisticRegression(solver='liblinear', max_iter=300)
logreg.fit(X_train, Y_train)
Y_train_pred = logreg.predict(X_train)

# Mean accuracy on the data the model was fitted on.
logreg.score(X_train, Y_train)

In [None]:
# Training accuracy and confusion matrix of the logistic regression predictions.
print('acc: ', accuracy_score(y_true=Y_train, y_pred=Y_train_pred))
confusion_matrix(y_true=Y_train, y_pred=Y_train_pred)

#### Test the model
As a validation set we will use 30000 rows read from `train_data.csv`.

In [None]:
# Drop the training frame (X_train / Y_train were already extracted from it)
# and run a garbage-collection pass before the next data load.
del df_train
gc.collect()

In [None]:
# Validation slice. `range(1, -30000)` is an empty range, so nothing was skipped
# and the "validation" data was the very same first 30000 rows used for training.
# Skip data rows 1..80000 (line 0 is the header and must be kept): the first
# 30000 rows are the training slice and the 5 x 10000 rows after them are read
# by the resampling section of this notebook, so rows 80001..110000 are held out.
df_val = pd.read_csv('../input/amex-default-prediction/train_data.csv',
                     nrows=30000, skiprows=range(1, 80001))
df_val.shape

In [None]:
# Preview the first five validation rows.
df_val.head(n=5)

#### process validation data same pipeline as train

In [None]:

# Same steps as for the training frame: clean and one-hot encode D_63 and D_64,
# drop S_2, then fill the remaining gaps with the training means.
df_val['D_63'] = df_val['D_63'].fillna(value='CQ')
df_val = pd.get_dummies(data=df_val, columns=['D_63'])

d64_cleaned = np.where(df_val['D_64'] == '-1', 'O', df_val['D_64'])
df_val['D_64'] = d64_cleaned
df_val['D_64'] = df_val['D_64'].fillna(value='O')
df_val = pd.get_dummies(data=df_val, columns=['D_64'])

df_val = df_val.drop(columns=['S_2'])
for c in df_val.columns[1:]:
    train_mean = df_mean[c]
    df_val[c] = df_val[c].fillna(train_mean)

In [None]:
# Features: every column except the first one (customer_ID).
X_val = df_val.values[:, 1:]
# Labels: looked up per row through customer_ID. Taking the last 30000 labels by
# position has no relation to the rows that were actually loaded into df_val.
Y_val = df_val['customer_ID'].map(
    df_tr_lab.set_index('customer_ID')['target']
).values
X_val.shape, Y_val.shape

In [None]:
# Hard class predictions of the logistic regression on the validation features.
Y_pred = logreg.predict(X=X_val)

In [None]:
# Class probabilities on the validation features; show the first five rows.
Y_pred_lg_prob = logreg.predict_proba(X=X_val)
Y_pred_lg_prob[:5]

In [None]:
# ROC AUC from the second probability column (the classifier's class-1 score).
roc_auc_score(y_true=Y_val, y_score=Y_pred_lg_prob[:, 1])

In [None]:
# Confusion matrix of the validation predictions.
confusion_matrix(y_true=Y_val, y_pred=Y_pred)

In [None]:
# Validation accuracy.
accuracy_score(y_true=Y_val, y_pred=Y_pred)

In [None]:
# Per-class precision / recall / F1 on the validation predictions.
val_report = classification_report(y_true=Y_val, y_pred=Y_pred)
print(val_report)

## Let's try to do better
### KNN

In [None]:
# k-nearest-neighbours classifier with k = 15, fitted on the training arrays.
knn15 = KNeighborsClassifier(n_neighbors=15)
knn15.fit(X_train, Y_train)
Y_train_pred = knn15.predict(X_train)

# Mean accuracy on the data the model was fitted on.
knn15.score(X_train, Y_train)

In [None]:
# Training accuracy and confusion matrix of the k-NN predictions.
print('acc: ', accuracy_score(y_true=Y_train, y_pred=Y_train_pred))
confusion_matrix(y_true=Y_train, y_pred=Y_train_pred)

In [None]:
# Hard class predictions of the k-NN model on the validation features.
Y_pred = knn15.predict(X=X_val)

In [None]:
# Validation confusion matrix, accuracy and per-class report for k-NN.
print(confusion_matrix(y_true=Y_val, y_pred=Y_pred))
print(accuracy_score(y_true=Y_val, y_pred=Y_pred))
print(classification_report(y_true=Y_val, y_pred=Y_pred))

In [None]:
# k-NN class probabilities on the validation features; show the first five rows.
Y_pred_knn_prob = knn15.predict_proba(X=X_val)
Y_pred_knn_prob[:5]

In [None]:
# ROC AUC of the k-NN class-1 scores on the validation set.
roc_auc_score(y_true=Y_val, y_score=Y_pred_knn_prob[:, 1])

## And the king: RandomForest
* next week we will learn about it

In [None]:
# Random forest with 6 trees; the seed is fixed so reruns give the same forest.
rf = RandomForestClassifier(n_estimators=6, random_state=42).fit(X_train, Y_train)
# The training metrics below must come from the forest itself; using
# knn15.predict here just re-printed the k-NN results under the RF heading.
Y_train_pred = rf.predict(X_train)

# score - Return the mean accuracy on the given test data and labels.
print(rf.score(X_train, Y_train))
print(confusion_matrix(Y_train, Y_train_pred))
print(accuracy_score(Y_train, Y_train_pred))
print(classification_report(Y_train, Y_train_pred))

In [None]:
# Random-forest predictions on the validation features and their metrics.
Y_pred = rf.predict(X=X_val)
print('Test confusion matrix:\n', confusion_matrix(y_true=Y_val, y_pred=Y_pred))
print('Test acc: ', accuracy_score(y_true=Y_val, y_pred=Y_pred))
print(classification_report(y_true=Y_val, y_pred=Y_pred))

In [None]:
# Random-forest class probabilities on the training features; first five rows.
Y_pred_rf_prob_train = rf.predict_proba(X=X_train)
Y_pred_rf_prob_train[:5]

In [None]:
# Displayed as a (label, value) tuple: training ROC AUC of the random forest.
('auc train: ', roc_auc_score(y_true=Y_train, y_score=Y_pred_rf_prob_train[:, 1]))

In [None]:
# Random-forest class probabilities on the validation features; first five rows.
Y_pred_rf_prob = rf.predict_proba(X=X_val)
Y_pred_rf_prob[:5]

In [None]:
# Y_pred_rf_prob was computed on X_val, so it has to be scored against Y_val;
# scoring it against Y_train compares predictions with unrelated labels.
roc_auc_score(Y_val, Y_pred_rf_prob[:, 1])

### ensemble the models

In [None]:
# Unweighted average of the three models' validation probabilities.
Y_pred_ensemble_p = np.divide(
    Y_pred_lg_prob + Y_pred_knn_prob + Y_pred_rf_prob,
    3.,
)

In [None]:
# Predicted class = column with the highest averaged probability.
Y_pred_ensemble = Y_pred_ensemble_p.argmax(axis=1)
Y_pred_ensemble[:5]

In [None]:
# Displayed as a (label, value) tuple: validation ROC AUC of the ensemble.
('auc test:', roc_auc_score(y_true=Y_val, y_score=Y_pred_ensemble_p[:, 1]))

In [None]:
# Validation confusion matrix, accuracy and per-class report for the ensemble.
print(confusion_matrix(y_true=Y_val, y_pred=Y_pred_ensemble))
print('test acc: ', accuracy_score(y_true=Y_val, y_pred=Y_pred_ensemble))
print(classification_report(y_true=Y_val, y_pred=Y_pred_ensemble))

# Let's resample the data

In [None]:
def preprocess(df, fill_values=None):
    """Return a cleaned, one-hot encoded copy of a raw data frame.

    Steps, in order:
      1. fill missing ``D_63`` with ``'CQ'`` and one-hot encode it;
      2. replace ``'-1'`` and missing ``D_64`` with ``'O'`` and one-hot encode it;
      3. drop the ``S_2`` column;
      4. fill remaining gaps in every column except the first one.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``D_63``, ``D_64`` and ``S_2``. The first
        column is left untouched by the gap filling. The input is not modified.
    fill_values : pandas.Series or dict, optional
        Column name -> value used to fill missing entries. Defaults to the
        notebook-level ``df_mean`` so existing ``preprocess(df)`` calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        A new frame; dummy columns are appended after the remaining columns,
        ``D_63_*`` first and ``D_64_*`` after them.
    """
    if fill_values is None:
        fill_values = df_mean

    # Work on a copy so the caller's frame keeps its raw D_63 / D_64 values.
    out = df.copy()

    out['D_63'] = out['D_63'].fillna('CQ')
    out = pd.get_dummies(out, columns=['D_63'])

    out['D_64'] = out['D_64'].mask(out['D_64'] == '-1', 'O').fillna('O')
    out = pd.get_dummies(out, columns=['D_64'])

    out = out.drop(columns=['S_2'])

    for col in out.columns[1:]:
        # A column without a known fill value (e.g. a dummy for a category that
        # was absent when the means were computed) is left as is instead of
        # raising KeyError.
        if col in fill_values:
            out[col] = out[col].fillna(fill_values[col])
    return out

In [None]:
# Preview the label frame again.
df_tr_lab.head(n=5)

In [None]:
# Reload the first 30000 data rows and attach each row's label.
df_train = pd.read_csv('../input/amex-default-prediction/train_data.csv',
                       nrows=30000)
# The data has one row per customer statement and the labels one row per
# customer, so the label is looked up through customer_ID rather than by position.
df_train['target'] = df_train['customer_ID'].map(
    df_tr_lab.set_index('customer_ID')['target']
).values

In [None]:
k = 0
# Skip the first 30000 + k*10000 data rows (line 0 is the header and is kept).
# A 2-tuple `skiprows=(1, N)` would skip only lines 1 and N, not the range.
df_tmp = pd.read_csv('../input/amex-default-prediction/train_data.csv',
                     nrows=10000, skiprows=range(1, 30000 + k*10000 + 1))
# Label each row through its customer_ID (labels are per customer, rows are per
# statement, so positional slicing of the label file does not line up).
df_tmp['target'] = df_tmp['customer_ID'].map(
    df_tr_lab.set_index('customer_ID')['target']
).values
# ignore_index avoids duplicate index values in the combined frame.
df_train = pd.concat([df_train, df_tmp], axis=0, ignore_index=True)

In [None]:
# Size of the training frame after appending the extra chunk.
df_train.shape

In [None]:

# Oversample the positive class: from each of five further 10000-row chunks,
# keep only the rows whose target is 1 and append them to the training frame.
# NOTE(review): k = 0 re-reads the chunk that the previous cell appended in
# full, so its positive rows end up in df_train twice - confirm this is intended.
target_by_customer = df_tr_lab.set_index('customer_ID')['target']
for k in range(5):
    # Skip the first 30000 + k*10000 data rows (line 0 is the header and is
    # kept). A 2-tuple `skiprows=(1, N)` would skip only lines 1 and N.
    df_tmp = pd.read_csv('../input/amex-default-prediction/train_data.csv',
                         nrows=10000, skiprows=range(1, 30000 + k*10000 + 1))
    # Labels are per customer while rows are per statement: look them up by ID.
    df_tmp['target'] = df_tmp['customer_ID'].map(target_by_customer).values

    df_train = pd.concat([df_train, df_tmp[df_tmp.target == 1]],
                         axis=0, ignore_index=True)
    print(df_train.shape, df_train.target.sum())

In [None]:
# Shape, class balance, and per-column sums of the positive rows.
# numeric_only=True keeps the string columns (customer_ID, S_2, D_63, D_64) out
# of the sum; summing them would concatenate every string into one huge value.
(
    df_train.shape,
    df_train['target'].value_counts(),
    df_train[df_train['target'] == 1].sum(numeric_only=True),
)

In [None]:
# Class balance after resampling.
df_train['target'].value_counts()

In [None]:

# Run the shared preprocessing on the feature columns only, then put the
# target back as the last column.
df_train = pd.concat(
    objs=[
        preprocess(df_train.drop(columns=['target'])),
        df_train['target'],
    ],
    axis=1,
)

In [None]:
# Features: everything between the first column and the trailing target column.
X_train = df_train.to_numpy()[:, 1:-1]
Y_train = df_train['target'].to_numpy()
(X_train.shape, Y_train.shape)

In [None]:
# Logistic regression refitted on the resampled training arrays.
logreg = LogisticRegression(solver='liblinear', max_iter=1000)
logreg.fit(X_train, Y_train)
Y_train_pred = logreg.predict(X_train)

# Mean accuracy on the data the model was fitted on.
logreg.score(X_train, Y_train)

In [None]:
# Training accuracy and confusion matrix of the refitted model.
print('train acc: ', accuracy_score(y_true=Y_train, y_pred=Y_train_pred))
confusion_matrix(y_true=Y_train, y_pred=Y_train_pred)

In [None]:
# Hard class predictions of the refitted model on the validation features.
Y_pred = logreg.predict(X=X_val)

In [None]:
# Validation accuracy and confusion matrix of the refitted model.
print('test acc: ', accuracy_score(y_true=Y_val, y_pred=Y_pred))
confusion_matrix(y_true=Y_val, y_pred=Y_pred)

In [None]:
# Validation probabilities of the refitted logistic regression.
Y_pred_lg_prob = logreg.predict_proba(X_val)
# Show the array that was just computed; the cell used to display the stale
# k-NN probabilities from an earlier section.
Y_pred_lg_prob[0:5]

In [None]:
# Validation ROC AUC of the refitted logistic regression.
roc_auc_score(y_true=Y_val, y_score=Y_pred_lg_prob[:, 1])

## Predict for test and submit

In [None]:
# Load the sample submission; it is used below as the frame to be filled in.
df_subm = pd.read_csv(
    "../input/amex-default-prediction/sample_submission.csv"
)
df_subm.head(n=5)

In [None]:
# Number of complete 30000-row chunks that fit in the submission frame.
df_subm.shape[0] // 30000

In [None]:
# Score the test file chunk by chunk and reduce to one prediction per customer.
#
# Why this is not a plain "read k-th block, append predictions" loop:
#   * the data has one row per customer *statement*, while the submission needs
#     one probability per customer_ID, so row-level predictions have to be
#     aggregated (here: the mean over a customer's statements);
#   * `skiprows=(a, b)` skips only the two lines a and b, so every iteration
#     would re-read almost the same rows; `chunksize` streams the file instead;
#   * a chunk may lack a category seen in training, so the encoded frame is
#     re-indexed to the training feature columns (absent dummies become 0).
TEST_CHUNK_ROWS = 30000
feature_columns = df_train.columns[1:-1]  # same columns X_train was built from

chunk_sums, chunk_counts = [], []
for df_test in pd.read_csv('../input/amex-default-prediction/test_data.csv',
                           chunksize=TEST_CHUNK_ROWS):
    customer_ids = df_test['customer_ID'].values
    df_test = preprocess(df_test).reindex(columns=feature_columns, fill_value=0)
    X = df_test.to_numpy(dtype='float64')
    row_prob = pd.Series(logreg.predict_proba(X)[:, 1], index=customer_ids)
    # A customer's statements can straddle two chunks, so keep sums and counts
    # and divide only after the whole file has been read.
    per_customer = row_prob.groupby(level=0)
    chunk_sums.append(per_customer.sum())
    chunk_counts.append(per_customer.count())

prob_sum = pd.concat(chunk_sums).groupby(level=0).sum()
row_count = pd.concat(chunk_counts).groupby(level=0).sum()
# One value per submission row, in the submission's own customer order.
pred = df_subm['customer_ID'].map(prob_sum / row_count).tolist()
print(len(pred))
    

In [None]:
# Shape of the feature matrix left over from the last iteration of the prediction loop.
X.shape

In [None]:
# Store the collected predictions in the submission frame's `prediction` column.
df_subm = df_subm.assign(prediction=pred)

In [None]:
# Preview the filled-in submission.
df_subm.head(n=5)

In [None]:
# Column names of the submission frame.
df_subm.columns

In [None]:
# Write the submission file without the row index.
df_subm.to_csv(path_or_buf='submission.csv', index=False)