In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [2]:

# calculate file size in KB, MB, GB
def convert_bytes(size):
    """ Convert bytes to KB, or MB or GB"""
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)
        size /= 1024.0

# display CSV file with size
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        csvfile=os.path.join(dirname, filename)
        csvfilesize = os.path.getsize(csvfile)
        filesize = convert_bytes(csvfilesize)
        print(f'{csvfile} size is', filesize, 'bytes')

/kaggle/input/amex-default-prediction/sample_submission.csv size is 59.1 MB bytes
/kaggle/input/amex-default-prediction/train_data.csv size is 15.3 GB bytes
/kaggle/input/amex-default-prediction/test_data.csv size is 31.5 GB bytes
/kaggle/input/amex-default-prediction/train_labels.csv size is 29.3 MB bytes


In [3]:
# Importing the dataset
from pathlib import Path

input_path = Path('/kaggle/input/amex-default-prediction/')

In [4]:
import dask.dataframe as dd

In [5]:
# Loading dataset train_data.csv
train_df_sample = pd.read_csv('../input/amex-default-prediction/train_data.csv', nrows=100000)

In [6]:
# get shape of dataframe
print('Shape of dataset is:', train_df_sample.shape)

# print summary of dataframe
train_df_sample.info()

Shape of dataset is: (100000, 190)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 145.0+ MB


In [7]:
# Loading dataset train_labels.csv
train_label_df = pd.read_csv('../input/amex-default-prediction/train_labels.csv')

In [8]:
# get shape of dataframe
print('Shape of dataset is:', train_label_df.shape)

# print summary of dataframe
train_label_df.info()

Shape of dataset is: (458913, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


In [9]:
# Loading dataset test_data.csv
test_df = pd.read_csv('../input/amex-default-prediction/test_data.csv', nrows=100000, index_col='customer_ID')

In [10]:

# get shape of dataframe
print('Shape of dataset is:', test_df.shape)

# print summary of dataframe
#test_df.info(verbose=True)
test_df.info()

Shape of dataset is: (100000, 189)
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7 to 0246c7eb137ed9b08014d66e29caf1772b0512becef11a1eda0948b8b8908576
Columns: 189 entries, S_2 to D_145
dtypes: float64(185), int64(1), object(3)
memory usage: 145.0+ MB


In [11]:
# Merge of train_df_sample and train_label_df dataframe using key as customer_ID
train_df = dd.merge(train_df_sample,train_label_df,on='customer_ID')

In [12]:
# print summary of merged dataframe
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Columns: 191 entries, customer_ID to target
dtypes: float64(185), int64(2), object(4)
memory usage: 146.5+ MB


In [13]:
# shape of train dataframe
train_df.shape

(100000, 191)

In [14]:
# Top 5 records of the data frame --> observe NaN values in the data frame
train_df.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827,0


In [15]:
# Shape of test dataframe
test_df.shape

(100000, 189)

In [16]:
# Top 5 records of test data frame
test_df.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-02-19,0.631315,0.001912,0.010728,0.814497,0.007547,0.168651,0.009971,0.002347,0.113189,...,,,,,0.004669,,,,0.008281,
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-03-25,0.587042,0.005275,0.011026,0.810848,0.001817,0.241389,0.000166,0.009132,0.123035,...,,,,0.000142,0.00494,0.009021,,0.003695,0.003753,0.00146
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-04-25,0.609056,0.003326,0.01639,1.00462,0.000114,0.266976,0.004196,0.004192,0.125319,...,,,,7.4e-05,0.002114,0.004656,,0.003155,0.002156,0.006482
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-05-20,0.614911,0.009065,0.021672,0.816549,0.009722,0.188947,0.004123,0.015325,0.123439,...,,,,0.004743,0.006392,0.00289,,0.006044,0.005206,0.007855
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-06-15,0.591673,0.238794,0.015923,0.810456,0.002026,0.180035,0.000731,0.011281,0.122212,...,,,,0.008133,0.004329,0.008384,,0.001008,0.007421,0.009471


Note there are NaN values in the data frames

<a name = Section5></a>

---
# **Data Pre-Processing**
---

As per sweetviz report, there are no duplicate rows. Lets check missing values.

In [17]:
#Check if there are null/missing values
train_df.isnull()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
99996,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
99997,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,True,False,False,False,False
99998,False,False,False,False,False,False,False,True,False,False,...,True,True,False,False,False,False,False,False,False,False


There are missing values or Nan values

In [18]:
#drop variables with missing values >=70% in the train dataframe
i=0
for col in train_df.columns:
    if (train_df[col].isnull().sum()/len(train_df[col])*100) >=70:
        print("Column Dropped", col)
        train_df.drop(labels=col,axis=1,inplace=True)
        i=i+1
        
print("Total dropped columns are", i)

Column Dropped D_42
Column Dropped D_49
Column Dropped D_53
Column Dropped D_66
Column Dropped D_73
Column Dropped D_76
Column Dropped R_9
Column Dropped D_82
Column Dropped B_29
Column Dropped D_87
Column Dropped D_88
Column Dropped D_106
Column Dropped R_26
Column Dropped D_108
Column Dropped D_110
Column Dropped D_111
Column Dropped B_39
Column Dropped B_42
Column Dropped D_132
Column Dropped D_134
Column Dropped D_135
Column Dropped D_136
Column Dropped D_137
Column Dropped D_138
Column Dropped D_142
Total dropped columns are 25


In [19]:
#drop variables with missing values >=70% in the test dataframe
i=0
for col in test_df.columns:
    if (test_df[col].isnull().sum()/len(test_df[col])*100) >=70:
        print("Column Dropped", col)
        test_df.drop(labels=col,axis=1,inplace=True)
        i=i+1
        
print("Total dropped columns are", i)

Column Dropped D_42
Column Dropped D_49
Column Dropped D_53
Column Dropped D_66
Column Dropped D_73
Column Dropped D_76
Column Dropped R_9
Column Dropped D_82
Column Dropped B_29
Column Dropped D_87
Column Dropped D_88
Column Dropped D_106
Column Dropped R_26
Column Dropped D_108
Column Dropped D_110
Column Dropped D_111
Column Dropped B_39
Column Dropped B_42
Column Dropped D_132
Column Dropped D_134
Column Dropped D_135
Column Dropped D_136
Column Dropped D_137
Column Dropped D_138
Column Dropped D_142
Total dropped columns are 25


In [20]:
#Dropping Customer ID and S_2 column in training data

def drop_features():
  train_df.drop(columns=['customer_id', 'S_2'], inplace=True)

In [21]:
#Dropping Customer ID and S_2 column in training data

def drop_features():
  test_df.drop(columns=['customer_id', 'S_2'], inplace=True)

In [22]:
#installation
!pip install dtale

Collecting dtale
  Downloading dtale-2.10.0-py2.py3-none-any.whl (14.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dash-daq
  Downloading dash_daq-0.5.0.tar.gz (642 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m642.7/642.7 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Collecting dash>=2.0.0
  Downloading dash-2.7.1-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Collecting Flask-Compress
  Downloading Flask

In [23]:
#Importing Libraries
import dtale
d = dtale.show(train_df)
d = dtale.show(test_df)
d.open_browser()

In [24]:
#Installing Library
!pip install xverse

Collecting xverse
  Downloading xverse-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: xverse
Successfully installed xverse-1.0.5
[0m

In [25]:
#Importing Library and applying WOE

import xverse  #run if required

#Splitting the data as X and y and applying clf.fit

X = train_df.drop('target',axis=1)
y = train_df['target']
from xverse.transformer import WOE
clf = WOE()
clf.fit(X, y)


divide by zero encountered in log



WOE(mono_custom_binning={'B_1': array([-0.29512603,  0.01383925,  0.08034429,  1.32405262]),
                         'B_10': array([-2.95628301e-03,  4.17115742e-02,  2.78950980e-01,  6.39939489e+02]),
                         'B_11': array([3.70907215e-08, 8.62739218e-03, 6.11251584e-02, 1.47613099e+00]),
                         'B_12': array([1.00222961e-06, 1.28415911e-02, 4.57043555e-02, 1.34307985e+01]),
                         'B_13': array([2.38817272e-07, 1.32848...
              'B_38': {Interval(0.999, 2.0, closed='right'): -1.1704582427656727,
                       Interval(2.0, 3.0, closed='right'): 0.3662090466132588,
                       Interval(3.0, 7.0, closed='right'): 1.3984265328174859,
                       'NA': 0.3441083477012124},
              'B_4': {Interval(-0.000999526, 0.0414, closed='right'): -1.6133898147587555,
                      Interval(0.0414, 0.172, closed='right'): -0.280066914091969,
                      Interval(0.172, 2.405, closed='r

In [26]:
#Applying Tranformation
X = clf.transform(X)
X

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_130,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145
0,0.0,-0.115845,-3.420113,-0.433424,-1.434682,-1.230398,0.583619,-1.141257,0.438481,-1.295222,...,-0.263149,-0.196940,-0.138288,0.017896,-0.179662,-0.065485,-0.151869,-0.166064,-0.009324,-0.176297
1,0.0,-0.141561,-3.420113,-0.433424,-1.434682,-1.230398,-0.372960,-1.141257,-0.249094,-1.295222,...,-0.263149,-0.196940,0.252293,-0.014982,-0.179662,-0.043716,-0.161144,0.259180,-0.018481,0.255292
2,0.0,0.057427,-3.420113,0.002019,-0.561140,-1.230398,-0.372960,-1.141257,-0.267670,-0.681252,...,-0.291836,-0.196940,0.252293,-0.003051,-0.179662,0.086126,-0.151869,-0.166064,-0.018481,-0.176297
3,0.0,0.060811,-3.420113,-0.433424,-1.434682,-1.230398,-0.353030,-1.141257,-0.249094,-1.295222,...,-0.291836,0.317096,-0.138288,0.017896,-0.149252,-0.065485,-0.151869,0.259180,-0.018481,0.255292
4,0.0,0.114284,-3.420113,-0.433424,-0.561140,-1.230398,-0.372960,-1.141257,-0.267670,-0.681252,...,0.440297,-0.196940,-0.138288,-0.014982,-0.179662,-0.065485,0.254139,-0.166064,0.008231,0.255292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.094700,1.517653,0.354671,1.174453,1.195435,-0.372960,0.020148,-0.249094,1.195917,...,-0.263149,0.317096,-0.138288,0.017896,-0.149252,-0.065485,-0.151869,-0.153091,-0.009324,-0.138303
99996,0.0,0.398176,1.517653,0.002019,1.174453,1.195435,-0.372960,0.020148,-0.249094,1.195917,...,-0.291836,0.317096,-0.138288,0.017896,-0.149252,0.086126,-0.151869,-0.153091,0.008231,-0.176297
99997,0.0,-0.058454,1.517653,0.002019,1.174453,1.195435,-0.372960,0.020148,-0.267670,1.195917,...,-0.291836,-0.194257,-0.161791,-0.003051,0.267081,-0.065485,0.254139,-0.166064,-0.018481,-0.138303
99998,0.0,-0.452111,-0.713541,-0.433424,-1.434682,-0.722322,-0.372960,-0.982775,0.438481,-0.681252,...,-0.263149,-0.194257,-0.161791,-0.003051,0.267081,-0.043716,0.254139,0.259180,0.008231,0.255292


In [27]:
#RANDOM FOREST CLASSIFIER
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .20,random_state=20)
clf = WOE()
clf.fit(Xtrain, ytrain)
Xtrain = clf.transform(Xtrain)
Xtest = clf.transform(Xtest)
rf = RandomForestClassifier(n_estimators=100,class_weight='balanced')
rf.fit(Xtrain,ytrain)
print("Training Accuracy")
print(rf.score(Xtrain,ytrain))
print("Testing Accuracy")
print(rf.score(Xtest,ytest))
predicted = rf.predict(Xtest)
print(confusion_matrix(ytest,predicted))
print(classification_report(ytest,predicted))

scoresdt = cross_val_score(rf,Xtrain,ytrain,cv=10,scoring='f1')
print(scoresdt)
print("Average f1")
print(np.mean(scoresdt))

Training Accuracy
1.0
Testing Accuracy
0.89755
[[14166   759]
 [ 1290  3785]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93     14925
           1       0.83      0.75      0.79      5075

    accuracy                           0.90     20000
   macro avg       0.87      0.85      0.86     20000
weighted avg       0.90      0.90      0.90     20000

[0.76950904 0.77158701 0.76342677 0.7755102  0.78112713 0.77878396
 0.75724638 0.79250128 0.78452752 0.76360768]
Average f1
0.7737826976662672


In [28]:
#liner regression prediction
rf.predict(Xtest)

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
#liner regression predict proba
rf.predict_proba(Xtest)

array([[0.9 , 0.1 ],
       [0.9 , 0.1 ],
       [0.52, 0.48],
       ...,
       [0.91, 0.09],
       [1.  , 0.  ],
       [0.89, 0.11]])

In [30]:
prediction = rf.predict_proba(Xtest)
final_predictions = prediction[:,1]

In [31]:
output = pd.DataFrame({'customer_ID': Xtest.index,'prediction': ytest})
output.to_csv('submission.csv', index=False, header=True)