# Packages Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import os
import sys
sys.path.append(os.path.realpath('..')) #note to self: this works, only when notebook is alrdy saved in directory. So, first save notebook and then use this line of code.
from scipy.stats import normaltest
from scipy.stats import anderson
from scipy.stats import kendalltau
from scipy.stats import pearsonr
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
import imblearn
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier

# Data Loading

In [None]:
# Loading application records into a pandas dataframe
appl_df = pd.read_csv( '../input/credit-card-approval-prediction/application_record.csv', index_col= 'ID' )

In [None]:

# Loading credit records into a pandas dataframe
cred_df = pd.read_csv( '../input/credit-card-approval-prediction/credit_record.csv', index_col= 'ID' )

In [None]:
appl_df.head().T

In [None]:
cred_df.head().T

In [None]:
print( appl_df.shape )
print('')
appl_df.info()

In [None]:
print( cred_df.shape )
print('')
cred_df.info()

# Data Cleaning

## Removing duplicated data
In appl_df, although the customer IDs differ, some distinct IDs carry exactly the same information.  
An example of this is the customers with IDs 5008804 and 5008805:

In [None]:
appl_df.loc[[5008804, 5008805]].T

Using drop_duplicates is not possible in this case, since an ID that is duplicated in the application dataset might not be duplicated in the credit records dataset.  
So we first need to keep only the IDs common to both dataframes.


In [None]:
valid_indexes = list( set(appl_df.index).intersection(set(cred_df.index)) ) 
len( valid_indexes )

Filtering both dataframes by common indexes

In [None]:
#appl_df_clean =  appl_df.loc[valid_indexes]
#cred_df_clean = cred_df.loc[valid_indexes]

appl_df = appl_df.loc[valid_indexes]
cred_df = cred_df.loc[valid_indexes]

In [None]:
print(appl_df.shape)
print('')
appl_df.head().T

Adding a unique customer ID in appl_df

In [None]:
# Sort by every column so identical applications end up next to each other (safety/readability step only).
appl_df_clean = appl_df.sort_values(by = appl_df.columns.to_list())

# Build one customer identifier per application row from the *content* of the row.
# The previous `appl_df.sum(axis=1).map(hash)` only hashed a row-wise sum: on older pandas the text
# columns were silently dropped (so different applicants with the same numeric total collided), and on
# pandas >= 2 summing mixed str/number columns raises. `hash_pandas_object` hashes every column of the
# row (index excluded), treats missing values consistently and is stable across kernel restarts.
row_hash = pd.util.hash_pandas_object(appl_df, index=False)

# Lookup table ID -> customer_id (ID comes back as a column after reset_index).
grouped_cust = row_hash.rename('customer_id').reset_index()

# How many application IDs share each customer_id (most duplicated customers first).
id_counts_df = (
    grouped_cust.groupby('customer_id').size()
    .sort_values(ascending=False)
    .rename('id_count')
    .to_frame()
)

# Assignment aligns on the ID index, so the sort above does not matter here.
appl_df_clean['cust_id'] = row_hash

In [None]:
appl_df_clean.head()

Adding the same unique customer ID to cred_df

In [None]:
# Index the ID -> customer_id lookup by application ID so it can be joined onto the credit records.
# NOTE: this rebinds grouped_cust; re-running the cell without re-running the previous one fails
# because the 'ID' column no longer exists.
grouped_cust = grouped_cust.set_index('ID')
# Attach customer_id to every credit record row and keep only the columns used downstream.
cred_df_trsf = cred_df.merge(grouped_cust, how = 'inner', on = 'ID').reset_index()[['customer_id','ID', 'MONTHS_BALANCE', 'STATUS']]

# Order the records by customer, then application ID, then MONTHS_BALANCE descending.
cred_df_g = cred_df_trsf.sort_values(by=['customer_id', 'ID', 'MONTHS_BALANCE'], ascending = [True, True, False]).reset_index(drop=True)
cred_df_g['interaction_ID'] = cred_df_g.groupby(['customer_id','ID'], sort = False).ngroup().add(1) # numbers each (customer_id, ID) pair 1..N over the whole frame (it does not restart at 1 per customer); still sufficient to count distinct IDs per customer later
# The raw application ID is no longer needed once interaction_ID exists.
cred_df_g.drop(columns = ['ID'], inplace=True)
cred_df_g = cred_df_g[['customer_id', 'interaction_ID', 'MONTHS_BALANCE', 'STATUS']]

Transforming cred_df in order to return a list of customers labeled by their behaviour type.  
This will help with getting our Y label.

In [None]:
# A month counts as "bad" ('b') when the status says the payment is 60+ days overdue or written off.
# '4' (120-149 days overdue) was previously missing from this list, so those months were counted as good.
bad_statuses = ['2', '3', '4', '5']
cred_df_g['month_behav'] = np.where( cred_df_g.STATUS.isin(bad_statuses), 'b', 'g' )

# Number of monthly records per customer.
months_per_cust = cred_df_g.groupby('customer_id').size()

# Percentage of good / bad months per customer (one row per customer and behaviour type), kept for inspection.
cust_beh = (
    (cred_df_g.groupby(['customer_id', 'month_behav']).size() / months_per_cust * 100)
    .round(2)
    .rename('behav_kpi')
    .reset_index()
    .set_index('customer_id')
)

# Percentage of good months for *every* customer; customers with no good month at all get 0.
# Working on one value per customer avoids combining boolean masks that live on different indexes.
good_pct = (
    cust_beh.loc[cust_beh.month_behav == 'g', 'behav_kpi']
    .reindex(months_per_cust.index, fill_value=0.0)
)

# Same rule as before: a customer is 'bad' when at most 50% of the months are good
# (which includes customers with only bad months), otherwise 'good'.
cred_df_clean = pd.DataFrame(
    {
        'customer_type': np.where(good_pct <= 50, 'bad', 'good'),
        'months_in_book': months_per_cust,
        'contracts_nr': cred_df_g.groupby(['customer_id'])['interaction_ID'].nunique(),
    },
    index=months_per_cust.index,
)

## Checking and Cleaning Missing Data

In [None]:
# Checking how many values are missing in credit records dataset
cred_df_clean.isnull().sum()

In [None]:
# Checking how many values are missing in application dataset
appl_df_clean.isnull().sum()

This means we only need to clean missing data in the application dataset -> OCCUPATION_TYPE column.  
Let's have a peek at it:

In [None]:
appl_df_clean.OCCUPATION_TYPE.unique()

Bam! There's our missing value right there!  
Let's replace the missing values by 'Not Available'.

In [None]:
appl_df_clean['OCCUPATION_TYPE'] = appl_df_clean['OCCUPATION_TYPE'].fillna('Not Available')

## Reclassifying some Flags
In application dataset there are 2 flag columns that have Y/N labels, while other flags in this dataset are binary variables.   
For congruence, let's pass those Y/N labels into 1 and 0's, respectively. 

In [None]:
dic = {
    'Y' : 1,
    'N' : 0
}

appl_df_clean['FLAG_OWN_CAR'] = appl_df_clean['FLAG_OWN_CAR'].replace(dic)
appl_df_clean['FLAG_OWN_REALTY'] = appl_df_clean['FLAG_OWN_REALTY'].replace(dic)

appl_df_clean.head()

### Sorting application dataset's columns  
We really don't need this extra step, but i like organized tables. Things get cleaner in my head that way.

In [None]:
flag_cols = [x for x in appl_df_clean.columns if x.startswith('FLAG_')]                                                          
cat_cols  = [x for x in appl_df_clean.columns if x.startswith('CODE_') or x.startswith('NAME_') or x.startswith('OCCUPATION_') ] 
num_cols  = [x for x in appl_df_clean.columns if x.startswith('CNT_')] + [x for x in appl_df_clean.columns if x.startswith('AMT_') or x.startswith('DAYS_')]          

#checkzone:
# 
# +1 accounts for cust_id column who does not fit in any column category because it's a "dummy" column just to join with cred_df later on
#
len(flag_cols) + len( cat_cols ) + len( num_cols ) + 1 == len( appl_df_clean.columns )  

In [None]:
appl_df_clean = appl_df_clean[flag_cols + cat_cols + num_cols + ['cust_id']]

appl_df_clean.head().T

## Joining application records dataset with customer labels dataset

In [None]:
# Work on a copy with customer_id as a regular column; cred_df_clean itself is left untouched so
# this cell can be re-run safely (the in-place reset added a stray 'index' column on a second run).
cred_labels = cred_df_clean.reset_index()

# Join on the key *columns* of the two frames being merged rather than on Series taken from
# differently-indexed objects, then drop the helper keys and restore ID as the index.
df = (
    appl_df_clean.reset_index()
    .merge(
        cred_labels,
        left_on = 'cust_id',
        right_on = 'customer_id',
        how = 'inner'
    )
    .drop(columns = ['cust_id', 'customer_id'])
    .set_index('ID')
)

# Descriptive Analytics

Now that we have our datasets cleaned we can prepare some descriptive analytics about them

## Applications Dataset

### Flag features distributions

In [None]:
fig, axes = plt.subplots(ncols=len( flag_cols ), figsize=(20,5))
for col, ax in zip(df[flag_cols], axes):
    df[col].value_counts().sort_values().plot.barh(ax=ax, title=col + ' histogram')

plt.tight_layout()    
plt.show()

From the above chart sequence we can immediately see that all the customers recorded in the application dataset have a mobile phone.  
This implies that this feature is irrelevant for default modeling, since it won't help to find differences amongst customers.  

Another fact to note is that in every flag feature, except FLAG_MOBIL, there is a notable difference between the number of customers who have the characteristic and the number who don't.  

In [None]:
# Removing FLAG_MOBIL from df since it is constant and holds no value for this study.
# errors='ignore' and the list rebuild keep the cell re-runnable (remove()/in-place drop raised on a second run).
df = df.drop(columns=['FLAG_MOBIL'], errors='ignore')
# Removing FLAG_MOBIL from flag_cols as well
flag_cols = [col for col in flag_cols if col != 'FLAG_MOBIL']

df.head()

In [None]:
#fig, axes = plt.subplots(ncols=len(flag_cols), figsize=(20,5))
#
#for col, ax in zip(appl_df_clean[flag_cols], axes):
#    n, bins, patches = \
#        plt.hist(
#            appl_df_clean[col],
#            orientation='horizontal',
#            bins = 3,
#            align= 'mid'
#            )
#    
#    plt.title(col + ' histogram of frequencies')
#    plt.ylabel(col)
#    plt.yticks(ticks=[1,0])
#    plt.ylim(0,1)
#    plt.xlabel('Frequency')
#
#    #plt.xlim([-1,1])
#
#    # Make some labels.
#
#    for rect in patches: # https://stackoverflow.com/questions/28931224/adding-value-labels-on-a-matplotlib-bar-chart
#        # Get X and Y placement of label from rect.
#        x_value = rect.get_width()
#        y_value = rect.get_y() + rect.get_height() / 2
#
#        # Number of points between bar and label. Change to your liking.
#        space = -100
#        # Vertical alignment for positive values
#        ha = 'left'
#
#        # Use X value as label and format number with one decimal place
#        #label = '{:,.0f}'.format(x_value)
#        label = f"{x_value:,.0f}"
#
#        # Create annotation
#        plt.annotate(
#            label,                      # Use `label` as label
#            (x_value, y_value),         # Place label at end of the bar
#            xytext=(space, 0),           # Horizontally shift label by `space`
#            textcoords="offset points", # Interpret `xytext` as offset in points
#            va='center',                # Vertically center label
#            ha=ha)                      # Horizontally align label differently for
#                                        # positive and negative values.
#    ;

### Categorical Features Distributions

In [None]:
fig, axes = plt.subplots( nrows= len(cat_cols),  figsize=(10,20))
for col, ax in zip(df[cat_cols], axes):
    df[col].value_counts().sort_values().plot.barh(ax=ax, title=col + ' histogram')

plt.tight_layout()    
plt.show()

From the above charts we can immediately notice that there are notable differences in all categorical features.  


In [None]:
pd.DataFrame( df[cat_cols].groupby(cat_cols).size().sort_values(ascending=False), columns = ['Value']).reset_index().head().T

From the above table, we can check what the most frequent customer profiles are.

### Numeric Features Distributions

In [None]:
# Add the two credit-history features to the numeric column list.
# Only missing names are appended, so re-running the cell does not create duplicate entries.
num_cols = num_cols + [col for col in ['months_in_book', 'contracts_nr'] if col not in num_cols]

In [None]:
# One distribution plot per numeric feature, stacked vertically.
fig, axes = plt.subplots( nrows= len(num_cols), figsize=(10,20))

for col, ax in zip(df[num_cols], axes):
    # NOTE: distplot is deprecated in seaborn >= 0.11; histplot(..., kde=True) is the documented replacement.
    sns.distplot( df[col], ax=ax )

plt.tight_layout()
# A bare ';' on its own line is not a valid Python statement, so it is dropped; plt.show() already
# renders the figure without echoing a repr.
plt.show()

Analysing the numeric feature distribution plots, we can immediately see that all of these features are skewed, with most of the mass on the lower values, except DAYS_BIRTH (perhaps it is normally distributed? We'll check this ahead).  
  
This says a lot about the majority of the profiles.  
Most customers:   
- have no or few children,  
- have small families,  
- have low incomes,  
- are about 43 years old,  
- have not been employed for very long, although there is a significant proportion who are unemployed (and yes, those positive day counts are way too high!!!) 


From here we can also see that we, probably, will have serious problems with outliers lying in the data.  

### Numeric features | Outliers detection

In [None]:
desc_num = round( df[num_cols].describe(), 0)
desc_num    

WoW! 
Check out that maximum value for DAYS_EMPLOYED!  
365,243 days unemployed translates into about 1,000 years unemployed! (according to the data dictionary, positive values for this feature mean the person is unemployed)  
This is impossible :)  
  
Actually, if we filter the data by positive values in DAYS_EMPLOYED and check the minimum value of the resulting DAYS_EMPLOYED sample, we also get 365243 days,  
which means that all of the positive values in the data are, most definitely, wrongly registered.  


In [None]:
df[df.DAYS_EMPLOYED > 0 ]['DAYS_EMPLOYED'].min()

In [None]:
appl_df[appl_df.DAYS_EMPLOYED > 0 ]['DAYS_EMPLOYED'].min()

This is bad...  
This leave us with no idea for how long people are unemployed :(

Let's check who are the customers who have that much time unemployed...

In [None]:
# Build the mask from df itself: a mask taken from appl_df_clean has a different row set/order than
# the merged df, so pandas would have to reindex it (warning or error) before filtering.
appl_df_unem = df[df.DAYS_EMPLOYED > 0 ]
appl_df_unem.head().T

Seems like they are mostly pensionists...  
Let's confirm that fact:

In [None]:
print( appl_df_unem['NAME_INCOME_TYPE'].unique() )
print('')
print( appl_df_unem['OCCUPATION_TYPE'].unique() )

In [None]:
print( appl_df[appl_df.DAYS_EMPLOYED == 365243]['NAME_INCOME_TYPE'].unique() )
print('')
print(appl_df[appl_df.DAYS_EMPLOYED == 365243]['OCCUPATION_TYPE'].unique())

So... this leads us to conclude that whoever built this dataset used the figure 365,243 days to register pensioners who have no occupation.  
Perhaps one solution is to convert this feature into intervals.  
It will help our model to understand that some people are not employed (and not unemployed either) because they are already pensioners.

Let's plot some box-and-whisker plots in order to better see the outliers in our data samples

https://www.mathbootcamps.com/how-to-make-a-boxplot-box-and-whiskers-plot-by-hand/

In [None]:
# One horizontal box plot per numeric feature, stacked vertically.
fig, axes = plt.subplots( nrows= len(num_cols), figsize=(10,20))

for col, ax in zip(df[num_cols], axes):
    # A horizontal box plot takes its values on the x axis; passing them as y contradicted orient='h'.
    sns.boxplot( x = df[col], ax=ax, orient = 'h' )

plt.tight_layout()
# The bare ';' line that followed is not valid Python and has been removed.
plt.show()

In [None]:
#fig, axes = plt.subplots( nrows= len(num_cols), figsize=(10,20))
#
#for col, ax in zip(appl_df_clean[num_cols], axes):
#    series_smoothed = appl_df_clean[col]
#    series_smoothed = series_smoothed[ series_smoothed <= series_smoothed.quantile(0.75) ]
#    sns.boxplot( y = series_smoothed, ax=ax, orient = 'h' )
#
#plt.tight_layout()    
#plt.show()
#;

As we can see from the box plots, there are a lot of outliers in most of the numeric features of our dataset.  
One common strategy to deal with outliers is to replace them with the mean, median, mode, etc. However, if we do that I believe it would not be beneficial for our model, since it would bias the data.  
I guess one solution would be to scale these variables with robust scaling:  https://machinelearningmastery.com/robust-scaler-transforms-for-machine-learning/

### Testing the "Normality" of DAYS_BIRTH feature.  
We will use 2 tests for this evaluation, in order to be sure of the results: 

https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/

In [None]:
# D’Agostino’s K^2 Test
stat, p = normaltest(df['DAYS_BIRTH'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Sample looks Gaussian (fail to reject H0)')
else:
	print('Sample does not look Gaussian (reject H0)')

In [None]:
# Anderson-Darling Test
result = anderson(df['DAYS_BIRTH'])
print('Statistic: %.3f' % result.statistic)
# The test reports one critical value per significance level: H0 (normality) is rejected at a level
# when the statistic is not below that level's critical value.
# (The unused `p = 0` was dropped: it only overwrote the p-value computed by the previous cell.)
for sl, cv in zip(result.significance_level, result.critical_values):
    if result.statistic < cv:
        print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
    else:
        print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

Safe to conclude that DAYS_BIRTH feature is not coming from a purely Gaussian/Normal probability distribution. 

### Numeric Feature correlations

We will use Kendall correlation coefficients, since the numeric variables do not appear to come from a Normal distribution (significance tests for the Pearson correlation coefficient assume normally distributed data).  
Kendall correlation coefficients are rank-based and therefore robust to lack of normality, and for that reason we hope to derive more reliable correlation values.  
https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient

In [None]:
help(pearsonr)

In [None]:
help(kendalltau)

In [None]:
# Pairwise Kendall rank correlations between the numeric features, rounded for display.
appl_corr = round(df[num_cols].corr('kendall'), 2)

plt.figure(figsize=(10, 8))

# Boolean mask hiding the upper triangle (diagonal included), so each pair is shown once.
mask = np.triu(np.ones_like(appl_corr, dtype=bool))

ax = sns.heatmap(
    appl_corr,
    annot=True,
    square=True,
    mask=mask,
    xticklabels=True,
    yticklabels=True
    )

ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Full-height rows regardless of how many numeric columns there are (was hard-coded to 7).
ax.set_ylim([0, len(appl_corr)])
# The bare ';' line that followed is not valid Python and has been removed.
plt.show()

We can see that the following features might have strong relationships: 
* CNT_FAM_MEMBERS and CNT_CHILDREN,  
* months_in_book and contracts_nr.  

Let's use hypothesis testing to check whether or not there is dependence between those pairs of features:   

In [None]:
def kendall_pvalue(pandas_series_1, pandas_series_2, alpha=0.05):
    """
    Run Kendall's tau test on two pandas series and print the verdict about their dependence.

    Parameters
    ----------
    pandas_series_1, pandas_series_2 : pandas.Series
        Samples of equal length; their ``name`` attributes are used in the printed message
        (an unnamed series is reported as 'None' instead of raising).
    alpha : float, default 0.05
        Significance level: dependence is reported when the p-value is not greater than ``alpha``.

    Returns
    -------
    None
        The result is printed: the verdict, the correlation rounded to 2 decimals and the
        p-value rounded to 4 decimals.
    """
    stat, p = kendalltau(pandas_series_1, pandas_series_2)

    # str() so that series without a name do not break the string concatenation below.
    name_1 = str(pandas_series_1.name)
    name_2 = str(pandas_series_2.name)

    if p > alpha:
        verdict = 'There is not enough statistical evidence to prove that: '
    else:
        verdict = 'There is enough statistical evidence to prove that: '

    print( verdict + name_1 + ' is dependent of ' + name_2 + '.' + '\n' + 'Kendall Corr. value is: ' + str( round(stat,2) ) + ' | p-value is: ' + str( round(p,4) ) + '.' )

In [None]:
kendall_pvalue(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS'])
print()
kendall_pvalue(df['months_in_book'], df['contracts_nr'])

In [None]:
plt.scatter(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS']);

In [None]:
plt.scatter(df['months_in_book'], df['contracts_nr']);

Apparently the idea of creating the contracts_nr feature wasn't so fortunate.  
We'll drop this feature along with CNT_CHILDREN, given their high correlations with months_in_book and CNT_FAM_MEMBERS, respectively.

In [None]:
# Drop the two features that are strongly correlated with another feature that is kept.
# Rebuilding the list and using errors='ignore' keeps the cell re-runnable
# (list.remove / in-place drop raised when the columns were already gone).
correlated_cols = ['CNT_CHILDREN', 'contracts_nr']
num_cols = [col for col in num_cols if col not in correlated_cols]
df = df.drop(columns=correlated_cols, errors='ignore')
df.head()

## Credit Records Data Set

From the below plot we can immediately see that most customers who have loans to pay in a given month pay them on time or at most 59 days overdue.  
However, there is a small portion of them that either:  
- have bad debts;  
- pay between 60 and 149 days overdue.  

Those last ones can be classified as the 'Bad' customers, because (thinking as a bank) not receiving the due amounts for more than 60 days has a very negative impact on the bank's treasury.

In [None]:
cred_df['STATUS'].value_counts().sort_values().plot.barh(title= 'STATUS histogram')

plt.tight_layout()    
plt.show()

Below dataframe give us the customers labeled as 'good' or 'bad' based on classification done previously

In [None]:
cred_df_clean.head()

Percentage of bad clients in the data:

In [None]:
percent_bad_customers = 100 * len( cred_df_clean[cred_df_clean.customer_type=='bad'] ) / cred_df_clean.shape[0] 

print( "{0:.3f}%".format( percent_bad_customers) )

In [None]:
#example of a "bad" customer
customer_id = 5142361
print( appl_df_clean.loc[customer_id] )
print('')
print( cred_df.loc[customer_id].sort_values(by=['MONTHS_BALANCE'], ascending=False) )

## Shortening Categorical Feature Classes

In [None]:
cat_cols

In [None]:
# Variable list to transform
cat_col_t = [ col for col in cat_cols if 'GENDER' not in col ] # we exclude gender as it only contains 2 categories
cat_col_t

### NAME_INCOME_TYPE

In [None]:
df['NAME_INCOME_TYPE'].value_counts()

Let's divide this variable into 'Working', 'Pensioner' and 'Student', merging 'State servant' and 'Commercial associate' into the 'Working' category. 

In [None]:
dic = {
    'Commercial associate' : 'Working',
    'State servant' : 'Working',
}
df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].replace(dic)

### NAME_EDUCATION_TYPE

In [None]:
df['NAME_EDUCATION_TYPE'].value_counts()

Let's divide this variable into 'Secondary / secondary special', 'Higher education' and 'Basic' (the renamed 'Lower secondary').

In [None]:
dic = {
    'Incomplete higher' : 'Secondary / secondary special',
    'Academic degree' : 'Higher education',
    'Lower secondary' : 'Basic'
}
df['NAME_EDUCATION_TYPE'] = df['NAME_EDUCATION_TYPE'].replace(dic)

### NAME_FAMILY_STATUS

In [None]:
df['NAME_FAMILY_STATUS'].value_counts()

In [None]:
dic = {
    'Civil marriage' : 'Married'
}

#df['NAME_FAMILY_STATUS'].replace(dic).value_counts()
df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace(dic)

### NAME_HOUSING_TYPE

In [None]:
df['NAME_HOUSING_TYPE'].value_counts()

In [None]:
# Collapse the housing categories into fewer classes (values not listed in the mapping are kept as they are).
# NOTE(review): 'House / apartment' is folded into 'Rented apartment' together with 'Co-op apartment'.
# If the data also contains a genuine 'Rented apartment' value, this puts owner-occupiers and
# renters in the same class — confirm this is intended, or pick a more neutral target label.
dic = {
    'House / apartment' : 'Rented apartment',
    'Co-op apartment' : 'Rented apartment',
    'Municipal apartment': 'Municipal or Office apartment',
    'Office apartment': 'Municipal or Office apartment'
}

df['NAME_HOUSING_TYPE'] = df['NAME_HOUSING_TYPE'].replace(dic)

### OCCUPATION_TYPE

In [None]:
df['OCCUPATION_TYPE'].value_counts()

Hmm... grouping profession types will be tough without any grouping criterion.  
Let's try to group by income.  

Even though we have a criterion, is it possible that we introduce a bias with this step?  
I'm talking about spurious relationships:  https://en.wikipedia.org/wiki/Spurious_relationship

In [None]:
# Box plot of income per occupation, with the occupations ordered by their *mean* income (highest first).
# Ordering technique adapted from:
# https://stackoverflow.com/questions/21912634/how-can-i-sort-a-boxplot-in-pandas-by-the-median-values
data = pd.DataFrame(df.groupby(['OCCUPATION_TYPE'])['AMT_INCOME_TOTAL'].mean()).reset_index().sort_values(by=['AMT_INCOME_TOTAL'], ascending=False)

plt.figure(figsize=(20,5))
ax = sns.boxplot(data=df.sort_values(by='AMT_INCOME_TOTAL', ascending = False), x='OCCUPATION_TYPE', y='AMT_INCOME_TOTAL', order = data['OCCUPATION_TYPE'], linewidth= 1)

# Trailing ';' on the last statement suppresses the tick-label repr; a ';' on a line of its own
# (as before) is not valid Python.
ax.set_xticklabels(ax.get_xticklabels(),rotation=85);

#ax.set_xlim(0,20)

In [None]:
dic = {
    'Managers' : 'Group 1',
    'Realty agents' : 'Group 1',
    'Drivers' : 'Group 1',
    'Accountants' : 'Group 1',
    'IT staff' : 'Group 2',
    'Private service staff' : 'Group 2',
    'High skill tech staff' : 'Group 2',
    'HR staff' : 'Group 2',
    'Core staff' : 'Group 2',
    'Laborers' : 'Group 3',
    'Security staff' : 'Group 3',
    'Sales staff' : 'Group 3',
    'Not Available' : 'Group 3',
    'Secretaries' : 'Group 3',
    'Medicine staff' : 'Group 4',
    'Waiters/barmen staff' : 'Group 4',
    'Cleaning staff' : 'Group 4',
    'Cooking staff' : 'Group 4',
    'Low-skill Laborers' : 'Group 4'
}

#df['OCCUPATION_TYPE'].replace(dic)
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].replace(dic)

In [None]:
data = pd.DataFrame(df.groupby(['OCCUPATION_TYPE'])['AMT_INCOME_TOTAL'].mean()).reset_index().sort_values(by=['AMT_INCOME_TOTAL'], ascending=False).round(1)

plt.figure(figsize=(8,6))
ax = sns.boxplot(data=df.sort_values(by='AMT_INCOME_TOTAL', ascending = False), x='OCCUPATION_TYPE', y='AMT_INCOME_TOTAL', order = data['OCCUPATION_TYPE'], linewidth= 1)

ax.set_xticklabels(ax.get_xticklabels(),rotation=85);

# Data Pre Processing

In this section we are going to pre-process the dataset to feed the model.  
We will first split the features by their type: flag (or binary), numeric and categorical. The train/test split is done before any scaling, to guarantee that numeric features are not scaled/normalized using information from the test set.

## Defining a couple constants first

In [None]:
rand_st = 123
test_size =0.3

## Encoding Categorical Features

In [None]:
df.head()
#df.to_csv('C:\\ML Analytics\\05 - Kaggle\\02 - Credit Card Aproval\\00 - dataset\\df.csv')

In [None]:
df_cat = df[cat_cols]
df_dumm = pd.get_dummies(df_cat,  prefix_sep='==')

df_dumm.head()

Dropping categorical columns from df dataframe, and joining the "dummy" versions

In [None]:
df.drop(columns = cat_cols, inplace= True)

In [None]:
df_dumm.head()

In [None]:
df = pd.concat([df_dumm, df], axis = 1)
df.head()

## Setting Flag Features to same data type and splitting for train and test sets

In [None]:
df[flag_cols] = df[flag_cols].astype('uint8')

df.head()

## Numeric Features Robust Scaling

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html  

Scale features using statistics that are robust to outliers.

This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).

Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Median and interquartile range are then stored to be used on later data using the transform method.

Standardization of a dataset is a common requirement for many machine learning estimators. Typically this is done by removing the mean and scaling to unit variance. However, outliers can often influence the sample mean / variance in a negative way. In such cases, the median and the interquartile range often give better results.

So, firstly we separate our dataset into train and test sets:

In [None]:
X = df.drop(columns=['customer_type'])
Y = df['customer_type']

X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = test_size, stratify = Y, random_state = rand_st )

And now we apply the scaling to numeric features:

In [None]:
#num_cols = num_cols + ['months_in_book']
df_num_X_train = X_train[num_cols]
df_num_X_test = X_test[num_cols]

# perform a robust scaler transform of the dataset (scaling by IQR only, no median centering)
trans = RobustScaler(with_centering=False, with_scaling=True)

# Learn the scaling statistics on the training set only...
df_num_X_train_s = trans.fit_transform(df_num_X_train)
df_num_X_train = pd.DataFrame( df_num_X_train_s, columns = df_num_X_train.columns, index = df_num_X_train.index)

# ...and re-use them on the test set. Calling fit_transform here (as before) re-fitted the scaler on
# the test data, so train and test were scaled differently and test information leaked into preprocessing.
df_num_X_test_s = trans.transform(df_num_X_test)
df_num_X_test = pd.DataFrame( df_num_X_test_s, columns = df_num_X_test.columns, index = df_num_X_test.index)

In [None]:
print( "train set size: {0:,}".format( len( df_num_X_train ) ) )
print('')
print( "test set size: {0:,}".format( len( df_num_X_test ) ) )

### Building X and Y dataframes to feed the models

In [None]:
X_train.drop(columns = num_cols, inplace = True)
X_train = pd.concat([X_train, df_num_X_train] ,axis=1)

X_train.head()

In [None]:
X_test.drop(columns = num_cols, inplace = True)
X_test = pd.concat([X_test, df_num_X_test] ,axis=1)

X_test.head()

Transforming the target variable into binary format

In [None]:
cust_t_dic = { 'good':0, 'bad': 1 }

Y_train = Y_train.replace(cust_t_dic).astype('uint8')
Y_test = Y_test.replace(cust_t_dic).astype('uint8')

# Dealing with Target classes Imbalance  
  
If we count the observations in each class of the target variable, we can see that there is a huge difference between them.  


In [None]:
print(Y_train.value_counts())
print()
print( Y_test.value_counts())

To try to deal with this difference, we will use SMOTE mixing oversampling with undersampling for better results:  
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/  
https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

In [None]:

# Oversample the minority class up to 10% of the majority, then undersample the majority until the
# minority is 50% of it. Both samplers are seeded so the resampling (and therefore the CV scores)
# is reproducible between runs.
oversample = BorderlineSMOTE(sampling_strategy=0.1, random_state=rand_st)
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=rand_st)

# Modelling


https://scikit-learn.org/stable/modules/grid_search.html

## Logistic Regression  
https://en.wikipedia.org/wiki/Logistic_regression  

https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://machinelearningmastery.com/cost-sensitive-logistic-regression/  
https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html  

The scikit-learn library provides an implementation of the best practice heuristic for the class weighting.

It is implemented via the compute_class_weight() function and is calculated as:

        n_samples / (n_classes * n_samples_with_class)

We can test this calculation manually on our dataset.
  
For example, we have 10,000 examples in the dataset, 9900 in class 0, and 100 in class 1.  
The weighting for class 0 is calculated as:

    weighting = n_samples / (n_classes * n_samples_with_class)
    weighting = 10000 / (2 * 9900)
    weighting = 10000 / 19800
    weighting = 0.505

The weighting for class 1 is calculated as:

    weighting = n_samples / (n_classes * n_samples_with_class)
    weighting = 10000 / (2 * 100)
    weighting = 10000 / 200
    weighting = 50

------------------------------  

LogisticRegression has a parameter that allows this heuristic to be applied automatically, by assigning class_weight = 'balanced' in the model parameters.



In [None]:
print( len(Y_train) )
print()
print( Y_train.value_counts() )

In [None]:
weightings_0 = len(Y_train) / ( len( Y_train.unique() ) * Y_train.value_counts()[0] )
weightings_0 #weight of observations classified as 0

In [None]:
weightings_1 = len(Y_train) / ( len( Y_train.unique() ) * Y_train.value_counts()[1] )
weightings_1 #weight of observations classified as 1

In [None]:
def fit_pipeline(pipeline, X_train_data, Y_train_data, X_test_data, param_grid, cv, scoring_grid, scoring_fit):
    """
    Exhaustively tune `pipeline` with a cross-validated grid search and predict on the test data.

    Every combination in `param_grid` is evaluated with the `cv` splitter and the scorers in
    `scoring_grid`; the best combination according to the scorer named `scoring_fit` is refit
    on the whole training data and used for the predictions.

    Returns a tuple (fitted search object, predictions for X_test_data).
    """
    search = GridSearchCV(
        pipeline,
        param_grid,
        scoring=scoring_grid,
        refit=scoring_fit,
        cv=cv,
        n_jobs=-1,   # use every available core
        verbose=2,
    )

    # fit() returns the search object itself, so it doubles as the fitted model.
    search.fit(X_train_data, Y_train_data)
    test_predictions = search.predict(X_test_data)

    return search, test_predictions

In [None]:
#Defining classifier parameters
# Hand-picked class-weight candidates, tried in addition to the 'balanced' heuristic.
weighting = [{0: 0.50, 1: 981}, {0: 0.05, 1: 981}, {0: 0.005, 1: 981}, {0: 0.005, 1:9810}, {0:0.005,1:98100}]

param_grid = dict( 
    model__penalty = ['l1','l2'], 
    # the heuristic is spelled 'balanced'; 'balance' is not an accepted class_weight value
    model__class_weight = ['balanced'] + weighting,
    #model__C = [1,10,100,1000], #first attempt
    #model__C = [0.0001, 0.001, 0.01, 1], #second attempt
    model__C = [1, 1.05, 1.1 ], #third attempt
    model__random_state = [rand_st]
    ) 


#Defining classifier function
# liblinear supports both the l1 and l2 penalties searched above; the default lbfgs solver
# rejects l1, which made every l1 candidate fail during the grid search.
lr = LogisticRegression(solver='liblinear')

#Defining Pipeline
#Note to self: Whenever using the pipeline, you will need to send the parameters in a way so that pipeline can understand which parameter is for which of the step in the list. For that it uses the name you provided during Pipeline initialisation. 
#https://stackoverflow.com/questions/58815016/cross-validating-with-imblearn-pipeline-and-gridsearchcv
steps = [('over', oversample), ('under', undersample), ('model', lr)]
pipeline =  Pipeline(steps=steps)

In [None]:
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=rand_st)
# define grid search
#scoring = {'AUC': 'roc_auc', 'Precision': make_scorer(precision_score), 'F1': make_scorer(f1_score), 'Recall': make_scorer(recall_score)}
scoring = { 'F1': make_scorer(f1_score)}

Choosing the F1 score here, since it is a better metric than accuracy when the data is imbalanced, which is the case.  
Some references:  
https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/  
https://en.wikipedia.org/wiki/F1_score  
https://datascience.stackexchange.com/questions/65341/f1-score-vs-accuracy-which-metric-is-more-important  



In [None]:
fitted_model, pred = fit_pipeline(pipeline, X_train, Y_train, X_test, param_grid, cv, scoring, 'F1')

### Logistic Regression | Performance

In [None]:
print('Best Penalty:', fitted_model.best_estimator_.get_params()['model__penalty'])
print('Best C:', fitted_model.best_estimator_.get_params()['model__C'])
print('Best Class_weight:', fitted_model.best_estimator_.get_params()['model__class_weight'])
print('Best F1:', fitted_model.best_score_.round(2))


In [None]:
#Pred = lr_optimized.predict(X_test)

class_report = classification_report(Y_test, pred, output_dict=True)

cr_df = round( pd.DataFrame(class_report).transpose(), 2)

cr_df

In [None]:
# Confusion Matrix (predictions on the held-out test set)

cm = metrics.confusion_matrix(Y_test, pred)

fig, ax = plt.subplots(figsize=(8,6))

sns.heatmap(cm, annot=True, fmt=",.0f", linewidths=.5, square = True, ax=ax)

ax.set_ylabel('Actual label')
ax.set_ylim([0,2])

ax.set_xlabel('Predicted label')

# The matrix is built from test-set predictions, so the title reports the test-set F1 and labels
# the cross-validation score explicitly instead of showing it as an unqualified "F1".
all_sample_title = 'Test F1: {:.2f} | CV F1: {:.2f}'.format(f1_score(Y_test, pred), fitted_model.best_score_)
ax.set_title(all_sample_title, size = 15);

## XGBoost

https://en.wikipedia.org/wiki/XGBoost  
https://xgboost.readthedocs.io/en/latest/python/python_intro.html#data-interface

Configuring parameters  
https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
#Defining classifier parameters
#brute force scan for all parameters, here are the tricks: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
param_grid = dict( 
    # single source of randomness: `seed` is the deprecated alias of `random_state`, and setting
    # both (123 vs 1337) made it unclear which value the booster actually used
    model__random_state = [rand_st],
    model__nthread = [4], #when use hyperthread, xgboost may become slower
    model__objective = ['binary:logistic'],
    #model__learning_rate = [0.049, 0.050, 0.051], #so called `eta` value
    model__learning_rate = [0.051], 
    #model__max_depth = [6, 7, 8],
    model__min_child_weight= [11],
    #model__silent=[1],
    #model__subsample=[0.7, 0.8, 0.9],
    #model__colsample_bytree=[0.7, 0.8] ,
    #model__n_estimators=[400, 700, 1000], #number of trees, change it to 1000 for better results
    model__n_estimators=[1000], 
    #model__missing=[-999],
    ) 


#Defining classifier function
xgb_model = XGBClassifier()

#Defining Pipeline
# Same resampling steps as for the logistic regression, with XGBoost as the final estimator.
steps = [('over', oversample), ('under', undersample), ('model', xgb_model)]
pipeline =  Pipeline(steps=steps)

In [None]:
fitted_xgb_model, pred_xgb = fit_pipeline(pipeline, X_train, Y_train, X_test, param_grid, cv, scoring, 'F1')

### XGB | Performance

First attempt (xgboost configured for maximizing roc_auc)  
   
Best learning rate:  0.051  
Best max depth:  7  
Best nr. estimators:  1000  
Best roc_auc:  0.78  

In [None]:
print('Best learning rate:', fitted_xgb_model.best_estimator_.get_params()['model__learning_rate'])
print('Best max depth:', fitted_xgb_model.best_estimator_.get_params()['model__max_depth'])
print('Best nr. estimators:', fitted_xgb_model.best_estimator_.get_params()['model__n_estimators'])
print('Best F1:', fitted_xgb_model.best_score_.round(2))

In [None]:
class_report = classification_report(Y_test, pred_xgb, output_dict=True)

cr_df = round( pd.DataFrame(class_report).transpose(), 2)

cr_df

In [None]:
# Confusion Matrix (predictions on the held-out test set)

cm = metrics.confusion_matrix(Y_test, pred_xgb)

fig, ax = plt.subplots(figsize=(8,6))

sns.heatmap(cm, annot=True, fmt=",.0f", linewidths=.5, square = True, ax=ax)

ax.set_ylabel('Actual label')
ax.set_ylim([0,2])

ax.set_xlabel('Predicted label')

# The matrix is built from test-set predictions, so the title reports the test-set F1 and labels
# the cross-validation score explicitly instead of showing it as an unqualified "F1".
all_sample_title = 'Test F1: {:.2f} | CV F1: {:.2f}'.format(f1_score(Y_test, pred_xgb), fitted_xgb_model.best_score_)
ax.set_title(all_sample_title, size = 15);