# Packages Imports

In [None]:
# Detailed report can be found at https://drive.google.com/file/d/154TRWVyGQ152F3_K1kq7wfIe7jcIgrEN/view?usp=sharing 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from scipy.stats import normaltest
from scipy.stats import norm
from scipy.stats import kendalltau
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
from sklearn.metrics import auc
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB

import keras
from keras.models import Sequential
from keras.layers import Dense

# Data Loading

In [None]:
# Read the application records; the ID column becomes the row index.
application_csv = 'application_record.csv'
appl_df = pd.read_csv(application_csv, index_col='ID')
appl_df.head()

In [None]:
# Read the credit records and index them by ID while reading,
# the same way the application records are loaded.
cred_df = pd.read_csv('credit_record.csv', index_col='ID')

In [None]:
appl_df.head().T

In [None]:
cred_df.head().T

In [None]:
print( appl_df.shape )
print('')
appl_df.info()

In [None]:
print( cred_df.shape )
print('')
cred_df.info()

# Data Cleaning

## Removing duplicated data
In appl_df, although the customer IDs are different, some distinct IDs carry exactly the same information.  
An example of this is the pair of customers with IDs 5008804 and 5008805:

In [None]:
appl_df.loc[[5008804, 5008805]].T

Using drop_duplicates is not possible in this case, since an ID that is duplicated in the application dataset might not be duplicated in the credit records dataset.  
So we need to work with the IDs that are common to both dataframes.


In [None]:
# IDs that appear in both the application data and the credit records
common_ids = set(appl_df.index) & set(cred_df.index)
valid_indexes = list(common_ids)
len(valid_indexes)

Filtering both dataframes by common indexes

In [None]:
# Keep only the rows whose ID is present in both dataframes.
# NOTE: appl_df and cred_df are rebound here, so the unfiltered data is no
# longer reachable under these names once this cell has run.

appl_df = appl_df.loc[valid_indexes]
cred_df = cred_df.loc[valid_indexes]

In [None]:
print(appl_df.shape)
print('')
appl_df.head().T

Adding a unique customer ID in appl_df

In [None]:
# Sorted copy of the applications (identical applications end up next to each other).
# This is a readability/safety step only: the customer key below does not depend on row order.
appl_df_clean = appl_df.sort_values(by=appl_df.columns.to_list())

# One customer key per application row, hashed over ALL column values of the row.
# The previous key, hash(row.sum()), only reflected the numeric columns (and fails on
# pandas >= 2.0, where text columns can no longer be silently skipped in a row sum), so two
# different customers whose numeric fields added up to the same total shared one key.
# hash_pandas_object is deterministic across sessions and hashes missing values consistently.
row_key = pd.util.hash_pandas_object(appl_df, index=False)

# ID -> customer_id lookup table; used afterwards to tag the credit records
grouped_cust = row_key.rename('customer_id').reset_index()

# How many application IDs share each customer key (largest groups first)
id_counts_df = (
    grouped_cust.groupby('customer_id').size()
    .sort_values(ascending=False)
    .to_frame('id_count')
)

# Assignment aligns on the ID index, so the different row order of appl_df_clean is irrelevant
appl_df_clean['cust_id'] = row_key

Adding the same unique customer ID to cred_df

In [None]:
# Index the ID -> customer_id lookup by ID. Guarded so that re-running this cell does not
# raise a KeyError once 'ID' has already been moved into the index.
if 'ID' in grouped_cust.columns:
    grouped_cust = grouped_cust.set_index('ID')

# Tag every credit record with the customer key of its application ID
cred_df_trsf = (
    cred_df
    .merge(grouped_cust, how='inner', on='ID')
    .reset_index()
    [['customer_id', 'ID', 'MONTHS_BALANCE', 'STATUS']]
)

# Order the records per customer and application ID, MONTHS_BALANCE descending
cred_df_g = (
    cred_df_trsf
    .sort_values(by=['customer_id', 'ID', 'MONTHS_BALANCE'], ascending=[True, True, False])
    .reset_index(drop=True)
)

# Running number of each (customer, application ID) pair. The numbering is global over the whole
# table instead of restarting at 1 for every customer; that is good enough because only the
# number of distinct values per customer is used afterwards.
cred_df_g['interaction_ID'] = cred_df_g.groupby(['customer_id', 'ID'], sort=False).ngroup().add(1)

# Keep the columns needed downstream (the raw application ID is dropped).
# .copy() makes this an independent frame so that adding columns to it later does not warn.
cred_df_g = cred_df_g[['customer_id', 'interaction_ID', 'MONTHS_BALANCE', 'STATUS']].copy()
cred_df_g.head()

Transforming cred_df in order to return a list of customers labeled by their behaviour type.  
This will give us our Y label.

In [None]:
# A month is 'b' (bad) when STATUS is one of 2, 3, 4, 5; every other status counts as 'g' (good)
cred_df_g['month_behav'] = np.where(cred_df_g.STATUS.isin(['2', '3', '4', '5']), 'b', 'g')

# Share (in %, rounded to 2 decimals) of each customer's months per behaviour class
months_per_class = cred_df_g.groupby(['customer_id', 'month_behav']).size()
months_per_cust = cred_df_g.groupby(['customer_id']).size()
cust_beh = (
    pd.DataFrame(round(months_per_class / months_per_cust * 100, 2), columns=['behav_kpi'])
    .reset_index()
    .set_index('customer_id')
)
print(cust_beh.head())

# One value per customer: the % of good months, 0 for customers that have no good month at all.
# Working on this unique-index Series replaces the previous row masks, which combined a per-row
# Series (duplicate customer_id labels) with a per-customer Series and then wrote into the
# filtered slices (SettingWithCopyWarning).
good_share = (
    cust_beh.loc[cust_beh.month_behav == 'g', 'behav_kpi']
    .reindex(months_per_cust.index, fill_value=0)
)
is_good = good_share > 95

# Same rule as before: 'good' needs more than 95% good months, everybody else is 'bad'
bad_cust = pd.DataFrame({'customer_type': 'bad'}, index=good_share.index[~is_good])
good_cust = pd.DataFrame({'customer_type': 'good'}, index=good_share.index[is_good])

# Label table (bad customers first, then good) plus two credit-history aggregates.
# Both assignments align on the customer_id index.
cred_df_clean = pd.concat([bad_cust, good_cust])
cred_df_clean['months_in_book'] = months_per_cust
cred_df_clean['contracts_nr'] = cred_df_g.groupby(['customer_id'])['interaction_ID'].nunique()

## Checking and Cleaning Missing Data

In [None]:
# Checking how many values are missing in credit records dataset
cred_df_clean.isnull().sum()

In [None]:
# Checking how many values are missing in application dataset
appl_df_clean.isnull().sum()

This means we only need to clean missing data in the application dataset, in the OCCUPATION_TYPE column.  

In [None]:
appl_df_clean.OCCUPATION_TYPE.unique()

replace the missing values by 'Not Available'.

In [None]:
appl_df_clean['OCCUPATION_TYPE'] = appl_df_clean['OCCUPATION_TYPE'].fillna('Not Available')

## Reclassifying some Flags
In application dataset there are 2 flag columns that have Y/N labels, while other flags in this dataset are binary variables.   
For congruence, let's pass those Y/N labels into 1 and 0's, respectively. 

In [None]:
# Y/N -> 1/0 so these two flags look like the other, already binary, FLAG_ columns
dic = {'Y': 1, 'N': 0}

for yn_flag in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    appl_df_clean[yn_flag] = appl_df_clean[yn_flag].replace(dic)

appl_df_clean.head()

In [None]:
# Split the application columns into feature groups by their name prefix
appl_cols = list(appl_df_clean.columns)
flag_cols = [c for c in appl_cols if c.startswith('FLAG_')]
cat_cols = [c for c in appl_cols if c.startswith(('CODE_', 'NAME_', 'OCCUPATION_'))]
# CNT_ columns first, then AMT_/DAYS_ columns, each in their original column order
num_cols = [c for c in appl_cols if c.startswith('CNT_')] + [c for c in appl_cols if c.startswith(('AMT_', 'DAYS_'))]

# Sanity check: every column must belong to exactly one group.
# The +1 accounts for cust_id, which fits no group because it is only a helper column
# used to join with the credit data later on.
# An assert is used so that a mismatch stops the run instead of just displaying False.
n_grouped = len(flag_cols) + len(cat_cols) + len(num_cols) + 1
assert n_grouped == len(appl_cols), (
    'column groups cover {} columns, but appl_df_clean has {}'.format(n_grouped, len(appl_cols))
)

In [None]:
appl_df_clean = appl_df_clean[flag_cols + cat_cols + num_cols + ['cust_id']]

appl_df_clean.head().T

## Joining application records dataset with customer labels dataset

In [None]:
# Move customer_id from the index into a column for the join. Guarded so that re-running the
# cell does not reset the index a second time (that would add a stray 'index' column to df).
if 'customer_id' not in cred_df_clean.columns:
    cred_df_clean = cred_df_clean.reset_index()

# Attach the customer label and credit-history aggregates to every application of that customer.
# Joining on column names (instead of passing Series as keys) avoids the synthetic 'key_0'
# column that then had to be dropped again.
df = (
    appl_df_clean.reset_index()
    .merge(cred_df_clean, left_on='cust_id', right_on='customer_id', how='inner')
    .drop(columns=['cust_id', 'customer_id'])
    .set_index('ID')
)

df.head()

# Descriptive Analytics

Now that we have our datasets cleaned we can prepare some descriptive analytics about them

## Applications Dataset

### Flag features distributions

In [None]:
# One horizontal bar chart per flag column, side by side
fig, axes = plt.subplots(ncols=len(flag_cols), figsize=(20, 5))
for col, ax in zip(flag_cols, axes):
    flag_counts = df[col].value_counts().sort_values()
    flag_counts.plot.barh(ax=ax, title=col + ' histogram')

plt.tight_layout()
plt.show()

From the above chart sequence we can immediately see that all the customers recorded in the application dataset have a mobile phone.  
This means that the feature is irrelevant for default modeling, since it cannot help to tell customers apart.  

Another fact to note is that in every flag feature, except FLAG_MOBIL, there is a notable difference between the number of customers who have the characteristic and the number who don't.  

In [None]:
# Remove FLAG_MOBIL from df, since it holds no value for this study,
# and from flag_cols as well so the feature lists stay in sync with df.
# Written without in-place mutation so the cell can be re-run without KeyError / ValueError.
df = df.drop(columns=['FLAG_MOBIL'], errors='ignore')
flag_cols = [c for c in flag_cols if c != 'FLAG_MOBIL']

df.head()

### Categorical Features Distributions

In [None]:
# Category counts per categorical feature. Only customers labelled 'bad' are plotted,
# so the titles say so explicitly (they previously read as if all customers were shown).
bad_only = df[df['customer_type'] == "bad"]

fig, axes = plt.subplots(nrows=len(cat_cols), figsize=(10, 20))
for col, ax in zip(cat_cols, axes):
    bad_only[col].value_counts().sort_values().plot.barh(ax=ax, title=col + ' histogram (bad customers only)')

plt.tight_layout()
plt.show()

From the above charts we can immediately notice that there are notable differences in all categorical features.  


### Numeric Features Distributions

In [None]:
# Add the two credit-history aggregates to the numeric feature list.
# Only names that are still missing are appended, so re-running the cell does not create duplicates.
num_cols = num_cols + [c for c in ['months_in_book', 'contracts_nr'] if c not in num_cols]

In [None]:
# Histogram of every numeric feature with a fitted normal curve on top.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; kept because it is the
# call that supports fit=norm directly.
fig, axes = plt.subplots(nrows=len(num_cols), figsize=(10, 20))

for col, ax in zip(num_cols, axes):
    sns.distplot(df[col], ax=ax, kde=False, fit=norm)

plt.tight_layout()
# plt.show() already suppresses the repr output; the stand-alone ';' line that followed
# was removed because a line containing only ';' is a Python SyntaxError.
plt.show()

Analysing the numeric feature distribution plots, we can immediately see that all of these features are skewed towards the lower values, except DAYS_BIRTH.  
  
This says a lot about the majority of the profiles:  
Most customers:   
- have no or few children,  
- have small families,  
- have low incomes,  
- are about 43 years old,  
- have not been employed for long, although there is a significant proportion who are unemployed (those positive day counts are way too high!) 


From here we can also see that we, probably, will have serious problems with outliers lying in the data.  

### Numeric features | Outliers detection

In [None]:
desc_num = round( df[num_cols].describe(), 0)
desc_num    

  
365,243 days unemployed translates into about 1,001 years unemployed! (According to the data dictionary, positive values for this feature mean the person is unemployed.)  
This is impossible.  
  
Actually, if we filter the data by positive values in DAYS_EMPLOYED and check the minimum value of the resulting DAYS_EMPLOYED sample, we get 365243 days,  
which means that all of the positive values are, most definitely, wrongly registered.  


In [None]:
df[df.DAYS_EMPLOYED > 0 ]['DAYS_EMPLOYED'].min()

In [None]:
appl_df[appl_df.DAYS_EMPLOYED > 0 ]['DAYS_EMPLOYED'].min()

 This leave us with no idea for how long people are unemployed 

checking who are the customers who have that much time unemployed...

In [None]:
# Customers with a positive DAYS_EMPLOYED value.
# The mask is built from df itself: the previous mask came from appl_df_clean, a different
# dataframe whose rows are not guaranteed to match df's rows one-to-one.
appl_df_unem = df[df.DAYS_EMPLOYED > 0]
appl_df_unem.head().T

Seems like they are mostly pensionists...  
Let's confirm that fact:

In [None]:
print( appl_df_unem['NAME_INCOME_TYPE'].unique() )
print('')
print( appl_df_unem['OCCUPATION_TYPE'].unique() )

In [None]:
print( appl_df[appl_df.DAYS_EMPLOYED == 365243]['NAME_INCOME_TYPE'].unique() )
print('')
print(appl_df[appl_df.DAYS_EMPLOYED == 365243]['OCCUPATION_TYPE'].unique())

So... this leads us to conclude that whoever built this dataset used the 365,243 days figure to register pensioners who have no occupation.  
Our model will need to understand that these people are neither employed nor unemployed, because they are already pensioners.

In [None]:
df[df['customer_type']=='bad'].describe()

In [None]:
df[df['customer_type']=='good'].describe()

Let's draw some box-and-whisker plots in order to see the outliers in our data samples better.

In [None]:
# One horizontal box plot per numeric feature.
# The values are passed as x= so the boxes are horizontal in every seaborn version; the previous
# combination y=... with orient='h' is contradictory and newer seaborn ignores the orient there.
fig, axes = plt.subplots(nrows=len(num_cols), figsize=(20, 40))

for col, ax in zip(num_cols, axes):
    sns.boxplot(x=df[col], ax=ax)

plt.tight_layout()
plt.show()

Results:  
there are a lot of outliers in most of the numeric features of our dataset.  



### Testing the normality of the DAYS_BIRTH feature, to decide whether Pearson correlation can be used (that is, whether the bell shape seen in the plot is statistically significant)  

In [None]:
# Normality test on DAYS_BIRTH (H0: the sample comes from a normal distribution)
stat, p = normaltest(df['DAYS_BIRTH'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level
alpha = 0.05
verdict = (
    'Sample looks Gaussian (fail to reject H0)'
    if p > alpha
    else 'Sample does not look Gaussian (reject H0)'
)
print(verdict)

Safe to conclude that DAYS_BIRTH feature is not coming from a purely Gaussian/Normal probability distribution. 

### Numeric Feature correlations

In [None]:
# Kendall rank correlation between the numeric features
# (rank-based, so it does not assume normally distributed data)
appl_corr = round(df[num_cols].corr('kendall'), 2)

plt.figure(figsize=(10, 8))

# Boolean mask that hides the upper triangle including the diagonal
mask = np.zeros_like(appl_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

ax = sns.heatmap(
    appl_corr,
    annot=True,
    square=True,
    mask=mask,
    xticklabels=True,
    yticklabels=True
)

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# Explicit y-limits so the first and last rows are not cut in half. The upper limit follows the
# number of features instead of the hard-coded 7, so it stays right if num_cols changes.
ax.set_ylim([0, len(appl_corr)])
# The stand-alone ';' line that followed plt.show() was removed: on its own line it is a SyntaxError.
plt.show()

The following attributes have a strong relationship: 
* CNT_FAM_MEMBERS and CNT_CHILDREN   

   

In [None]:
plt.scatter(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS']);

We'll drop CNT_CHILDREN (and keep CNT_FAM_MEMBERS) given the high correlation between the two. contracts_nr is dropped in the same step.

In [None]:
# Remove CNT_CHILDREN (highly correlated with CNT_FAM_MEMBERS) and contracts_nr from both the
# numeric feature list and the dataframe.
# Written without list.remove / inplace so that re-running the cell does not raise.
dropped_num_cols = ['CNT_CHILDREN', 'contracts_nr']
num_cols = [c for c in num_cols if c not in dropped_num_cols]
df = df.drop(columns=dropped_num_cols, errors='ignore')
df.head()

## Credit Records Data Set

In [None]:
cred_df['STATUS'].value_counts().sort_values().plot.barh(title= 'STATUS histogram')

plt.tight_layout()    
plt.show()

From the plot we can see that most customers:
1. have loans to pay in a given month, and
2. pay them on time, or at most 59 days overdue.  

However, there is a small portion of them that either:  
1. have bad debts, or
2. pay between 60 and 149 days overdue.  

Those last ones can be classified as the 'bad' customers, because, from the bank's point of view, not receiving the amounts due for more than 60 days has a very negative impact on the bank.

Below dataframe give us the customers labeled as 'good' or 'bad' based on classification done previously

In [None]:
cred_df_clean.head()

Percentage of bad clients in the data:

In [None]:
# Share of customers labelled 'bad', in percent
n_customers = cred_df_clean.shape[0]
n_bad = len(cred_df_clean[cred_df_clean.customer_type == 'bad'])
percent_bad_customers = 100 * n_bad / n_customers

print("{0:.3f}%".format(percent_bad_customers))

In [None]:
# Example of a "bad" customer.
# NOTE: despite its name, customer_id holds an application ID (the dataframe index),
# not the hashed customer key.
customer_id = 5142361
print(appl_df_clean.loc[customer_id])
print('')
# .loc[[...]] always returns a DataFrame, so sort_values(by=...) also works when the ID
# has only a single credit record (a scalar .loc would return a Series in that case).
print(cred_df.loc[[customer_id]].sort_values(by=['MONTHS_BALANCE'], ascending=False))

## Shortening Categorical Feature Classes

In [None]:
cat_cols

In [None]:
# Variable list to transform
cat_col_t = [ col for col in cat_cols if 'GENDER' not in col ] # we exclude gender as it only contains 2 categories
cat_col_t

### NAME_INCOME_TYPE

In [None]:
df['NAME_INCOME_TYPE'].value_counts()

Let's reduce this variable to 'Working', 'Pensioner' and 'Student', by merging 'State servant' and 'Commercial associate' into the 'Working' category. 

In [None]:
# Merge 'Commercial associate' and 'State servant' into the existing 'Working' class
dic = dict.fromkeys(['Commercial associate', 'State servant'], 'Working')
df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].replace(dic)

### NAME_EDUCATION_TYPE

In [None]:
df['NAME_EDUCATION_TYPE'].value_counts()

Let's reduce this variable to 'Secondary / secondary special', 'Higher education' and 'Basic' (the new name for 'Lower secondary').

In [None]:
# Old education label -> new label; labels not listed here are left unchanged
education_recode = [
    ('Incomplete higher', 'Secondary / secondary special'),
    ('Academic degree', 'Higher education'),
    ('Lower secondary', 'Basic'),
]
dic = dict(education_recode)
df['NAME_EDUCATION_TYPE'] = df['NAME_EDUCATION_TYPE'].replace(dic)

### NAME_FAMILY_STATUS

In [None]:
df['NAME_FAMILY_STATUS'].value_counts()

In [None]:
# Count civil marriages as 'Married'; every other family status is left unchanged
dic = {
    'Civil marriage' : 'Married'
}

df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].replace(dic)

### NAME_HOUSING_TYPE

In [None]:
df['NAME_HOUSING_TYPE'].value_counts()

In [None]:
# Collapse the housing types into fewer classes; labels not listed here are left unchanged.
# NOTE(review): 'House / apartment' is folded into 'Rented apartment' here. If that label
# already exists in the data, these customers become indistinguishable from actual
# renters - confirm this merge is intended.
dic = {
    'House / apartment' : 'Rented apartment',
    'Co-op apartment' : 'Rented apartment',
    'Municipal apartment': 'Municipal or Office apartment',
    'Office apartment': 'Municipal or Office apartment'
}

df['NAME_HOUSING_TYPE'] = df['NAME_HOUSING_TYPE'].replace(dic)

### OCCUPATION_TYPE

In [None]:
df['OCCUPATION_TYPE'].value_counts()

Grouping the occupation types will be tough without a grouping criterion.  
Let's try to group them by income.  

In [None]:

# Mean income per occupation, highest first; only used to order the boxes on the x axis
data = (
    df.groupby(['OCCUPATION_TYPE'])['AMT_INCOME_TOTAL'].mean()
    .reset_index()
    .sort_values(by=['AMT_INCOME_TOTAL'], ascending=False)
)

plt.figure(figsize=(20, 5))
ax = sns.boxplot(
    data=df.sort_values(by='AMT_INCOME_TOTAL', ascending=False),
    x='OCCUPATION_TYPE',
    y='AMT_INCOME_TOTAL',
    order=data['OCCUPATION_TYPE'],
    linewidth=1
)

# The trailing ';' on this line hides the tick-label repr. It must sit on the same line as the
# statement: the previous ';' on a line of its own is a Python SyntaxError.
ax.set_xticklabels(ax.get_xticklabels(), rotation=85);

In [None]:
# Occupations bucketed into four groups; written group-first so each group's members
# can be read at a glance
occupation_groups = {
    'Group 1': ['Managers', 'Realty agents', 'Drivers', 'Accountants'],
    'Group 2': ['IT staff', 'Private service staff', 'High skill tech staff', 'HR staff', 'Core staff'],
    'Group 3': ['Laborers', 'Security staff', 'Sales staff', 'Not Available', 'Secretaries'],
    'Group 4': ['Medicine staff', 'Waiters/barmen staff', 'Cleaning staff', 'Cooking staff', 'Low-skill Laborers'],
}

# Invert to the occupation -> group mapping expected by replace()
dic = {
    occupation: group
    for group, members in occupation_groups.items()
    for occupation in members
}

df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].replace(dic)

In [None]:
# Mean income per occupation group (rounded to 1 decimal), highest first, to order the boxes
data = (
    df.groupby(['OCCUPATION_TYPE'])['AMT_INCOME_TOTAL'].mean()
    .to_frame()
    .reset_index()
    .sort_values(by=['AMT_INCOME_TOTAL'], ascending=False)
    .round(1)
)

plt.figure(figsize=(8, 6))
income_sorted = df.sort_values(by='AMT_INCOME_TOTAL', ascending=False)
ax = sns.boxplot(
    data=income_sorted,
    x='OCCUPATION_TYPE',
    y='AMT_INCOME_TOTAL',
    order=data['OCCUPATION_TYPE'],
    linewidth=1
)

ax.set_xticklabels(ax.get_xticklabels(), rotation=85);

# Data Pre Processing

In this section we are going to pre-process the dataset to feed the model.  
We will handle the features by their type: flag (or binary), categorical and numeric. Ideally, numeric features should not be scaled/normalized before the train/test split operation; note that in the cells below the scaler is fitted on the full dataset, before the split.

## Encoding Categorical Features

In [None]:
df_cat = df[cat_cols]
df_dumm = pd.get_dummies(df_cat,  prefix_sep='==')

df_dumm.head()

Dropping categorical columns from df dataframe, and joining the "dummy" versions

In [None]:
# Remove the raw categorical columns (their one-hot versions live in df_dumm).
# Non-mutating and tolerant of already-removed columns, so the cell can be re-run.
df = df.drop(columns=cat_cols, errors='ignore')

In [None]:
df_dumm.head()

In [None]:
# Put the one-hot columns in front of the remaining features.
# Any dummy columns already present in df are dropped first, so re-running the cell
# does not produce duplicated column names.
df = pd.concat([df_dumm, df.drop(columns=df_dumm.columns, errors='ignore')], axis=1)
df.head()

## Setting Flag Features to same data type

In [None]:
# Store every flag column as an unsigned 8-bit integer
flags_as_uint8 = df[flag_cols].astype('uint8')
df[flag_cols] = flags_as_uint8

# Summary of the numeric features before scaling
df[num_cols].describe()

## Normalisation

In [None]:
# Scale every numeric feature except DAYS_EMPLOYED with a robust scaler (scaling only, no centering).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the train/test split that
# follows; fitting it on the training rows only would avoid leaking test-set statistics.
df_num = df[num_cols].drop(columns='DAYS_EMPLOYED')

trans = RobustScaler(with_centering=False, with_scaling=True)
df_num_s = trans.fit_transform(df_num)
df_num = pd.DataFrame(df_num_s, columns=df_num.columns, index=df.index)

# DAYS_EMPLOYED gets its own rule: values below 50000 are divided by 10^4,
# anything else (the very large positive placeholder values) is set to 1.
days_employed = df['DAYS_EMPLOYED']
employement = np.where(days_employed < 50000, days_employed / pow(10, 4), 1)

# Swap the raw numeric columns for their scaled versions (scaled columns go last)
df = pd.concat(
    [
        df.drop(columns=num_cols),
        df_num,
        pd.DataFrame(employement, columns=["DAYS_EMPLOYED"], index=df.index),
    ],
    axis=1,
)
df[num_cols].describe()

## Dealing with class imbalance using SMOTE

### First Oversampling then train test split or otherwise?

In [None]:
print(df['customer_type'].value_counts())

In [None]:
# Generate the target class: 1 = 'bad' customer, 0 = anything else
Y = pd.Series([1 if z == 'bad' else 0 for z in df['customer_type']], index=df.index).astype('int')
X = df.drop('customer_type', axis=1)

rand_st = 123
test_size = 0.3

# Stratified split so train and test keep the same bad/good ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, stratify=Y, random_state=rand_st
)
print(Y_train.value_counts())

# Oversample the minority class on the TRAINING data only (the test set stays untouched).
# fit_resample is the current imbalanced-learn API; the old alias fit_sample no longer exists in
# recent releases. SMOTE is seeded so the synthetic samples are the same on every run.
X_train, Y_train = SMOTE(random_state=rand_st).fit_resample(X_train, Y_train)
# Re-wrap so both objects are pandas objects even if the installed version returns numpy arrays
X_train = pd.DataFrame(X_train, columns=X.columns)
Y_train = pd.Series(Y_train)

print(Y_train.value_counts())

In [None]:
X_train.describe()

# Modelling and results

#### Our data set is highly imbalanced. Hence, accuracy is not the correct performance measure for our model.
#### Precision or recall measures are used depending upon the situation of the bank
#### There is usually a trade-off between precision and recall
#### A bank having a plethora of customers will prefer a higher recall model, while a bank with fewer customers will go for a higher precision model
#### F1 score, which is the harmonic mean of precision and recall, is used for overall model quality assessment
### Below models are listed by increasing F1 score

## Naive Bayes

In [None]:
model = GaussianNB()
model.fit(X_train,Y_train)

In [None]:
# Predict on both sets; comparing train and test accuracy shows how much the model over-fits
Y_predict = model.predict(X_test)
train_predict = model.predict(X_train)
# The two lines used to print the identical label, so test and train could not be told apart
print('Test accuracy score is {:.5}'.format(accuracy_score(Y_test, Y_predict)))
print('Train accuracy score is {:.5}'.format(accuracy_score(Y_train, train_predict)))

In [None]:
cm = confusion_matrix(Y_test, Y_predict)
sns.heatmap(cm,annot=True,fmt='g',cmap="icefire", linewidths=1, linecolor='black')

In [None]:
print(classification_report(Y_test, Y_predict))
print(classification_report(Y_train, train_predict))

In [None]:
# plotting ROC curve
Y_score = model.predict_proba(X_test)[:,1]
false_positive_rate_nbc, true_positive_rate_nbc, threshold = roc_curve(Y_test, Y_score)
print('roc_auc_score for Naive-bayes: ', roc_auc_score(Y_test, Y_score))


Reasons for poor performance:
1. Class-conditional independence is not a good assumption in this case
2. The prediction is best made with if/else rules rather than by simply treating the features independently
#### This model's prediction is different from the other models, as here it classifies most customers as bad. This is probably because the bad-customer class-conditional probability is high for a few features, e.g.
##### $P(\text{count\_children}=1 \mid \text{customer}=\text{bad}) \gg P(\text{count\_children}=1 \mid \text{customer}=\text{good})$

# Regression Model

In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, Y_train)

In [None]:
Y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))
train_pred = logreg.predict(X_train)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(X_train, Y_train)))

In [None]:
# Confusion matrix of the logistic regression on the test set (rows: actual, columns: predicted).
# The heatmap is no longer wrapped in print(): printing it only emitted the Axes object's repr.
cm = confusion_matrix(Y_test, Y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap="icefire", linewidths=1, linecolor='black');

In [None]:
print(classification_report(Y_test, Y_pred))
print(classification_report(Y_train, train_pred))

In [None]:
# plotting ROC curve
Y_score = logreg.predict_proba(X_test)[:,1]
false_positive_rate_lr, true_positive_rate_lr, threshold = roc_curve(Y_test, Y_score)
print('roc_auc_score for Logistic Regression: ', roc_auc_score(Y_test, Y_score))


Not very good results

# Decision tree

In [None]:
# Search max_depth in 1..99 for the decision tree with the best F1 score.
# A candidate replaces the current best only if it beats it by more than f1_margin.
# NOTE(review): the depth is selected on X_test / Y_test, the same data that is later used to
# report performance; a separate validation split or cross-validation would avoid that.
f1_margin = 0.1
max_dep = 0       # best accepted F1 so far (name kept; it does not hold a depth)
tuned_depth = 1   # fallback so later cells still run if no candidate clears the margin
for i in range(1, 100):
    model = DecisionTreeClassifier(max_depth=i,
                                   min_samples_split=8,
                                   random_state=1024)
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    candidate_f1 = f1_score(Y_test, Y_predict)  # computed once per candidate
    if candidate_f1 > max_dep + f1_margin:
        max_dep = candidate_f1
        tuned_depth = i
print(max_dep, tuned_depth)

In [None]:
# With the depth fixed at tuned_depth, search min_samples_split in 2..99 for the best F1 score.
# A candidate replaces the current best only if it beats it by more than f1_margin.
# NOTE(review): selection is done on X_test / Y_test, the data later used to report performance.
f1_margin = 0.1
max_dep = 0       # best accepted F1 so far (name kept; it does not hold a depth)
tuned_split = 2   # fallback so later cells still run if no candidate clears the margin
for i in range(2, 100):
    model = DecisionTreeClassifier(max_depth=tuned_depth,
                                   min_samples_split=i,
                                   random_state=1024)
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    candidate_f1 = f1_score(Y_test, Y_predict)  # computed once per candidate
    if candidate_f1 > max_dep + f1_margin:
        max_dep = candidate_f1
        tuned_split = i
print(max_dep, tuned_split)

In [None]:
model = DecisionTreeClassifier(max_depth=tuned_depth,
                               min_samples_split=tuned_split,
                               random_state=1024)
model.fit(X_train, Y_train)


In [None]:
# Predict on both sets; comparing train and test accuracy shows how much the tree over-fits
Y_predict = model.predict(X_test)
train_predict = model.predict(X_train)
# The two lines used to print the identical label, so test and train could not be told apart
print('Test accuracy score is {:.5}'.format(accuracy_score(Y_test, Y_predict)))
print('Train accuracy score is {:.5}'.format(accuracy_score(Y_train, train_predict)))

In [None]:
# Confusion matrix of the decision tree on the test set (rows: actual, columns: predicted).
# The heatmap is no longer wrapped in print(): printing it only emitted the Axes object's repr.
cm = confusion_matrix(Y_test, Y_predict)
sns.heatmap(cm, annot=True, fmt='g', cmap="icefire", linewidths=1, linecolor='black');

In [None]:
print(classification_report(Y_test, Y_predict))
print(classification_report(Y_train, train_predict))

In [None]:
# plotting ROC curve
Y_score = model.predict_proba(X_test)[:,1]
false_positive_rate_dt, true_positive_rate_dt, threshold = roc_curve(Y_test, Y_score)
print('roc_auc_score for DecisionTree: ', roc_auc_score(Y_test, Y_score))



# Random Forest 

In [None]:
# Search min_samples_leaf in 1..9 for the random forest with the best F1 score
# (depth fixed at the value tuned for the single decision tree).
# A candidate replaces the current best only if it beats it by more than f1_margin.
# NOTE(review): selection is done on X_test / Y_test, the data later used to report performance.
f1_margin = 0.1
f1_max = 0
tuned_leaf = 1   # fallback so later cells still run if no candidate clears the margin
for i in range(1, 10):
    model = RandomForestClassifier(n_estimators=250,
                                   max_depth=tuned_depth,
                                   min_samples_leaf=i,
                                   random_state=rand_st  # seeded so the search is repeatable
                                   )
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    candidate_f1 = f1_score(Y_test, Y_predict)  # computed once per candidate
    if candidate_f1 > f1_max + f1_margin:
        f1_max = candidate_f1
        tuned_leaf = i
print(f1_max, tuned_leaf)

In [None]:
# Final random forest with the tuned depth and leaf size.
# random_state is set so that the reported metrics are the same on every run.
model = RandomForestClassifier(n_estimators=250,
                               max_depth=tuned_depth,
                               min_samples_leaf=tuned_leaf,
                               random_state=rand_st
                               )
model.fit(X_train, Y_train)


In [None]:
# Predict on both sets; comparing train and test accuracy shows how much the forest over-fits
Y_predict = model.predict(X_test)
Train_predict = model.predict(X_train)
# The two lines used to print the identical label, so test and train could not be told apart
print('Test accuracy score is {:.5}'.format(accuracy_score(Y_test, Y_predict)))
print('Train accuracy score is {:.5}'.format(accuracy_score(Y_train, Train_predict)))

In [None]:
cm = confusion_matrix(Y_test, Y_predict)
sns.heatmap(cm,annot=True,fmt='g',cmap="icefire", linewidths=1, linecolor='black')


In [None]:
print(classification_report(Y_test, Y_predict))
print(classification_report(Y_train, Train_predict))

In [None]:
# plotting ROC curve
Y_score = model.predict_proba(X_test)[:,1]
false_positive_rate_rf, true_positive_rate_rf, threshold = roc_curve(Y_test, Y_score)
print('roc_auc_score for Random Forest: ', roc_auc_score(Y_test, Y_score))


# Implementing ANN

In [None]:
# Fully connected network: three ReLU hidden layers (20, 14 and 10 units) and a sigmoid output.
# - The input width is read from the training data instead of the hard-coded 29, so the network
#   still matches if the feature set changes.
# - Layer sizes use integer division: 2*30/3 is the float 20.0, and Dense expects an integer
#   number of units.
n_features = X_train.shape[1]

classifier = Sequential()
# input layer and first hidden layer
classifier.add(Dense(units=2 * 30 // 3, kernel_initializer="uniform", activation='relu', input_dim=n_features))
# second hidden layer
classifier.add(Dense(units=2 * 21 // 3, kernel_initializer="uniform", activation='relu'))
# third hidden layer
classifier.add(Dense(units=2 * 15 // 3, kernel_initializer="uniform", activation='relu'))
# output layer: a single probability for the 'bad' class
classifier.add(Dense(units=1, kernel_initializer="uniform", activation='sigmoid'))

In [None]:
#compiling
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
classifier.fit(X_train, Y_train, batch_size = 100, epochs = 40)

In [None]:
# Turn the predicted probabilities into 0/1 labels with a 0.5 cut-off.
# Thresholding is vectorised and flattened to 1-D; the previous version looped over the rows
# of the 2-D prediction array in Python.
Y_predict = (classifier.predict(X_test) > 0.5).astype(int).ravel()
Y_predict_train = (classifier.predict(X_train) > 0.5).astype(int).ravel()

# The two lines used to print the identical label, so test and train could not be told apart
print('Test accuracy score is {:.5}'.format(accuracy_score(Y_test, Y_predict)))
print('Train accuracy score is {:.5}'.format(accuracy_score(Y_train, Y_predict_train)))

# Confusion matrix on the test set (rows: actual, columns: predicted)
cm = confusion_matrix(Y_test, Y_predict)
sns.heatmap(cm, annot=True, fmt='g', cmap="icefire", linewidths=1, linecolor='black')


In [None]:
print(classification_report(Y_test, Y_predict))
print(classification_report(Y_train, Y_predict_train))

In [None]:
# for plotting ROC curve
y_pred_keras = classifier.predict(X_test).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(Y_test, y_pred_keras)

# AdaBoost

In [None]:
# AdaBoost over decision trees of the tuned depth.
# NOTE: this rebinds `classifier`, which held the Keras network until this cell runs.
# random_state is set so that the reported metrics are the same on every run.
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=tuned_depth),
    n_estimators=200,
    random_state=rand_st
)
classifier.fit(X_train, Y_train)

In [None]:
# Predict on both sets; comparing train and test accuracy shows how much the ensemble over-fits
Y_predict = classifier.predict(X_test)
Y_predict_train = classifier.predict(X_train)
# The two lines used to print the identical label, so test and train could not be told apart
print('Test accuracy score is {:.5}'.format(accuracy_score(Y_test, Y_predict)))
print('Train accuracy score is {:.5}'.format(accuracy_score(Y_train, Y_predict_train)))

In [None]:
cm = confusion_matrix(Y_test, Y_predict)
sns.heatmap(cm,annot=True,fmt='g',cmap="icefire", linewidths=1, linecolor='black')


In [None]:
print(classification_report(Y_test, Y_predict))
print(classification_report(Y_train, Y_predict_train))

In [None]:
# plotting ROC curve
Y_score = classifier.predict_proba(X_test)[:,1]
false_positive_rate_ab, true_positive_rate_ab, threshold = roc_curve(Y_test, Y_score)
print('roc_auc_score for Adaboost: ', roc_auc_score(Y_test, Y_score))

# Comparing Models using ROC curves

In [None]:
# ROC curves of all models on one figure
plt.subplots(1, figsize=(10, 10))
plt.title('Receiver Operating Characteristic')

# (legend label, false positive rates, true positive rates), in plotting order
roc_curves = [
    ("Regression", false_positive_rate_lr, true_positive_rate_lr),
    ("Naive bayes", false_positive_rate_nbc, true_positive_rate_nbc),
    ("Decision Tree", false_positive_rate_dt, true_positive_rate_dt),
    ("Random Forest", false_positive_rate_rf, true_positive_rate_rf),
    ("ANN", fpr_keras, tpr_keras),
    ("AdaBoosting", false_positive_rate_ab, true_positive_rate_ab),
]
for curve_label, fpr, tpr in roc_curves:
    plt.plot(fpr, tpr, label=curve_label)

# Reference lines: the dashed diagonal, then the left and top edges in grey
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.show()

   # Thank you