In [None]:
#Importing the required modules

In [None]:
import pandas as pd
import pandas_profiling
import numpy as np
import missingno as mngo
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,mutual_info_classif,chi2,f_regression
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import KFold,cross_val_score,StratifiedKFold,RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
!pip install imblearn
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [None]:
## Importing the training and test dataset
## Indicating the dependent variable
## forming a cols list containing the continuous numeric variables
## This list will be used in future codes

In [None]:
import os
os.listdir('/kaggle/input/')

# Raw training applications; TARGET is the dependent variable.
data=pd.read_csv("../input/home-credit-default-risk/application_train.csv")
target_y=data['TARGET']

# Continuous numeric columns; later cells read (and re-assign) this list.
cols=[
    'EXT_SOURCE_1','EXT_SOURCE_2', 'EXT_SOURCE_3',
    'AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY',
    'DAYS_REGISTRATION','DAYS_EMPLOYED','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE',
]
data.shape

In [None]:
# Load the held-out applications. The original cell re-read
# application_train.csv here, so every "test" step further down was applied
# to a second copy of the training data instead of unseen rows.
data_test=pd.read_csv("../input/home-credit-default-risk/application_test.csv")
data_test.shape

In [None]:
##Creating a function for removing columns with more than 50 percent of missing values 
## Except EXT_SOURCE_1 which is an important variable for us

In [None]:
def missing_values(xdata, threshold=0.50, keep='EXT_SOURCE_1', plot=True):
    """Drop the columns of ``xdata`` that have too many missing values.

    Parameters
    ----------
    xdata : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.50
        Largest fraction (0-1) of missing rows a column may have and still be kept.
    keep : str, default 'EXT_SOURCE_1'
        Column that is always retained, whatever its missing fraction.
        It is placed last in the result.
    plot : bool, default True
        When true, draw the missing fraction of the 30 worst columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the retained columns, ordered by
        descending missing count, followed by ``keep``.
    """
    summary = (
        xdata.isna().sum()
        .rename('no. of missing values')
        .rename_axis('feature_name')
        .to_frame()
    )
    # Despite the column name this is a fraction in [0, 1], not a percentage.
    summary['percent of missing values'] = summary['no. of missing values'] / len(xdata)
    summary = summary.sort_values(by='no. of missing values', ascending=False).reset_index()

    if plot:
        plt.figure(figsize=(16,12))
        plt.plot(summary['feature_name'].head(30), summary['percent of missing values'].head(30))
        plt.xticks(rotation=90)
        plt.show()

    retained = summary[
        (summary['feature_name'] != keep)
        & (summary['percent of missing values'] <= threshold)
    ]
    selected = list(retained['feature_name'])
    selected.append(keep)

    # Explicit copy: callers add new columns to the result, which would
    # otherwise raise SettingWithCopyWarning.
    return xdata.loc[:, selected].copy()
    
    

In [None]:
## Applying the function on test and train datasets

In [None]:
train_data_removing_missing=missing_values(data)
train_data_removing_missing
test_data_removing_missing=missing_values(data_test)
test_data_removing_missing

In [None]:
##Shape of test and train datasets after applying the above function

In [None]:
print(test_data_removing_missing.shape)
print(train_data_removing_missing.shape)

In [None]:
## A function for plotting the distribution of every continuous numeric variable.
## The graphs show whether the variables are normally distributed and
## whether there are any outliers in the variables

In [None]:
def continuous_variable_distributions(xdata,plot_type):
    """Draw one 25-bin histogram per column of ``xdata``.

    Only ``plot_type == 'displot'`` produces plots; for any other value the
    columns are iterated but nothing is drawn.
    """
    draw_histograms = plot_type=='displot'
    for column_name in xdata:
        if not draw_histograms:
            continue
        sns.displot(data=xdata, x=column_name, bins=25)
        plt.xticks(rotation=45)

In [None]:
## We can see that some variables are not normally distributed and,
## particularly in the days employed variable, we can see an outlier
## value with a considerable frequency. We shall investigate later

In [None]:
continuous_variable_distributions(train_data_removing_missing[cols],'displot')

In [None]:
## The pandas profile report gives a detailed analysis
## of the variables denoting the missing values in each variable,
## the correlation and interaction between different variables
## along with a univariate plot of the variables
## We can see significant number of missing values in variables
## like EXT_SOURCE_1,EXT_SOURCE_2. We will treat them.

In [None]:
train_data_removing_missing[cols].profile_report()

In [None]:
## This plot using missingno module shows the relationship
## between the missing values of different variables. As we can
## see the white lines represent missing values and they are largely 
## present in ext_source_1 and ext_source3. This plot can help us 
## to see if the missing values in ext_source_1 and ext_source3 are 
## correlated. We have to see whether missing values are random or 
## their appearance can be explained or can not be explained.

In [None]:
# Restrict the training frame to the continuous columns and draw the
# missingno views of where values are absent.
train_data_numerical=train_data_removing_missing[cols]
mngo.matrix(train_data_numerical)
# NOTE(review): sorted1 is never read again in the cells shown here.
# Presumably it was meant to be plotted (rows ordered by EXT_SOURCE_1) — confirm.
sorted1=train_data_numerical.sort_values(by='EXT_SOURCE_1')
mngo.dendrogram(train_data_numerical)
mngo.heatmap(train_data_numerical)

In [None]:
##Creating two flag variables to record which rows 
## are missing in ext_source_1 and ext_source_3.
## Since we are about to impute the missing values, and these variables
## have a significant proportion of missing values, we 
## might want to save this information as a flag column and 
## see if it can be an important feature later.
    

In [None]:
# Flag rows whose EXT_SOURCE_1 / EXT_SOURCE_3 value is absent, before the
# imputation below hides that fact. 1 = value was missing, 0 = value present.
# (The original lambda produced the opposite: 0 for missing rows, which
# contradicted the column name.)
for frame in (train_data_removing_missing, test_data_removing_missing):
    for source in ('EXT_SOURCE_1', 'EXT_SOURCE_3'):
        frame['missing_' + source.lower()] = frame[source].isna().astype(int)

In [None]:
## Imputing all the missing values in continuous variables
## in both training and test dataset with median of the columns 
## in the training dataset so that there is no data leakage.
## Median is robust to outliers

In [None]:
cols=['EXT_SOURCE_1','EXT_SOURCE_3','AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY', 'DAYS_REGISTRATION','DAYS_BIRTH','DAYS_EMPLOYED','DAYS_LAST_PHONE_CHANGE']
# NOTE(review): EXT_SOURCE_2 is not in this list, so it is not imputed here — confirm that is intended.

# Medians are learned from the training rows only and then applied to both frames.
imp=SimpleImputer(missing_values=np.nan,strategy='median')
imp.fit(train_data_removing_missing[cols])

# Rebuild each frame with the imputed columns first, followed by the untouched
# columns. The original index and column names are passed explicitly so the
# concat aligns row-for-row even if the frame does not carry a default
# RangeIndex, and the raw `data` frame is no longer overwritten.
train_imputed=pd.DataFrame(
    imp.transform(train_data_removing_missing[cols]),
    columns=cols,
    index=train_data_removing_missing.index,
)
train_data_removing_missing=pd.concat(
    [train_imputed,train_data_removing_missing.drop(cols,axis=1)],axis=1)

test_imputed=pd.DataFrame(
    imp.transform(test_data_removing_missing[cols]),
    columns=cols,
    index=test_data_removing_missing.index,
)
test_data_removing_missing=pd.concat(
    [test_imputed,test_data_removing_missing.drop(cols,axis=1)],axis=1)

print(test_data_removing_missing.shape)
print(train_data_removing_missing.shape)

In [None]:
## Checking whether the continuous variables have missing 
## values now after doing missing value imputation

In [None]:
print((train_data_removing_missing[cols]).isna().sum())
print((test_data_removing_missing[cols]).isna().sum())

In [None]:
## Now creating a function to detect the proportion of outliers
## in the continuous variables. Using two criteria to find the proportion
## First criteria:- twenty fifth percentile -1.5*IQR,seventy fifth percentile +1.5*IQR
## Second Criteria:-twenty fifth percentile -3*IQR,seventy fifth percentile +3*IQR
## Outliers according to first criteria we call light outliers and outliers according
## to second criteria we call heavy outliers

In [None]:
def outlier_detection(cols, data=None):
    """Return the share of rows that are IQR outliers for each column.

    Parameters
    ----------
    cols : iterable of str
        Columns to examine.
    data : pandas.DataFrame, optional
        Frame to examine. Defaults to the notebook-level
        ``train_data_removing_missing`` so existing calls keep working.

    Returns
    -------
    (dict, dict)
        ``dict_extreme`` maps column -> fraction of rows outside
        [Q1 - 3*IQR, Q3 + 3*IQR]; ``dict_light`` uses 1.5*IQR instead.
        Missing values are never counted as outliers, but they do count
        in the denominator (total number of rows).
    """
    if data is None:
        data = train_data_removing_missing

    dict_extreme = {}
    dict_light = {}
    n_rows = len(data)

    for col in cols:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        # Vectorised comparisons replace the per-row Python loop; NaN compares
        # False on both sides, exactly as it did in the loop.
        extreme_mask = (values < q1 - 3*iqr) | (values > q3 + 3*iqr)
        light_mask = (values < q1 - 1.5*iqr) | (values > q3 + 1.5*iqr)
        dict_extreme[col] = int(extreme_mask.sum()) / n_rows
        dict_light[col] = int(light_mask.sum()) / n_rows
    return dict_extreme, dict_light
        

In [None]:
## Converting all the continuous day variables into years.
## We convert days into positive and divide by 365 to get years

In [None]:
# Negate each DAYS_* column and divide by 365 to express it in years,
# for both the training and the test frame.
for frame in (train_data_removing_missing, test_data_removing_missing):
    for suffix in ('EMPLOYED', 'REGISTRATION', 'BIRTH', 'LAST_PHONE_CHANGE'):
        frame['YEARS_' + suffix] = -1*(frame['DAYS_' + suffix])/365

In [None]:
## Applying the outlier detection function to the continuous variables,
## we see that days employed has a significant proportion of outliers,
## which we saw in the graph. ext_source_1 has a high proportion but its 
## range is (0,1), so we can ignore this variable

In [None]:
cols=['EXT_SOURCE_1','EXT_SOURCE_2', 'EXT_SOURCE_3','AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY', 'DAYS_REGISTRATION','DAYS_EMPLOYED','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE']

outlier_detection(cols)

In [None]:
## Plot to show proportion of heavy and light outliers

In [None]:
cols=['EXT_SOURCE_1','EXT_SOURCE_2', 'EXT_SOURCE_3','AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY', 'DAYS_REGISTRATION','DAYS_EMPLOYED','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE']

dict_extreme,dict_light=outlier_detection(cols)
print(dict_extreme)
print(dict_light)

# Both curves share one axes, so each gets a label and the figure a legend;
# without them the two lines cannot be told apart.
plt.plot(list(dict_extreme.keys()),list(dict_extreme.values()),label='extreme outliers (3 x IQR)')
plt.plot(list(dict_light.keys()),list(dict_light.values()),label='light outliers (1.5 x IQR)')
plt.xticks(rotation=90)
plt.ylabel('proportion of rows flagged as outliers')
plt.legend()
plt.show()

In [None]:
## Creating a dataframe to see the range of values
## from 90 to 99.9 percentile for all continuous variables
## This can be used in outlier imputation

In [None]:
def quantile_length(data,cols):
    """Summarise the upper tail of each requested column.

    For every column in ``cols`` the result holds a nine-element list:
    the 90, 92.5, 95, 97.5, 99 and 99.9 percentiles, followed by the
    maximum, the minimum and the mean, in that order.

    Returns a dict mapping column name -> that list.
    """
    upper_quantiles = (0.90,0.925,0.95,0.975,0.99,0.999)
    summary = {}
    for col in cols:
        series = data[col]
        stats = [series.quantile(q) for q in upper_quantiles]
        stats.extend([series.max(), series.min(), series.mean()])
        summary[col] = stats
    return summary

In [None]:
cols=['EXT_SOURCE_1','EXT_SOURCE_2', 'EXT_SOURCE_3','AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY', 'DAYS_REGISTRATION','DAYS_EMPLOYED','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE']

# Row labels, in the order quantile_length emits its nine statistics.
summary_rows=['90 percentile','92.5 percentile','95 percentile','97.5 percentile','99 percentile','99.9 percentile','max','min','mean']

quantile_list=quantile_length(train_data_removing_missing,cols)
quantile_list_dataframe=pd.DataFrame(quantile_list,index=summary_rows)
print(quantile_list_dataframe)

In [None]:
## Creating an outlier flag variable for years employed.
## We will replace the outliers in the variable with its median,
## so we create a flag recording which rows are outliers; maybe
## this feature can be useful in modelling

In [None]:
# YEARS_EMPLOYED is -DAYS_EMPLOYED/365, so the large positive DAYS_EMPLOYED
# outlier value appears here as a large NEGATIVE number (about -1000.66 years).
# The original test `x>=1000.66` had the wrong sign and therefore flagged no rows.
train_data_removing_missing['years_employed_outlier']=(train_data_removing_missing['YEARS_EMPLOYED']<=-1000.66).astype(int)
train_data_removing_missing                                                                                         

In [None]:
# YEARS_EMPLOYED is -DAYS_EMPLOYED/365, so the large positive DAYS_EMPLOYED
# outlier value appears here as a large NEGATIVE number (about -1000.66 years).
# The original test `x>=1000.66` had the wrong sign and therefore flagged no rows.
test_data_removing_missing['years_employed_outlier']=(test_data_removing_missing['YEARS_EMPLOYED']<=-1000.66).astype(int)
test_data_removing_missing                                                                                         

In [None]:
## We create an outlier replacement function.
## the values less than fifth percentile will
## be replaced by fifth percentile and values greater
## than ninety fifth percentile replaced by ninety fifth
## percentile. This range (0.05,0.95) is normally used

In [None]:
def outlier_replacement(cols, train=None, test=None):
    """Cap each column at the training data's 5th and 95th percentiles.

    Parameters
    ----------
    cols : iterable of str
        Columns to cap; each must exist in both frames.
    train, test : pandas.DataFrame, optional
        Frames to modify in place. They default to the notebook-level
        ``train_data_removing_missing`` / ``test_data_removing_missing`` so
        existing one-argument calls behave as before.

    Returns
    -------
    (train, test)
        The same two frame objects, after capping.

    The boundaries are always computed from ``train`` and then applied to
    both frames, so no information from ``test`` is used.
    """
    if train is None:
        train = train_data_removing_missing
    if test is None:
        test = test_data_removing_missing

    for col in cols:
        left_boundary = train[col].quantile(0.05)
        right_boundary = train[col].quantile(0.95)

        for frame in (train, test):
            # Raise values below the lower bound, then lower values above the
            # upper bound; NaN fails both comparisons and is left untouched.
            floored = np.where(frame[col] < left_boundary, left_boundary, frame[col])
            frame[col] = np.where(floored > right_boundary, right_boundary, floored)

    return (train, test)

In [None]:
## For days employed the values are the same
## from the ninety fifth percentile and above, so any value greater than or equal to the ninety
## fifth percentile will be replaced by the median.
## For years employed the values are the same
## from the fifth percentile and below, so any value less than or equal to the
## fifth percentile will be replaced by the median.

In [None]:
# Percentile boundaries and medians are taken from the training frame,
# before any value is changed.
days_employed_train=train_data_removing_missing['DAYS_EMPLOYED']
years_employed_train=train_data_removing_missing['YEARS_EMPLOYED']
left_boundary=days_employed_train.quantile(0.05)
right_boundary=days_employed_train.quantile(0.95)
median=days_employed_train.median()
left_boundary2=years_employed_train.quantile(0.05)
right_boundary2=years_employed_train.quantile(0.95)
median2=years_employed_train.median()

for frame in (train_data_removing_missing, test_data_removing_missing):
    # DAYS_EMPLOYED: cap the low end at the 5th percentile, then replace
    # everything at or above the 95th percentile with the median.
    days=np.where(frame['DAYS_EMPLOYED']<=left_boundary,left_boundary,frame['DAYS_EMPLOYED'])
    frame['DAYS_EMPLOYED']=np.where(days>=right_boundary,median,days)

    # YEARS_EMPLOYED: replace everything at or below the 5th percentile with
    # the median, then cap the high end at the 95th percentile.
    years=np.where(frame['YEARS_EMPLOYED']<=left_boundary2,median2,frame['YEARS_EMPLOYED'])
    frame['YEARS_EMPLOYED']=np.where(years>=right_boundary2,right_boundary2,years)

test_data_removing_missing['YEARS_EMPLOYED'].describe()


In [None]:
## Checking outlier proportion after applying the function
## and we see considerable drop in outliers in all continuous 
## variables

In [None]:
train_data_removing_missing,test_data_removing_missing=outlier_replacement(cols)
outlier_detection(cols)

In [None]:
## Outlier detection in years column and we see reduction 
##in outliers in the years column

In [None]:
col_new=['YEARS_REGISTRATION','YEARS_EMPLOYED','YEARS_BIRTH','YEARS_LAST_PHONE_CHANGE']
train_data_removing_missing,test_data_removing_missing=outlier_replacement(col_new)
outlier_detection(col_new)

In [None]:
## Now we see the distributions of the continuous variables
## after outlier replacement and we see that some variables have moved
## closer to a normal distribution. Some still have skewed distributions, but in days
## employed and years employed the outliers are gone and the distribution
## is better now. We have also applied the min max scaler so as to bring
## all the variables onto a consistent scale so that the model does not give 
## weights based on differences in scale

In [None]:
cols=['AMT_CREDIT','AMT_INCOME_TOTAL','AMT_GOODS_PRICE','AMT_ANNUITY', 'DAYS_REGISTRATION','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE'
     ,'DAYS_EMPLOYED','YEARS_REGISTRATION','YEARS_BIRTH','YEARS_LAST_PHONE_CHANGE','YEARS_EMPLOYED']

transformer=MinMaxScaler()
# fit_transform/transform return new arrays and leave their input untouched.
# The original cell discarded those arrays, so no column was ever scaled.
# Write them back; the scaler is fitted on the training frame only.
train_data_removing_missing[cols]=transformer.fit_transform(train_data_removing_missing[cols])
test_data_removing_missing[cols]=transformer.transform(test_data_removing_missing[cols])
continuous_variable_distributions(train_data_removing_missing[cols],'displot')
a=train_data_removing_missing[cols]


In [None]:
## Creating the density plots for all the continous
## variables grouped by the target variable. We want
## to see if the distribution is different for both the
## target labels which can signify that this continuous variable
## may be significant in detecting the target variable

In [None]:
sns.kdeplot(data=a, x='AMT_INCOME_TOTAL',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=a, x='AMT_GOODS_PRICE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=a, x='AMT_ANNUITY',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='DAYS_BIRTH',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='DAYS_REGISTRATION',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='DAYS_LAST_PHONE_CHANGE',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='DAYS_EMPLOYED',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='YEARS_REGISTRATION',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='YEARS_BIRTH',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='YEARS_LAST_PHONE_CHANGE',hue=target_y,fill=True)


In [None]:
sns.kdeplot(data=a, x='YEARS_EMPLOYED',hue=target_y,fill=True)

In [None]:
##The density plots reveal not a lot so we will use 
## other methods to select the features

In [None]:
## Now for the remaining categorical variables we will 
## do target encoding. which means we replace all the 
##classes with their respective means. This converts this
##into a continuous variable which may be more helpful
##than a label or one hot encoding because this now contains
##significant information from the target labels

In [None]:
# Fit the target encoder against target_y and replace the training frame
# with its encoded version.
te=TargetEncoder()
train_data_removing_missing=te.fit_transform(train_data_removing_missing,target_y)
# NOTE(review): only the training frame is encoded; in the cells shown here
# test_data_removing_missing is never passed through te.transform — confirm.
train_data_removing_missing.shape

In [None]:
## Now seeing the distribution of target encoded variables

In [None]:
target_encoded_columns=[ 'EMERGENCYSTATE_MODE',
       'OCCUPATION_TYPE', 'NAME_TYPE_SUITE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'ORGANIZATION_TYPE', 'NAME_CONTRACT_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'NAME_HOUSING_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE']
continuous_variable_distributions(train_data_removing_missing[target_encoded_columns],'displot')


In [None]:
## Now seeing the density plots of the target encoded variables
##with the target variable so as to see which target encoded variable
## is important

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_TYPE_SUITE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='CODE_GENDER',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='FLAG_OWN_CAR',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='FLAG_OWN_REALTY',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='ORGANIZATION_TYPE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_CONTRACT_TYPE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='WEEKDAY_APPR_PROCESS_START',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_HOUSING_TYPE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_FAMILY_STATUS',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_EDUCATION_TYPE',hue=target_y,fill=True)

In [None]:
sns.kdeplot(data=train_data_removing_missing, x='NAME_INCOME_TYPE',hue=target_y,fill=True)

In [None]:
##The density plots reveal not a lot so we will use 
## other methods to select the features     
        

In [None]:
## We now take all the continuous variables
## along with the newly target encoded variables and 
## pass them through the f_classif function, which is a kind of ANOVA
## test, to see which continuous variables are important for target
## variable prediction

In [None]:
# NOTE: this wider candidate list is a bare expression that is never assigned,
# so it has no effect. The ANOVA below runs on whatever `cols` currently holds
# (the list set in the min-max scaling cell when the notebook is run top to bottom).
['EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'AMT_CREDIT',
 'AMT_INCOME_TOTAL',
 'AMT_GOODS_PRICE',
 'AMT_ANNUITY',
 'DAYS_REGISTRATION',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_LAST_PHONE_CHANGE',
 'FLOORSMAX_MODE',
 'FLOORSMAX_MEDI',
 'FLOORSMAX_AVG',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BEGINEXPLUATATION_AVG',
 'TOTALAREA_MODE',
 'EMERGENCYSTATE_MODE',
 'OCCUPATION_TYPE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'NAME_TYPE_SUITE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'EXT_SOURCE_2',
 'CNT_FAM_MEMBERS',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'REGION_POPULATION_RELATIVE',
 'NAME_HOUSING_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_EDUCATION_TYPE',
 'NAME_INCOME_TYPE',
 'YEARS_EMPLOYED',
 'YEARS_REGISTRATION',
 'YEARS_BIRTH',
 'YEARS_LAST_PHONE_CHANGE']

# Keep the five columns with the highest ANOVA F-score against the target.
fs_cat_an = SelectKBest(score_func=f_classif, k=5)
m=train_data_removing_missing[cols]
fs_cat_an.fit(m, target_y)
X_train_fs_cat = fs_cat_an.transform(m)

# The boolean support mask gets its own name: the original stored it in `cols`
# (destroying the column list) and looped with a variable called `bool`,
# shadowing the builtin.
support_mask=fs_cat_an.get_support()
selected_feature_anova=m.columns[support_mask].to_list()

selected_feature_anova


In [None]:
##From the anova test our important variables are:-['DAYS_BIRTH','DAYS_EMPLOYED','YEARS_BIRTH','YEARS_LAST_PHONE_CHANGE','YEARS_EMPLOYED']
## We take years_employed and drop days_employed because days_employed is used in creating the years_employed variable, and similarly
## we keep years_birth and drop days_birth


In [None]:
## We take the features selected from anova as well as ext_source_1 and ext_source_3 because these two are 
##important variables. We selected some domain based features like amt_annuity, amt_credit, amt_income_total

In [None]:

# Feature subset used by every model below.
model_features=['EXT_SOURCE_1','EXT_SOURCE_3','YEARS_BIRTH','AMT_ANNUITY','AMT_CREDIT','AMT_INCOME_TOTAL','YEARS_EMPLOYED','YEARS_LAST_PHONE_CHANGE']
fitted_data1=train_data_removing_missing[model_features]


In [None]:
##Now trying different models starting with gradient boost 
##First we use grid search cross validation to find optimal number of 
##estimators. Our performance metric or scoring metric is roc_auc
## Let us tune some models and find the optimal hyperparameters

In [None]:
# Stage 1: search n_estimators over 20..90 (step 10) with the other
# gradient-boosting settings held fixed; 5-fold CV scored by ROC AUC.
param_test1={'n_estimators': range(20,100,10)}
gb_stage1=GradientBoostingClassifier(
    learning_rate=0.1,
    min_samples_split=500,
    min_samples_leaf=50,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gsearch1=GridSearchCV(estimator=gb_stage1,param_grid=param_test1,scoring='roc_auc',n_jobs=4,cv=5)
gsearch1.fit(fitted_data1,target_y)

In [None]:
## Optimal number of estimators hyperparameter is 90

In [None]:
gsearch1.best_estimator_,gsearch1.best_score_,gsearch1.best_params_

In [None]:
## Now we use grid search to find optimal min_samples_split hyperparameter

In [None]:
# Stage 2: search min_samples_split over 1000..2000 (step 200).
# NOTE(review): no max_depth is passed here, so the library default applies,
# whereas the other searches in this notebook pass max_depth=8 — confirm.
param_test2={'min_samples_split':range(1000,2100,200)}
gb_stage2=GradientBoostingClassifier(
    n_estimators=90,
    learning_rate=0.1,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gsearch2=GridSearchCV(estimator=gb_stage2,param_grid=param_test2,scoring='roc_auc',n_jobs=4,cv=5)


In [None]:
## we get min_samples_split as 1800

In [None]:
gsearch2.fit(fitted_data1,target_y)
gsearch2.best_estimator_,gsearch2.best_score_,gsearch2.best_params_

In [None]:
## Now we use grid search cv to find optimal min_samples_leaf hyperparameter
## which comes out to be 30

In [None]:
# Stage 3: search min_samples_leaf over 30..90 (step 10).
# NOTE(review): n_estimators is not passed (library default applies) and
# min_samples_split is fixed at 1000 rather than a value taken from the
# previous search — confirm this is intended.
param_test3={ 'min_samples_leaf':range(30,100,10)}
gb_stage3=GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=8,
    min_samples_split=1000,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
gsearch3=GridSearchCV(estimator=gb_stage3,param_grid=param_test3,scoring='roc_auc',n_jobs=-1,cv=5)
gsearch3.fit(fitted_data1,target_y)
gsearch3.best_estimator_,gsearch3.best_score_,gsearch3.best_params_

In [None]:
## Now we find the optimal subsample hyperparameter
## which is 0.9

In [None]:
# Stage 4: search the row subsample fraction.
# NOTE(review): min_samples_leaf=80 and min_samples_split=1000 are fixed here,
# which differs from the values the comment cells report for the earlier
# searches — confirm.
param_test4={ 'subsample':[0.5,0.6,0.7,0.75,0.8,0.85,0.9]}
gb_stage4=GradientBoostingClassifier(
    n_estimators=90,
    learning_rate=0.1,
    max_depth=8,
    min_samples_split=1000,
    max_features=5,
    min_samples_leaf=80,
    random_state=10,
)
gsearch4=GridSearchCV(estimator=gb_stage4,param_grid=param_test4,scoring='roc_auc',n_jobs=-1,cv=5)
gsearch4.fit(fitted_data1,target_y)

In [None]:
gsearch4.best_estimator_,gsearch4.best_score_,gsearch4.best_params_
#scores=cross_val_score(gsearch5.best_estimator_,fitted_data1,target_y,cv=5,scoring='roc_auc')
#np.mean(scores)

In [None]:
##Now trying LGBM,Xgboost, Random Forest and Logistic regression 
##without any model tuning. We see that logistic model has the lowest
## auc and lgbm and xgboost have considerably high auc. But our tuned
## gradient boost model above has the highest auc compared to these 
## four models below. So we proceed with this model

In [None]:
# Untuned LightGBM baseline, scored by 10-fold stratified CV (ROC AUC).
model=LGBMClassifier()
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
# cross_val_score clones the estimator and fits it on each fold itself, so the
# separate full-data model.fit() that used to precede it was wasted training time.
scores=cross_val_score(model,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
# Untuned random-forest baseline, scored by 5-fold CV (ROC AUC).
# random_state makes the forest, and therefore the reported score, reproducible.
model=RandomForestClassifier(random_state=10)

# cross_val_score clones the estimator and fits it on each fold itself, so the
# separate full-data model.fit() that used to precede it was wasted training time.
scores=cross_val_score(model,fitted_data1,target_y,cv=5,scoring='roc_auc')
np.mean(scores)

In [None]:
# Untuned XGBoost baseline, scored by 10-fold stratified CV (ROC AUC).
model=XGBClassifier()
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
# cross_val_score clones the estimator and fits it on each fold itself, so the
# separate full-data model.fit() that used to precede it was wasted training time.
scores=cross_val_score(model,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
# Untuned logistic-regression baseline, scored by 10-fold stratified CV (ROC AUC).
model=LogisticRegression()
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
# cross_val_score clones the estimator and fits it on each fold itself, so the
# separate full-data model.fit() that used to precede it was wasted training time.
scores=cross_val_score(model,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
##I did not optimise max_depth any where and it may help
##The max features above I took as sqrt which is default but
## I took five here as the max_features to chose to see the difference 
##in model performance. There is not much difference between our previously
##trained model with max features parameter as sqrt and then the model 
##with five.We will go with five as of now

In [None]:
# Cross-validate the tuned gradient-boosting configuration (10 stratified folds, ROC AUC).
# NOTE(review): min_samples_leaf=80 / min_samples_split=1000 used here differ from
# the 30 / 1800 quoted in the notebook's closing summary — confirm which is final.
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
gb_params=dict(
    max_depth=8,
    max_features=5,
    min_samples_leaf=80,
    min_samples_split=1000,
    n_estimators=90,
    random_state=10,
    subsample=0.9,
)
model=GradientBoostingClassifier(**gb_params)
scores=cross_val_score(model,fitted_data1,target_y,cv=kfold,scoring='roc_auc',n_jobs=-1)
np.mean(scores)

In [None]:
##We see from this we can see there is heavy imbalance between the classes

In [None]:
np.sum(target_y==0)/len(target_y)

In [None]:
## To mitigate the effects of class imbalance we use random oversampling
## and random undersampling, which combines duplicating minority-class rows
## with removing majority-class rows. The resampling must be applied only to the training
## part of each cross-validation fold (never to the validation part), which is what the
## imblearn Pipeline below does. Also, for class imbalance we use stratified k fold, which ensures the proportion 
## of classes is the same in the folds as in the entire training set. We see that 
## there is no increase in auc

In [None]:
# Over-sample the minority class, then under-sample the majority class, inside
# an imblearn Pipeline so the resampling happens within each CV training fold.
# random_state pins the rows each sampler draws; without it the reported AUC
# changes from run to run.
over=RandomOverSampler(sampling_strategy=0.1,random_state=7)
under=RandomUnderSampler(sampling_strategy=0.5,random_state=7)
model=GradientBoostingClassifier(max_depth=8, max_features=5, min_samples_leaf=80,
                           min_samples_split=1000, n_estimators=90,
                           random_state=10, subsample=0.9)
steps=[('o',over),('u',under),('m',model)]
pipeline=Pipeline(steps=steps)
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
scores=cross_val_score(pipeline,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)


In [None]:
## With random oversampling and random undersampling applied 
##along with repeated stratified kfold we see no increase in auc

In [None]:
# Same resampling pipeline, scored with 3 repeats of 10-fold stratified CV.
# random_state pins the rows each sampler draws; without it the reported AUC
# changes from run to run.
over=RandomOverSampler(sampling_strategy=0.1,random_state=7)
under=RandomUnderSampler(sampling_strategy=0.5,random_state=7)
model=GradientBoostingClassifier(max_depth=8, max_features=5, min_samples_leaf=80,
                           min_samples_split=1000, n_estimators=90,
                           random_state=10, subsample=0.9)
steps=[('o',over),('u',under),('m',model)]
pipeline=Pipeline(steps=steps)
kfold=RepeatedStratifiedKFold(n_splits=10,random_state=7,n_repeats=3)
scores=cross_val_score(pipeline,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
## Trying random over sampling and random under sampling
## with repeated stratified kfold on lgbm and xgboost models
## We again see these two models are lower in auc performance
## than our gradient boost model we picked above.

In [None]:
# Resampling pipeline around an untuned LightGBM model, scored with 3 repeats
# of 10-fold stratified CV. random_state pins the rows each sampler draws;
# without it the reported AUC changes from run to run.
over=RandomOverSampler(sampling_strategy=0.1,random_state=7)
under=RandomUnderSampler(sampling_strategy=0.5,random_state=7)
model=LGBMClassifier()
steps=[('o',over),('u',under),('m',model)]
pipeline=Pipeline(steps=steps)
kfold=RepeatedStratifiedKFold(n_splits=10,random_state=7,n_repeats=3)
scores=cross_val_score(pipeline,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
# Resampling pipeline around an untuned XGBoost model, scored with 3 repeats
# of 10-fold stratified CV. random_state pins the rows each sampler draws;
# without it the reported AUC changes from run to run.
over=RandomOverSampler(sampling_strategy=0.1,random_state=7)
under=RandomUnderSampler(sampling_strategy=0.5,random_state=7)
model=XGBClassifier()
steps=[('o',over),('u',under),('m',model)]
pipeline=Pipeline(steps=steps)
kfold=RepeatedStratifiedKFold(n_splits=10,random_state=7,n_repeats=3)
scores=cross_val_score(pipeline,fitted_data1,target_y,cv=kfold,scoring='roc_auc')
np.mean(scores)

In [None]:
## We try to optimise max_depth in xgboost classifier
## using grid search cv and this is the final model we try 
## Again this model is lower in auc performance with our gradient
## boost model

In [None]:
# Grid-search max_depth for an otherwise default XGBoost classifier, scored by ROC AUC.
# `model` ends up bound to the fitted GridSearchCV object, not to the bare classifier.
test_params={'max_depth':[4,6,8,10,12]}
model=GridSearchCV(estimator=XGBClassifier(),param_grid=test_params,scoring='roc_auc')
model.fit(fitted_data1,target_y)
model.best_params_,model.best_score_

In [None]:
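## So the final model selected has auc score of 0.71559
## Our gradient boost model has the following parameters:-
## max_depth=8, max_features=5, min_samples_leaf=30,min_samples_split=1800, n_estimators=90,subsample=0.9
## (Note: the cross-validated gradient boost cell above was run with min_samples_leaf=80 and
## min_samples_split=1000, so the score should be re-checked against the parameters listed here.)
## We do stratified k fold cross validation without any random oversampling or undersampling 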

In [None]:
##I took major inspiration from this notebook:
## https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction

In [None]:
## Please comment on what can be done to improve the model performance
## This is my first notebook so any advice on how to improve is welcome