In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Helper Functions
    These are all the functions used for model preparation and model deployment.

In [2]:
# Helper Methods
def checking_missing_values(data):
    """Summarise the missing values of every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with a ``Count`` column (number of
        nulls) and a ``Percentage`` column (nulls as a percentage of
        ``len(data)``). Both series are sorted in descending order before
        being joined side by side.
    """
    null_counts = data.isnull().sum()
    summary = {
        'Count': null_counts.sort_values(ascending=False),
        'Percentage': (null_counts / len(data) * 100).sort_values(ascending=False),
    }
    # A mapping passed to concat uses its keys as the resulting column names.
    return pd.concat(summary, axis=1)

# Dropping the Missing values if greater than 40%
def dropping_missing_values(data, percentage=40.0):
    """Drop every column whose share of missing values exceeds ``percentage``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    percentage : float, default 40.0
        Maximum tolerated percentage of missing values per column. A column
        with exactly this percentage missing is kept; only columns strictly
        above it are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame holding the retained columns in their original order.
        A frame with zero rows keeps all of its columns.
    """
    missing_counts = data.isnull().sum()
    # Compare counts (missing * 100 <= percentage * rows) instead of ratios so
    # the boundary case is not decided by floating-point division error.
    keep = (missing_counts * 100 <= percentage * len(data)).to_numpy()
    # Positional boolean mask: works even when column labels are not unique.
    return data.loc[:, keep]

# Checking for Duplicate data
def checking_duplicate_data(data):
    """Return the rows that are duplicated once ``SK_ID_CURR`` is ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to check.

    Returns
    -------
    pandas.DataFrame
        Every row of ``data`` that shares all of its non-``SK_ID_CURR``
        values with at least one other row (all members of a duplicate
        group are returned, not just the repeats).
    """
    compared = [name for name in data.columns if name != 'SK_ID_CURR']
    is_duplicate = data.duplicated(subset=compared, keep=False)
    return data[is_duplicate]

# Imputing categorical missing values with mode
def imputing_categorical_missing_values(data):
    """Fill missing values of object-dtype columns with the column mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is left untouched; the work is done on a copy so
        that passing a slice of another frame neither triggers chained
        assignment nor alters the caller's data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every object-dtype column has its nulls
        replaced by that column's most frequent value (the first one when
        several values tie). Columns of other dtypes, and object columns
        that contain no observed value at all, are returned unchanged.
    """
    imputed = data.copy()
    for col in imputed.columns:
        if imputed[col].dtype == object:
            modes = imputed[col].mode()
            # An all-null column has no mode; leave it for the caller to handle
            # instead of failing on .iloc[0].
            if modes.empty:
                continue
            imputed[col] = imputed[col].fillna(modes.iloc[0])
    return imputed

# One Hot Encoding the categorical variables
def encoding_categorical(data):
    """One-hot encode the categorical columns of ``data``.

    The first level of every encoded column is dropped (``drop_first=True``)
    so the resulting indicators are not perfectly collinear (the dummy
    variable trap).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies(data, drop_first=True)``.
    """
    return pd.get_dummies(data, drop_first=True)

# Mean Imputation To impute the missing values in numerical columns
def mean_imputation(data):
    """Replace NaN in every column of ``data`` with that column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to impute.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and the row index of
        ``data``, so the result stays aligned with the source frame when
        columns are later assigned by index.

    Notes
    -----
    NOTE(review): by default ``SimpleImputer`` discards columns that contain
    no observed value, which would make the column count disagree with
    ``data.columns`` here — confirm that no all-NaN column reaches this call.
    """
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_values = mean_imputer.fit_transform(data)
    # fit_transform returns a bare array; restore both labels and index.
    return pd.DataFrame(imputed_values, columns=data.columns, index=data.index)

# MICE Imputation :-> To impute the missing values in numerical columns
def mice_imputation(data):
    """Impute NaN in ``data`` with MICE (iterative imputation).

    Each feature with missing values is modelled from the other features
    using ``IterativeImputer`` with a ``BayesianRidge`` estimator.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to impute.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and the row index of
        ``data``, so the result stays aligned with the source frame when
        columns are later assigned by index.
    """
    mice_imputer = IterativeImputer(estimator=linear_model.BayesianRidge())
    imputed_values = mice_imputer.fit_transform(data)
    # fit_transform returns a bare array; restore both labels and index.
    return pd.DataFrame(imputed_values, columns=data.columns, index=data.index)

def IV_calculation(X, y, head = 100):
    """Fit a ``WOE`` transformer and return the top of its information-value table.

    Parameters
    ----------
    X : feature frame passed to ``WOE.fit``.
    y : target passed to ``WOE.fit``.
    head : int, default 100
        Number of leading rows of the fitted transformer's ``iv_df`` to return.

    Returns
    -------
    The first ``head`` rows of ``iv_df``.
    """
    woe_transformer = WOE()
    woe_transformer.fit(X, y)
    return woe_transformer.iv_df.head(head)

def tuning_splits(X, y):
    """Compare fold counts for stratified K-fold cross-validation.

    For each fold count from 5 to 10, a default ``LogisticRegression`` is
    cross-validated on ``(X, y)`` with shuffled, seeded stratified folds and
    scored by ROC AUC.

    Returns
    -------
    list
        Mean ROC AUC per fold count, in the order 5, 6, 7, 8, 9, 10.
    """
    def _mean_auc(fold_count):
        folds = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=42)
        fold_aucs = cross_val_score(lm.LogisticRegression(), X, y, cv=folds, scoring="roc_auc")
        return np.mean(fold_aucs)

    return [_mean_auc(fold_count) for fold_count in range(5, 11)]

def tuning_C(X, y):
    """Compare inverse regularisation strengths for logistic regression.

    Each candidate ``C`` is evaluated with the same seeded 8-fold stratified
    split and scored by ROC AUC.

    Returns
    -------
    list
        Mean ROC AUC per candidate, in the order 0.001, 0.01, 0.1, 1, 10.
    """
    folds = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    candidates = (0.001, 0.01, 0.1, 1, 10)
    return [
        np.mean(
            cross_val_score(
                lm.LogisticRegression(C=strength, random_state=42),
                X, y, cv=folds, scoring="roc_auc",
            )
        )
        for strength in candidates
    ]

def tuning_solver(X, y):
    """Compare the ``lbfgs`` and ``liblinear`` solvers for logistic regression.

    Each solver is evaluated with ``max_iter=3000`` on the same seeded 8-fold
    stratified split and scored by ROC AUC.

    Returns
    -------
    list
        Mean ROC AUC per solver, in the order ``lbfgs``, ``liblinear``.
    """
    folds = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    mean_aucs = []
    for solver_name in ('lbfgs', 'liblinear'):
        estimator = lm.LogisticRegression(max_iter=3000, solver=solver_name, random_state=42)
        fold_aucs = cross_val_score(estimator, X, y, cv=folds, scoring="roc_auc")
        mean_aucs.append(np.mean(fold_aucs))
    return mean_aucs
    

def model(X, y, tuning = False):
    """Cross-validate a logistic regression on ``(X, y)`` with ROC AUC.

    Parameters
    ----------
    X : feature matrix.
    y : binary target.
    tuning : bool, default False
        When false, a default ``LogisticRegression`` is used. When true, the
        estimator uses ``C=0.01``, balanced class weights, ``max_iter=200``
        and the ``liblinear`` solver (the solver the tuned models elsewhere
        in this notebook use).

    Returns
    -------
    tuple
        ``(cv_scores, logreg)``: the dictionary returned by
        ``cross_validate`` for a seeded 8-fold stratified split, and the
        estimator object that was passed to it. ``cross_validate`` fits
        clones, so the returned estimator itself is not fitted.
    """
    if tuning:
        logreg = LogisticRegression(C=0.01, class_weight='balanced', max_iter=200, solver='liblinear')
    else:
        logreg = LogisticRegression()

    kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    # Score against the `y` argument (the original referenced an undefined `Y`).
    cv_scores = cross_validate(logreg, X, y, n_jobs=-1, cv=kf, scoring='roc_auc')

    return cv_scores, logreg

# Installing the required Libraries
    Installing the libraries used to calculate Information Value and Cramér's V.

In [1]:
!pip install xverse
!pip install association-metrics

Collecting xverse
  Downloading xverse-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: xverse
Successfully installed xverse-1.0.5
Collecting association-metrics
  Downloading association-metrics-0.0.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: association-metrics
  Building wheel for association-metrics (setup.py) ... [?25ldone
[?25h  Created wheel for association-metrics: filename=association_metrics-0.0.1-py3-none-any.whl size=3928 sha256=75ade96569ff1af6ceb436bf4c56178f7b691955a9d067ba16d4f0724f4c89f0
  Stored in directory: /Users/sidhant/Library/Caches/pip/wheels/49/ef/10/d93c981055c4fc0401028bc9dfda085ddd5ca204bcda0f2110
Successfully built association-metrics
Installing collected packages: association-metrics
Successfully installed association-metrics-0.0.1


# Importing the Libraries

In [6]:
# Importing the Libraries
import numpy as np
import pandas as pd

# Removing the Warnings
import warnings
warnings.filterwarnings("ignore")

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model
from sklearn.impute import SimpleImputer

# Importing Logistic Regression
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as lm


# Performance Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Cross Validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# For calculation of Weight of Evidence and Information Value
from xverse.transformer import WOE

# Cramers V
import association_metrics as am

AttributeError: module 'sklearn.metrics._dist_metrics' has no attribute 'DistanceMetric32'

In [7]:
# NOTE(review): leftover debugging cell. None of the imports in this notebook
# bind the bare name `sklearn`, so this expression presumably raises NameError
# on a fresh kernel — confirm and remove once the import error is resolved.
sklearn.metrics._dist_metrics

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement sklearn.metrics._dist_metrics (from versions: none)
ERROR: No matching distribution found for sklearn.metrics._dist_metrics


# Importing the Dataset

In [None]:
# Importing the Dataset
df = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
print('The shape of data:',df.shape)
df.head()

# Data-Preprocessing
    In this section we check for missing values, drop the columns with too many missing values, check for duplicate rows and treat the anomalies.
    We also impute the remaining missing values using both mean imputation and MICE imputation.

In [None]:
pip install sklearn

In [None]:
from sklearn.impute import IterativeImputer

In [None]:
pip install --upgrade scikit-learn

In [5]:
pip install --upgrade imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Checking for Missing Values
checking_missing_values(df)

In [None]:
# Removing those columns which are having more than 40% of missing values
df = dropping_missing_values(df)

# Shape of application after dropping the columns having more than 40% missing values
df.shape

In [None]:
# Count rows that are identical to another row once SK_ID_CURR is ignored
temp = checking_duplicate_data(df)
print(f"The no. of duplicates in the data: {temp.shape[0]}\n")

print(f"Cleaning Started....\n")
# Some columns encode "missing" with a placeholder value instead of NaN; normalise them here.
# NOTE: these assignments overwrite columns of `df` in place.
## Column -> CODE_GENDER: the placeholder "XNA" becomes NaN (imputed with the mode below)
df['CODE_GENDER'] = df['CODE_GENDER'].replace("XNA", np.nan)
## Column -> ORGANIZATION_TYPE: the placeholder "XNA" is folded into the "Other" category
df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].replace("XNA", "Other")
## Column -> DAYS_EMPLOYED: 365243 becomes NaN (presumably a sentinel for "no value" — confirm against the data dictionary)
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
print(f"Cleaning done!!!")

# Feature frame: everything except the row identifier and the target
application_without_ID_Target = df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

print("Imputing and Encoding Categorical Variables with Mode....\n")

# Fill missing values of the object-dtype (categorical) columns with each column's mode
application_without_ID_Target = imputing_categorical_missing_values(application_without_ID_Target)

# One-hot encode the categorical columns (first level of each is dropped)
application_without_ID_Target = encoding_categorical(application_without_ID_Target)

# CNT_FAM_MEMBERS is numeric but treated as categorical here, so it is filled with its mode
# rather than being left to the mean / MICE imputation of the numerical columns
application_without_ID_Target['CNT_FAM_MEMBERS'] = application_without_ID_Target['CNT_FAM_MEMBERS'].fillna(application_without_ID_Target['CNT_FAM_MEMBERS'].mode().iloc[0])

print("Imputing and Encoding Done!!!\n")

In [None]:
# Mean and MICE imputation of the numerical columns.
# Both imputations are expensive, so each result is cached as a CSV in the
# working directory: the file is read when it exists and computed (then saved)
# when it does not. This keeps the notebook runnable from a fresh kernel
# without relying on files written by an earlier session.

def _load_or_impute(cache_path, impute):
    """Return the imputed application frame, using ``cache_path`` as a cache.

    ``impute`` is called on ``application_without_ID_Target`` only when the
    cache file is missing; ``SK_ID_CURR`` and ``TARGET`` from ``df`` are then
    attached and the result is written to ``cache_path``.
    """
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)
    imputed = impute(application_without_ID_Target)
    # Attach by position: the imputed frame has one row per row of `df`, in the same order.
    imputed['SK_ID_CURR'] = df['SK_ID_CURR'].to_numpy()
    imputed['TARGET'] = df['TARGET'].to_numpy()
    imputed.to_csv(cache_path, index=False)
    return imputed


# Mean imputation
application_mean = _load_or_impute("application_mean.csv", mean_imputation)
X_mean = application_mean.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_mean = application_mean['TARGET']

# MICE imputation
application_mice = _load_or_impute("application_mice.csv", mice_imputation)
X_mice = application_mice.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_mice = application_mice['TARGET']

    Checking that imputation did not change the distribution of the data.

In [None]:
# Mean Imputation result

# MICE Imputation result

# Baseline Model
    In this section we fit two models for each imputation strategy: a basic one and a tuned one. We use Stratified K-Fold Cross Validation, and as the performance metric we use the AUC (Area Under the Curve) of the ROC (Receiver Operating Characteristic), because the dataset is highly imbalanced.
    

In [None]:
# Creating a dataframe to keep the track of all the results 
df_models = pd.DataFrame(columns = ['Model Name', 'Algorithm', 'Features', 'Target', 'Train AUC', 'Valid AUC', 'Test AUC'])

#### Mean Imputed with no Tuning

In [None]:
# Mean Imputed Model with no tuning
print(f"STARTING MEAN IMPUTATION NO TUNING MODEL FITTING>>>\n")
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X_mean, y_mean, random_state = 42, test_size = 0.2)

print("Started KFold Cross Validation.....")
logreg = LogisticRegression()
kf = StratifiedKFold(n_splits=8,shuffle=True, random_state=42)
cv_scores = cross_validate(logreg, X_train, y_train, n_jobs=-1, cv=kf, scoring ='roc_auc',return_train_score = True)
print(f"KFold Cross Validation Ended!!!!\n")
print(f"Total time taken to fit the model is {np.sum(cv_scores['fit_time'])/60} min\n")

print("Starting predicting on the Test data....")
# cross_validate fits clones, so the estimator still has to be fitted here
logreg.fit(X_train, y_train)
# ROC AUC ranks observations by score: use the positive-class probability.
# Hard 0/1 labels from predict() collapse the curve to a single threshold
# and understate the AUC relative to the cross-validated scores above.
y_score = logreg.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_score)
print("Prediction done on test data!!!!\n")

print(f"Appending the resulted Model in the Models Dataframe.....")
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
model_row = pd.DataFrame([{'Model Name':'Mean Imputed No Tuning',
                           'Algorithm':'Logistic Regression',
                           'Features':list(X_mean.columns.values),
                           'Target':'0/1',
                           'Train AUC':np.mean(cv_scores['train_score']),
                           'Valid AUC':np.mean(cv_scores['test_score']),
                           'Test AUC':test_auc}])
df_models = pd.concat([df_models, model_row], ignore_index=True)
print(f"Model is added in the Models Dataframe!!!\n")

print(f"ENDED THE MEAN IMPUTATION NO TUNING MODEL FITTING!!!!!\n")

#### Mean Imputed with Tuning

In [None]:
# Finding the tuned parameters
# print("Finding the regularization parameter.....")
# c_score = tuning_C(X, y)
# print("Finding the Solver.....")
# solver = tuning_solver(X_mice, y_mice)
# print("Solver found!!!!")
# print("Finding the correct k-folds.....")
# n_splits = tuning_splits(X, y)

In [None]:
# Mean Imputed Model with tuning
print(f"STARTING MEAN IMPUTATION TUNING MODEL FITTING>>>\n")
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X_mean, y_mean, random_state = 42, test_size = 0.2)

print("Started KFold Cross Validation.....")
# Tuned estimator: L1 penalty, C=0.1, balanced class weights, liblinear solver
logreg = LogisticRegression(C = 0.1, penalty = 'l1',random_state = 42, solver ='liblinear', max_iter= 3000, class_weight='balanced')
kf = StratifiedKFold(n_splits=8,shuffle=True, random_state=42)
cv_scores = cross_validate(logreg, X_train, y_train, n_jobs=-1, cv=kf, scoring ='roc_auc',return_train_score = True)
print(f"KFold Cross Validation Ended!!!!\n")
print(f"Total time taken to fit the model is {np.sum(cv_scores['fit_time'])/60} min\n")

print("Starting predicting on the Test data....")
# cross_validate fits clones, so the estimator still has to be fitted here
logreg.fit(X_train, y_train)
# ROC AUC ranks observations by score: use the positive-class probability.
# Hard 0/1 labels from predict() collapse the curve to a single threshold
# and understate the AUC relative to the cross-validated scores above.
y_score = logreg.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_score)
print("Prediction done on test data!!!!\n")

print(f"Appending the resulted Model in the Models Dataframe.....")
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
model_row = pd.DataFrame([{'Model Name':'Mean Imputed With Tuning',
                           'Algorithm':'Logistic Regression',
                           'Features':list(X_mean.columns.values),
                           'Target':'0/1',
                           'Train AUC':np.mean(cv_scores['train_score']),
                           'Valid AUC':np.mean(cv_scores['test_score']),
                           'Test AUC':test_auc}])
df_models = pd.concat([df_models, model_row], ignore_index=True)
print(f"Model is added in the Models Dataframe!!!\n")

print(f"ENDED THE MEAN IMPUTATION WITH TUNING MODEL FITTING!!!!!\n")

#### MICE Imputed with no tuning

In [None]:
# Mice Imputed Model with no tuning
print(f"STARTING MICE IMPUTATION WITH NO TUNING MODEL FITTING>>>\n")
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X_mice, y_mice, random_state = 42, test_size = 0.2)

print("Started KFold Cross Validation.....")
logreg = LogisticRegression()
kf = StratifiedKFold(n_splits=8,shuffle=True, random_state=42)
cv_scores = cross_validate(logreg, X_train, y_train, n_jobs=-1, cv=kf, scoring ='roc_auc',return_train_score = True)
print(f"KFold Cross Validation Ended!!!!\n")
print(f"Total time taken to fit the model is {np.sum(cv_scores['fit_time'])/60} min\n")

print("Starting predicting on the Test data....")
# cross_validate fits clones, so the estimator still has to be fitted here
logreg.fit(X_train, y_train)
# ROC AUC ranks observations by score: use the positive-class probability.
# Hard 0/1 labels from predict() collapse the curve to a single threshold
# and understate the AUC relative to the cross-validated scores above.
y_score = logreg.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_score)
print("Prediction done on test data!!!!\n")

print(f"Appending the resulted Model in the Models Dataframe.....")
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
model_row = pd.DataFrame([{'Model Name':'MICE Imputed No Tuning',
                           'Algorithm':'Logistic Regression',
                           'Features':list(X_mice.columns.values),
                           'Target':'0/1',
                           'Train AUC':np.mean(cv_scores['train_score']),
                           'Valid AUC':np.mean(cv_scores['test_score']),
                           'Test AUC':test_auc}])
df_models = pd.concat([df_models, model_row], ignore_index=True)
print(f"Model is added in the Models Dataframe!!!\n")

print(f"ENDED THE MICE IMPUTATION WITH NO TUNING MODEL FITTING!!!!!\n")

#### MICE Imputed with Tuning

In [None]:
# Finding the tuned parameters
# print("Finding the regularization parameter.....")
# c_score = tuning_C(X, y)
# print("Finding the Solver.....")
# solver = tuning_solver(X_mice, y_mice)
# print("Solver found!!!!")
# print("Finding the correct k-folds.....")
# n_splits = tuning_splits(X, y)

In [None]:
# Mice Imputed Model with tuning
print(f"STARTING MICE IMPUTATION WITH TUNING MODEL FITTING>>>\n")
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X_mice, y_mice, random_state = 42, test_size = 0.2)

print("Started KFold Cross Validation.....")
# Tuned estimator: L1 penalty, C=0.1, balanced class weights, liblinear solver
logreg = LogisticRegression(C = 0.1, penalty = 'l1',random_state = 42, solver ='liblinear', max_iter= 3000, class_weight='balanced')
kf = StratifiedKFold(n_splits=8,shuffle=True, random_state=42)
cv_scores = cross_validate(logreg, X_train, y_train, n_jobs=-1, cv=kf, scoring ='roc_auc',return_train_score = True)
print(f"KFold Cross Validation Ended!!!!\n")
print(f"Total time taken to fit the model is {np.sum(cv_scores['fit_time'])/60} min\n")

print("Starting predicting on the Test data....")
# cross_validate fits clones, so the estimator still has to be fitted here
logreg.fit(X_train, y_train)
# ROC AUC ranks observations by score: use the positive-class probability.
# Hard 0/1 labels from predict() collapse the curve to a single threshold
# and understate the AUC relative to the cross-validated scores above.
y_score = logreg.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_score)
print("Prediction done on test data!!!!\n")

print(f"Appending the resulted Model in the Models Dataframe.....")
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
model_row = pd.DataFrame([{'Model Name':'MICE Imputed with Tuning',
                           'Algorithm':'Logistic Regression',
                           'Features':list(X_mice.columns.values),
                           'Target':'0/1',
                           'Train AUC':np.mean(cv_scores['train_score']),
                           'Valid AUC':np.mean(cv_scores['test_score']),
                           'Test AUC':test_auc}])
df_models = pd.concat([df_models, model_row], ignore_index=True)
print(f"Model is added in the Models Dataframe!!!\n")

print(f"ENDED THE MICE IMPUTATION WITH TUNING MODEL FITTING!!!!!\n")

In [None]:
df_models

## Feature Selection in Application 

In [None]:
# Feature Selection using IV
print("IV Calculation begin.....")
iv_df = IV_calculation(X_mice, y_mice, 20)

print("IV Calculation Done!!!!")
## Feature selected from IV are those between 0.03 to 0.1 are evaluated 
## and greater than 0.1 is Retained
iv_df

In [None]:
# Dropping all other columns
application = application_mice[['EXT_SOURCE_3','EXT_SOURCE_2','DAYS_EMPLOYED','DAYS_BIRTH','AMT_GOODS_PRICE','AMT_CREDIT', 
                                'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE','DAYS_REGISTRATION', 'REGION_RATING_CLIENT', 
                                'REGION_RATING_CLIENT_W_CITY', 'TARGET','SK_ID_CURR',
                                'NAME_EDUCATION_TYPE_Higher education']]

# Importing the other datasets and Merging
    Importing the other files and merging them with the reduced application data to try to improve the model's performance.

In [None]:
# Train Shape
print(f'Initial Train Shape:{application.shape}')
print("")


# Combining Dataframes to Train
def merge_train(df):
    """Left-join ``df`` onto the module-level ``application`` frame.

    The join key is ``SK_ID_CURR``. Every row of ``application`` is kept;
    the left side is always ``application`` itself, never the result of a
    previous call.
    """
    return application.merge(df, how='left', on=['SK_ID_CURR'])

# Merge each auxiliary table onto `application` and report the resulting shape.
# NOTE(review): merge_train always joins onto `application`, so these merges are
# NOT cumulative — each `train` is overwritten by the next call, and `final_df`
# contains only `application` joined with `previous_application`. The first four
# merges therefore only serve the shape printouts; confirm this is intended.
# NOTE(review): bureau, POS_CASH_balance, installments_payments, credit_card_balance
# and previous_application are not defined in any cell shown in this notebook —
# presumably loaded elsewhere; confirm they exist before this cell runs.
train = merge_train(bureau)
print(f'New shape after bureau/bureau balance: {train.shape}')
print("")

train = merge_train(POS_CASH_balance)
print(f'New shape after POS_CASH_balance: {train.shape}')
print("")

train = merge_train(installments_payments)
print(f'New shape after installments_payments: {train.shape}')
print("")

train = merge_train(credit_card_balance)
print(f'New shape after credit_card_balance: {train.shape}')
print("")

# The frame used by the rest of the notebook
final_df = merge_train(previous_application)
print(f'New shape after previous_application: {final_df.shape}')
print("")

In [None]:
checking_missing_values(final_df)

In [None]:
final_df = dropping_missing_values(final_df)

In [None]:
X = final_df.drop(['SK_ID_CURR', 'TARGET', 'SK_ID_PREV'], axis = 1)
y = final_df['TARGET']


iv_df = IV_calculation(X, y, 38)

In [None]:
final_df = final_df[['EXT_SOURCE_3', 'EXT_SOURCE_2', 'DAYS_EMPLOYED', 'DAYS_BIRTH','REGION_RATING_CLIENT_W_CITY', 
                    'PRODUCT_COMBINATION', 'REGION_RATING_CLIENT', 'CODE_REJECT_REASON', 'DAYS_LAST_PHONE_CHANGE','NAME_CONTRACT_STATUS',
                    'NAME_EDUCATION_TYPE_Higher education', 'DAYS_ID_PUBLISH', 'AMT_GOODS_PRICE_x', 'AMT_CREDIT_x']]

In [None]:
checking_missing_values(final_df)

In [None]:
final_df = imputing_categorical_missing_values(final_df)

In [None]:
final_df = encoding_categorical(final_df)

In [None]:
X = final_df
y = y

In [None]:
# Splitting the dataset into train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
# Final model: L1-penalised logistic regression with balanced class weights
logreg = LogisticRegression(C=0.1, penalty='l1',class_weight = 'balanced', random_state=42, solver = 'liblinear')
# Seeded 8-fold stratified cross-validation on the training split, scored by ROC AUC
kf = StratifiedKFold(n_splits=8,shuffle=True, random_state=42)
cv_scores = cross_validate(logreg, X_train, y_train, n_jobs=-1, cv=kf, scoring ='roc_auc', return_train_score = True)
# Fit on the full training split for the hold-out evaluation
logreg.fit(X_train, y_train)
# y_pred holds hard class labels (the output of predict), not probabilities
y_pred = logreg.predict(X_test)

In [None]:
# Mean train / validation ROC AUC across the cross-validation folds
print(f"Training ROC: {np.mean(cv_scores['train_score'])}")
print(f"Validating ROC: {np.mean(cv_scores['test_score'])}")
# Hold-out ROC AUC, scored on the positive-class probability: hard labels from
# predict() collapse the curve to a single threshold and understate the AUC.
print(f"Test ROC: {roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])}")