In [229]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Problem Statement

MyHom is a finance company that lends housing loans at the best and most affordable interest rates to customers. In recent times, the company incurred heavy losses due to loan defaults. Most applicants failed to repay the loan as per the promissory note.


In order to avoid such losses, the company has decided to build a system for identifying the loan defaulters automatically based on data. This will help the company to identify the potential applicants and ensure the smooth running of the entire process.


Now, the company challenges the Data Science community to build a smart AI system that predicts whether an applicant will default on the loan in the future.



## About the Dataset


You are provided with past applicants’ data containing demographic information, loan attributes, and a target variable indicating whether an applicant will default on the loan.



## Train and Test Data


The train and test sets contain attributes related to the demographic and loan information of the applicants, such as age, profession, number of active loans, loan default in previous loans, and so on. The training set contains the target variable loan_default, and you need to predict the target variable for the test set.




loan_id- Unique identifier of a loan

age- Age of the Applicant

education- Applicant's education

proof_submitted- Type of proof submitted

loan_amount- Loan Amount Disbursed

asset_cost- The total asset value of the applicant

no_of_loans- No. of the loans taken by the applicant 

no_of_curr_loans- No. of active loans held by the applicant

last_delinq_none- Whether the applicant defaulted on at least one past loan

loan_default (Target Variable)- 0/1 indicating if an applicant will default the loan or not


## Evaluation metric


The evaluation metric for this hackathon would be the macro F1 Score.


In [230]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import MissingIndicator
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

In [231]:
import matplotlib.pyplot as plt

# Use the ggplot look for every matplotlib figure in the notebook.
plt.style.use('ggplot')

import seaborn as sns
# `tqdm.tqdm_notebook` is deprecated. `tqdm.auto` exposes the same `tqdm` name,
# using the widget progress bar inside Jupyter and the text bar elsewhere.
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

import warnings
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation and convergence warnings raised by the models below.
warnings.filterwarnings("ignore")

In [232]:
# Reminder: the competition metric is macro F1, i.e. f1_score(y_true, y_pred, average='macro')

In [233]:
# Locations of the competition files under the read-only input directory.
input_root = '../input'
train_path = f'{input_root}/train-data/train.csv'
test_path = f'{input_root}/test-data/test.csv'
sub_path = f'{input_root}/sample-solution/sample_submission.csv'

In [234]:
# Read the training set, the test set and the sample submission, in that order.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sub_path)
)

In [235]:
# Report the training frame's dimensions, then preview its first 30 rows.
n_rows, n_cols = df_train.shape
print(f'Rows : {n_rows}  Columns : {n_cols}')
print()
df_train.head(30)

In [237]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [238]:
# Print the column dtypes and non-null counts of the test frame.
df_test.info()

In [239]:
# Number of columns in the training frame, followed by their names.
print(df_train.shape[1])
df_train.columns

In [240]:
# Number of columns in the test frame, followed by their names.
print(df_test.shape[1])
df_test.columns

In [241]:
# Count of missing values per column in the training frame.
df_train.isna().sum()

In [242]:
# Keep the label column aside; it is re-attached to the training rows once preprocessing is done.
target = df_train['loan_default'] 

In [243]:
## Checking Target Imbalance
import plotly.graph_objects as go

colors = ['gold', 'mediumturquoise']
# Share of each class among all training rows, ordered by class value.
# value_counts() on its own sorts by frequency, so hard-coded ['0', '1'] labels
# would be attached to the wrong slices whenever class 1 is the majority.
# Taking the labels from the same Series keeps each share paired with its class.
class_share = (
    df_train['loan_default'].value_counts().sort_index()
    / df_train['loan_default'].shape[0]
)
labels = class_share.index.astype(str).tolist()
values = class_share

# Use `hole` to create a donut-like pie chart
fig=go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title_text="Outcome")
fig.show()

In [244]:
# Stack the train and test features into one frame so both receive identical
# preprocessing. The identifier (and, for train, the label) is left out.
train_features = df_train.drop(columns=['loan_id', 'loan_default'])
test_features = df_test.drop(columns=['loan_id'])
df_combo = pd.concat([train_features, test_features], ignore_index=True)

combo_rows, combo_cols = df_combo.shape
print(f'Rows : {combo_rows}  Columns : {combo_cols}')
print()
df_combo.head()

In [245]:
# Summary statistics with extra tail percentiles, transposed to one row per column.
tail_percentiles = [0.05, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
df_combo.describe(percentiles=tail_percentiles).T

In [246]:
# Integer code for each kind of identity proof.
dict1 = {"Aadhar": 0, "VoterID" : 1, "PAN":2, "Driving":3, "Passport": 4 }

# Encode only while the column still holds the raw labels. Mapping a second
# time would look the integer codes up in dict1, find nothing, and turn the
# whole column into NaN, so re-running this cell must be a no-op.
# Labels that are not keys of dict1 still become NaN, as before.
already_encoded = df_combo['proof_submitted'].dropna().isin(list(dict1.values())).all()
if not already_encoded:
    df_combo['proof_submitted'] = df_combo['proof_submitted'].map(dict1)

In [247]:
# Preview the first rows now that `proof_submitted` has been mapped to integer codes.
df_combo.head()

In [248]:
# Flag which `education` entries are missing, and show the flag array's shape.
miss_indicator = MissingIndicator()
education_only = df_combo[['education']]
X_miss = miss_indicator.fit(education_only).transform(education_only)
X_miss.shape

In [249]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
# Regressor handed to the iterative imputer. Despite the `xgb_` prefix this is
# a CatBoost model; the name is kept because the imputer cell refers to it.
# verbose=0: the imputer refits this estimator many times, and CatBoost's
# default per-iteration log would otherwise flood the cell output.
xgb_impt = CatBoostRegressor(verbose=0)

In [250]:
# Fill missing values by regressing each incomplete column on the other columns.
# add_indicator=True appends one indicator column per feature that had missing
#   values at fit time, after the imputed feature columns.
# skip_complete=True: columns without missing values have nothing to impute,
#   so fitting an estimator for them every round is wasted work. Skipping them
#   leaves the result of fit_transform on this frame unchanged.
itr_imputer = IterativeImputer(estimator=xgb_impt, add_indicator=True,
                               skip_complete=True)
X = itr_imputer.fit_transform(df_combo)
X.shape

In [251]:
# Frequency of each `education` value still stored in df_combo at this point.
df_combo['education'].value_counts()

In [252]:
# Frequency of each `last_delinq_none` value.
df_combo['last_delinq_none'].value_counts()


In [253]:
def cap(x):
    """Threshold a number: return 2 when ``x`` is at least 1.5, otherwise 1."""
    return 2 if x >= 1.5 else 1

In [254]:
# Copy the imputed `education` values and their missing-value flag back into
# df_combo. Column positions in the imputer output are derived from the column
# name and the fitted indicator instead of the hard-coded 1 and 8, so a change
# in column order, or another column with gaps, cannot pick the wrong data.
imputed = pd.DataFrame(X, index=df_combo.index)
edu_pos = df_combo.columns.get_loc('education')
# Indices of the input features that received an indicator column, in output order.
flagged_features = list(itr_imputer.indicator_.features_)
# Indicator columns follow the feature columns in the imputer output.
n_feature_cols = imputed.shape[1] - len(flagged_features)
edu_flag_pos = n_feature_cols + flagged_features.index(edu_pos)

df_combo['education'] = imputed[edu_pos]
df_combo['ind_education'] = imputed[edu_flag_pos]
# Optional, currently disabled: snap the imputed values to 1 or 2 with cap().
# df_combo['education'] = df_combo['education'].apply(cap)
df_combo

In [255]:
# Gap between the asset value and the amount lent.
df_combo['diff_amount'] = df_combo['asset_cost'].sub(df_combo['loan_amount'])
# Loans taken minus loans currently active.
df_combo['loan_complected'] = df_combo['no_of_loans'].sub(df_combo['no_of_curr_loans'])


In [257]:
# Count of missing values per column in the combined frame.
df_combo.isna().sum()

In [258]:
# Print the column dtypes and non-null counts of the combined frame.
df_combo.info()

In [259]:
# Frequency of each proof code; dropna=False also counts missing entries.
df_combo['proof_submitted'].value_counts(dropna=False)

In [262]:
# Print the frequency table of every column in the combined frame.
col_names = list(df_combo.columns)
for col in col_names:
    column_counts = df_combo[col].value_counts()
    print(column_counts)

In [263]:
# One histogram with a KDE overlay per column of the combined frame.
# The number of grid rows is derived from the column count: a fixed 4x3 grid
# has only 12 slots, and plt.subplot raises once df_combo holds more columns
# (for example when this cell is re-run after the GMM label cells).
n_plot_cols = 3
n_plot_rows = max(1, -(-len(df_combo.columns) // n_plot_cols))  # ceiling division
plt.figure(figsize=(15, 2.5 * n_plot_rows))  # 15x10 for the original 4 rows
for i,col in enumerate(df_combo.columns,1):
    plt.subplot(n_plot_rows,n_plot_cols,i)
    plt.title(f"Distribution of {col} Data")
    sns.histplot(df_combo[col],kde=True)
# Lay the panels out once, after all of them exist, then render the figure.
plt.tight_layout()
plt.show()

In [264]:
# Names of three count/amount columns. NOTE(review): not referenced by any other cell visible here.
num_col =['asset_cost','no_of_loans','no_of_curr_loans']

In [265]:
# Display the combined frame as it currently stands.
df_combo

In [266]:
from sklearn.mixture import GaussianMixture

n_components = 3
# random_state pins the EM initialisation. Without it the component ids, and
# so the `label_age` feature given to the models, can differ between runs.
gmm = GaussianMixture(n_components = n_components, random_state = 42)

# Fit a 3-component Gaussian mixture to the age values (as a single-column array).
age_values = df_combo['age'].values.reshape(-1,1)
gmm.fit(age_values)

# Assign every row to its most likely component and keep that id as a feature.
labels = gmm.predict(age_values)
df_combo['label_age']= labels
# Plot the density of each of the three clusters in the same plot.
sns.displot(data=df_combo, x='age', hue='label_age', kind='kde', fill=True, palette=sns.color_palette('bright')[:n_components], height=5, aspect=1.5)

In [267]:
from sklearn.mixture import GaussianMixture

n_components = 3
# random_state pins the EM initialisation. Without it the component ids, and
# so the `label_no_of_loans` feature, can differ between runs.
gmm = GaussianMixture(n_components = n_components, random_state = 42)

# Fit a 3-component Gaussian mixture to the loan counts (as a single-column array).
loan_count_values = df_combo['no_of_loans'].values.reshape(-1,1)
gmm.fit(loan_count_values)

# Assign every row to its most likely component and keep that id as a feature.
labels = gmm.predict(loan_count_values)
df_combo['label_no_of_loans']= labels
# Plot the density of each of the three clusters in the same plot.
sns.displot(data=df_combo, x='no_of_loans', hue='label_no_of_loans', kind='kde', fill=True, palette=sns.color_palette('bright')[:n_components], height=5, aspect=1.5)

In [268]:
from sklearn.mixture import GaussianMixture

n_components = 2
# random_state pins the EM initialisation. Without it the component ids, and
# so the `label_no_of_curr_loans` feature, can differ between runs.
gmm = GaussianMixture(n_components = n_components, random_state = 42)

# Fit a 2-component Gaussian mixture to the active-loan counts (as a single-column array).
curr_loan_values = df_combo['no_of_curr_loans'].values.reshape(-1,1)
gmm.fit(curr_loan_values)

# Assign every row to its most likely component and keep that id as a feature.
labels = gmm.predict(curr_loan_values)
df_combo['label_no_of_curr_loans']= labels
# Plot the density of both clusters in the same plot.
sns.displot(data=df_combo, x='no_of_curr_loans', hue='label_no_of_curr_loans', kind='kde', fill=True, palette=sns.color_palette('bright')[:n_components], height=5, aspect=1.5)

### Checking Correlation Plot of df_combo

In [270]:
# Pairwise correlation matrix of the combined feature frame.
corr_matrix = df_combo.corr()
# Boolean mask covering the upper triangle and the diagonal. The matrix is
# symmetric, so those cells only repeat the lower triangle.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig=plt.figure(figsize=(16, 8))
fig.suptitle('Plotting Correlation Matrix\n_________________', y=0.94, horizontalalignment='center', verticalalignment='center',fontsize=18, fontweight='bold',
              fontfamily='serif', color="black")

# The mask was built but never applied; pass it so the redundant half is hidden.
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm')
plt.show()

#### Breaking the combo data into train df and test df again as the preprocessing is done

In [271]:
# The first len(df_train) rows of the combined frame are the training rows.
# .copy() gives `train` its own data, so adding the label column does not
# trigger SettingWithCopyWarning and cannot write through to df_combo.
train = df_combo.head(df_train.shape[0]).copy()
train['target'] = target

# The last len(df_test) rows are the test rows; renumber them from 0.
test = df_combo.tail(df_test.shape[0]).reset_index(drop=True)

##### Checking the correlation plot of `train` with the target column (loan_default) to see how the target correlates with the other columns

In [272]:
# Pearson correlation between every pair of columns in `train`, label included.
corr = train.corr(method='pearson')
diverging_cmap = sns.diverging_palette(150, 30, as_cmap=True)

heat_fig, heat_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, cmap=diverging_cmap, ax=heat_ax)
heat_ax.set_title('Pearson Correlation Plot')
plt.show()

### Modelling

In [273]:
# Feature matrix: every column of `train` except the label.
X = train.drop(columns='target')
# Label vector: the loan_default column set aside before preprocessing.
y = target

# `test` already holds the preprocessed test rows. The former `test = test`
# self-assignment did nothing and has been removed.

In [274]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation. stratify=y keeps the
# default / non-default ratio the same in both parts, so the validation score
# is not distorted by an unlucky share of the minority class.
# Note: x_test / y_test are this validation split, not the competition test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### XGBC

In [275]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score
from sklearn.metrics import classification_report

# scale_pos_weight above 1 gives the positive (default) class more weight.
xgbc = XGBClassifier(scale_pos_weight = 1.5)

# Both splits are passed as eval sets so the per-round metric is logged for each.
xgbc.fit(x_train,y_train,eval_set=[[x_train,y_train],[x_test,y_test]])

y_test_preds_xgbc = xgbc.predict(x_test)

print()
# average='macro': the competition is scored on macro F1. The f1_score default
# ('binary') reports the positive class only and is not comparable to it.
print('F1 Score   '+str(f1_score(y_test, y_test_preds_xgbc, average='macro')))
print()
print(classification_report(y_test, y_test_preds_xgbc))

In [276]:
from catboost import CatBoostClassifier

# auto_class_weights='Balanced' lets CatBoost weight the classes itself.
cbc = CatBoostClassifier(auto_class_weights= 'Balanced')

cbc.fit(x_train,y_train,eval_set=(x_test,y_test))

# Class probabilities and hard labels for the validation split.
y_test_preds_proba_cbc = cbc.predict_proba(x_test)
y_test_preds_cbc = cbc.predict(x_test)

print()
# average='macro': the competition is scored on macro F1. The f1_score default
# ('binary') reports the positive class only and is not comparable to it.
print('F1 Score   '+str(f1_score(y_test, y_test_preds_cbc, average='macro')))
print()
print(classification_report(y_test, y_test_preds_cbc))

In [277]:
import lightgbm as lgb
# is_unbalance=True asks LightGBM to compensate for the class imbalance.
lgc = lgb.LGBMClassifier(is_unbalance= True) 


lgc.fit(x_train,y_train,eval_set=[[x_train,y_train],[x_test,y_test]])

# Class probabilities and hard labels for the validation split.
y_test_preds_proba_lgc = lgc.predict_proba(x_test)
y_test_preds_lgc = lgc.predict(x_test)

# average='macro': the competition is scored on macro F1. The f1_score default
# ('binary') reports the positive class only and is not comparable to it.
print('F1 Score   '+str(f1_score(y_test, y_test_preds_lgc, average='macro')))
print()
print(classification_report(y_test, y_test_preds_lgc))

In [278]:
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' reweights the classes inversely to their frequency.
# random_state makes the forest, and the submission built from it, reproducible.
rf = RandomForestClassifier(max_depth = 5, n_estimators=100, class_weight='balanced',
                            random_state=42)
rf.fit(x_train,y_train)

# Class probabilities and hard labels for the validation split.
y_test_preds_proba_rf = rf.predict_proba(x_test)
y_test_preds_rf = rf.predict(x_test)

# average='macro': the competition is scored on macro F1. The f1_score default
# ('binary') reports the positive class only and is not comparable to it.
print('F1 Score   '+str(f1_score(y_test, y_test_preds_rf, average='macro')))
print()
print(classification_report(y_test, y_test_preds_rf))

In [279]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Predict labels for the preprocessed test rows with the random forest.
# NOTE(review): only `rf` feeds the submission; the other fitted models are not used here.
preds = rf.predict(test)

In [None]:
# Display the test frame as loaded from the CSV.
df_test

In [None]:
# Display the sample submission frame.
df_sub

In [None]:
### final

# Attach the predictions to the test rows. `test` was cut from the tail of
# df_combo, so its row order matches df_test.
df_test['loan_default']= preds

# Take the ids from the sample submission and join the predictions onto them.
# The placeholder `loan_default` column is dropped on a temporary frame, not in
# place: an in-place drop raises KeyError the second time this cell is run.
submission_ids = df_sub.drop(columns=['loan_default'], errors='ignore')
df = pd.merge(submission_ids, df_test[['loan_id','loan_default']], on='loan_id')

df.to_csv('submission.csv',index=False)
df