# Loan Payment Data

# Data Description

### Context
This data set includes customers who have paid off their loans, customers who went past due and were put into collection without paying back their loan and interest, and customers who paid off only after they were put into collection. The financial product is a bullet loan: customers are expected to pay off their entire loan debt in a single payment by the end of the term, rather than following an installment schedule. They may, of course, pay off earlier than scheduled.

### Content

* **Loan_id**:  A unique loan number assigned to each loan customer

* **Loan_status**:  Whether a loan is paid off, in collection, held by a new customer yet to pay off, or paid off after the collection efforts

* **Principal**:  Basic principal loan amount at the origination

* **Effective_date**:  When the loan was originated and took effect

* **Due_date**:  Since it's a one-time payoff schedule, each loan has a single due date

* **Paidoff_time**:  The actual time a customer pays off the loan

* **Pastdue_days**:  How many days a loan has been past due

* **Age, education, gender**:  A customer’s basic demographic information

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw loan-payment records and preview the first rows.
csv_path = '../input/loandata/Loan payments data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

# EDA and Preprocessing

## Loan_ID  
A unique loan number assigned to each loan customer

It is just an identifier and has no predictive effect on modelling, so it is dropped.

In [None]:
# Remove the identifier column in place.
df.drop(columns=['Loan_ID'], inplace=True)

## loan_status 
Whether a loan is paid off, in collection, new customer yet to payoff, or paid off after the collection efforts

In [None]:
# Frequency of each loan status.
df['loan_status'].value_counts()

## Principal
Basic principal loan amount at the origination

In [None]:
# Frequency of each principal amount.
df['Principal'].value_counts()

## terms
Can be weekly (7 days), biweekly, and monthly payoff schedule

In [None]:
# Frequency of each loan term.
df['terms'].value_counts()

## Effective_date
When the loan was originated and took effect

In [None]:
# Frequency of each origination date.
df['effective_date'].value_counts()

In [None]:
# Parse the origination dates into pandas datetimes and show the new dtype.
effective_parsed = pd.to_datetime(df['effective_date'])
df['effective_date'] = effective_parsed
df['effective_date'].dtype

## Due_date
Since it's a one-time payoff schedule, each loan has a single due date

In [None]:
# Frequency of each due date.
df['due_date'].value_counts()

In [None]:
# Parse the due dates into pandas datetimes and show the new dtype.
due_parsed = pd.to_datetime(df['due_date'])
df['due_date'] = due_parsed
df['due_date'].dtype

## Paidoff_time
The actual time a customer pays off the loan

In [None]:
# Rows whose loan was paid off only after going to collection.
df.loc[df['loan_status'] == 'COLLECTION_PAIDOFF']

In [None]:
# Rows whose loan is still in collection.
df.loc[df['loan_status'] == 'COLLECTION']

### paid_off_time is 'NaN' when loan status is 'COLLECTION'

In [None]:
# Convert to datetime data type, keeping only the calendar day.
# `.dt.normalize()` resets the time-of-day to midnight while staying
# datetime64; `.dt.date` would instead produce an object column of
# Python `date` values that has to be parsed a second time.
df['paid_off_time'] = pd.to_datetime(df['paid_off_time']).dt.normalize()
df['paid_off_time'].dtype

In [None]:
# Make sure paid_off_time is stored with a datetime dtype, then display it.
df['paid_off_time'] = pd.to_datetime(df['paid_off_time'])
df['paid_off_time']

In [None]:
# Replace missing pay-off times with the placeholder 0.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `df[col]` operates on a temporary object under pandas Copy-on-Write and
# would leave `df` unchanged.
# NOTE(review): 0 is not a timestamp, so the column ends up with mixed types;
# that is tolerable only because the date columns are dropped before modelling.
df['paid_off_time'] = df['paid_off_time'].fillna(0)

## Pastdue_days
How many days a loan has been past due

In [None]:
# Rows whose loan was paid off.
df.loc[df['loan_status'] == 'PAIDOFF']

### past_due_days is 'NaN' when loan status is 'PAIDOFF'

In [None]:
# Treat a missing past_due_days value as 0 days.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `df[col]` operates on a temporary object under pandas Copy-on-Write and
# would leave `df` unchanged.
df['past_due_days'] = df['past_due_days'].fillna(0)

In [None]:
# Re-check the number of missing values per column after the fills above.
df.isna().sum()

### No more missing values

## Age, education, gender
A customer’s basic demographic information

In [None]:
# One count plot per demographic column, placed in the first three cells of a
# 4x4 subplot grid. `enumerate(..., start=1)` supplies the 1-based subplot
# index, replacing the manual counter and the always-true `if i<=3` guard.
plt.figure(figsize=[20,15])
for i, col in enumerate(['age', 'education', 'Gender'], start=1):
    plt.subplot(4, 4, i)
    sns.countplot(x=col, data=df)
    plt.xticks(rotation=45)

In [None]:
# Bar plot of age per loan status, split by gender.
plt.figure(figsize=[12, 5])
sns.barplot(data=df, x='loan_status', y='age', hue='Gender');

In [None]:
# Number of loans per (Gender, education) pair as a horizontal bar chart.
group_sizes = df.groupby(['Gender', 'education'])['loan_status'].count()
group_sizes.plot(kind='barh');

In [None]:
# Pairwise relationship plots for the frame; the trailing `;` suppresses the
# notebook's text output for the returned object.
sns.pairplot(df);

##  We can consider only past_due_days and drop all dates 

In [None]:
# Drop the three date columns in place, then preview what is left.
date_columns = ['effective_date', 'due_date', 'paid_off_time']
df.drop(columns=date_columns, inplace=True)
df.head()

# Splitting Features and Label

In [None]:
# The label is loan_status; every other column is a feature.
y = df['loan_status']
X = df.drop(columns='loan_status')

In [None]:
# One-hot encode the categorical feature columns, dropping the first level of
# each, then preview the encoded frame.
X = pd.get_dummies(data=X, drop_first=True)
X.head()

In [None]:
# Encode the loan status as integer class labels.
# `map` builds a new Series instead of mutating `y` in place, so the original
# `df['loan_status']` column is never written to, and the explicit cast
# guarantees an integer dtype rather than depending on `replace` to downcast
# an object column.
status_codes = {'PAIDOFF': 0, 'COLLECTION_PAIDOFF': 1, 'COLLECTION': 2}
y = y.map(status_codes)
if y.isna().any():
    # A status missing from `status_codes` would otherwise become NaN silently.
    raise ValueError('loan_status contains values without an integer code')
y = y.astype(int)
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## First Modeling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Classifier classes (not instances) to compare; each is instantiated with its
# default arguments where it is used.
models = [
    LogisticRegression,
    KNeighborsClassifier,
    SVC,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    XGBClassifier,
    LGBMClassifier,
    CatBoostClassifier,
]

In [None]:
# Fit every candidate with its default hyperparameters and score it on the
# held-out test split. Scores are stored as percentages.
Score_accuracy = []
Score_f1 = []
ModelName = []

for model in models:
    alg = model().fit(X_train, y_train)
    y_pred = alg.predict(X_test)
    Score_accuracy.append(accuracy_score(y_test, y_pred) * 100)
    Score_f1.append(f1_score(y_test, y_pred, average='weighted') * 100)
    ModelName.append(model.__name__)

# Build the summary table once, after the loop, rather than on every
# iteration; this also defines `Results` when `models` is empty.
Results = pd.DataFrame({'ModelName': ModelName,
                        'Score_accuracy': Score_accuracy,
                        'Score_f1': Score_f1})

In [None]:
# Display the baseline score table.
Results

In [None]:
# Two side-by-side bar charts (accuracy, weighted F1), each sorted from the
# best to the worst model.
plt.figure(figsize=[12, 8])
for position, metric in enumerate(['Score_accuracy', 'Score_f1'], start=1):
    plt.subplot(2, 2, position)
    ranked = Results.sort_values(metric, ascending=False)
    sns.barplot(x='ModelName', y=metric, data=ranked)
    plt.title(metric)
    plt.xticks(rotation=90)

## Hyperparameter Optimization

In [None]:
# Start fresh, independent result lists for the tuned models.
ModelName, Score_f1, Score_accuracy = [], [], []

In [None]:
# LogisticRegression: fitted with the liblinear solver (no grid search here).
tuned = LogisticRegression(solver='liblinear')
tuned.fit(X_train, y_train)
test_predictions = tuned.predict(X_test)
ModelName.append(LogisticRegression.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# KNeighborsClassifier: 5-fold grid search over the number of neighbours.
model = KNeighborsClassifier()
params = {'n_neighbors': np.arange(1, 25)}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(KNeighborsClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# Support Vector Machine: 5-fold grid search over C and the kernel.
model = SVC()
params = {'C': np.arange(1, 10), 'kernel': ['linear', 'rbf']}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(SVC.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# MLPClassifier: 5-fold grid search over alpha and the hidden layer sizes.
model = MLPClassifier()
params = {'alpha': [1, 0.1, 0.01, 0.03, 0.005, 0.0001],
          'hidden_layer_sizes': [(10, 10), (100, 100), (3, 5)]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(MLPClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# DecisionTreeClassifier: 5-fold grid search over min_samples_split and max_depth.
model = DecisionTreeClassifier()
params = {'min_samples_split': [2, 5, 10, 20], 'max_depth': [3, 5, 8]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(DecisionTreeClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# RandomForestClassifier: 5-fold grid search over n_estimators, max_features
# and min_samples_split.
model = RandomForestClassifier()
params = {'n_estimators': [100, 200, 500],
          'max_features': [3, 5, 8],
          'min_samples_split': [3, 8]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(RandomForestClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# GradientBoostingClassifier: 5-fold grid search over n_estimators,
# learning_rate and max_depth.
model = GradientBoostingClassifier()
params = {'n_estimators': [100, 300, 500],
          'learning_rate': [0.1, 0.01, 0.001],
          'max_depth': [2, 3, 5]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(GradientBoostingClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# XGBClassifier: fitted with its default arguments (no grid search here).
tuned = XGBClassifier()
tuned.fit(X_train, y_train)
test_predictions = tuned.predict(X_test)
ModelName.append(XGBClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# LGBMClassifier: 5-fold grid search over learning_rate, n_estimators and max_depth.
model = LGBMClassifier()
params = {'learning_rate': [0.001, 0.01, 0.1],
          'n_estimators': [200, 500, 1000],
          'max_depth': [1, 2, 3, 5, 8]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(LGBMClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# CatBoostClassifier: 5-fold grid search over iterations, learning_rate and depth.
model = CatBoostClassifier()
params = {'iterations': [200, 500], 'learning_rate': [0.01, 0.03], 'depth': [4, 8]}
search = GridSearchCV(model, params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)
# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
tuned = search.best_estimator_
test_predictions = tuned.predict(X_test)  # predict once, score twice
ModelName.append(CatBoostClassifier.__name__)
Score_f1.append(f1_score(y_test, test_predictions, average='weighted'))
Score_accuracy.append(accuracy_score(y_test, test_predictions))

In [None]:
# Collect the tuned-model scores into one table and display it.
tuned_columns = {
    'ModelName': ModelName,
    'Score_f1': Score_f1,
    'Score_accuracy': Score_accuracy,
}
Results_best = pd.DataFrame(tuned_columns)
Results_best

In [None]:
# Two side-by-side bar charts (accuracy, weighted F1) for the tuned models,
# each sorted from the best to the worst model.
plt.figure(figsize=[15, 8])
for position, metric in enumerate(['Score_accuracy', 'Score_f1'], start=1):
    plt.subplot(2, 2, position)
    ranked = Results_best.sort_values(metric, ascending=False)
    sns.barplot(x='ModelName', y=metric, data=ranked)
    plt.title(metric)
    plt.xticks(rotation=90)