In [474]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import sklearn.tree# To impute missing values
from sklearn.impute import KNNImputer ,SimpleImputer
from sklearn.tree import DecisionTreeClassifier
# To tune different models
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    RandomForestClassifier    
)

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as imbpipeline

# To perform statistical analysis
import scipy.stats as stats

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    plot_confusion_matrix,
    make_scorer,
)
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

In [566]:
# Load the labelled training data from the working directory.
data = pd.read_csv(filepath_or_buffer="Train_set.csv")


In [432]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93174 entries, 0 to 93173
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          93174 non-null  int64  
 1   loan_amnt                   93174 non-null  int64  
 2   loan_term                   93174 non-null  object 
 3   interest_rate               93174 non-null  float64
 4   loan_grade                  93174 non-null  object 
 5   loan_subgrade               93174 non-null  object 
 6   job_experience              88472 non-null  object 
 7   home_ownership              93174 non-null  object 
 8   annual_income               93173 non-null  float64
 9   income_verification_status  93174 non-null  object 
 10  loan_purpose                93174 non-null  object 
 11  state_code                  93174 non-null  object 
 12  debt_to_income              93174 non-null  float64
 13  delinq_2yrs                 931

In [623]:
# Load the test data from the working directory.
dt = pd.read_csv(filepath_or_buffer="Test_set.csv")

In [129]:
# Summarise the test frame: row count, per-column non-null counts and dtypes.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39933 entries, 0 to 39932
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          39933 non-null  int64  
 1   loan_amnt                   39933 non-null  int64  
 2   loan_term                   39933 non-null  object 
 3   interest_rate               39933 non-null  float64
 4   loan_grade                  39933 non-null  object 
 5   loan_subgrade               39933 non-null  object 
 6   job_experience              37844 non-null  object 
 7   home_ownership              39933 non-null  object 
 8   annual_income               39933 non-null  float64
 9   income_verification_status  39933 non-null  object 
 10  loan_purpose                39933 non-null  object 
 11  state_code                  39933 non-null  object 
 12  debt_to_income              39933 non-null  float64
 13  delinq_2yrs                 399

In [567]:
# Work on a deep copy so the raw `data` frame stays untouched by later cleaning steps.
df = data.copy(deep=True)

In [None]:
# Remove columns that are not used as model features.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for already-removed columns.
df = df.drop(columns=['state_code', 'loan_subgrade', 'ID'], errors="ignore")

In [624]:
# Remove the same non-feature columns from the test frame.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for already-removed columns.
dt = dt.drop(columns=['state_code', 'loan_subgrade', 'ID'], errors="ignore")

In [586]:
# Cast every object-typed column of the training frame to the pandas category dtype.
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'object':
        df[column_name] = df[column_name].astype('category')

In [625]:
# Cast every object-typed column of the test frame to the pandas category dtype.
for column_name, column_dtype in dt.dtypes.items():
    if column_dtype == 'object':
        dt[column_name] = dt[column_name].astype('category')

In [583]:
# Strip the ' years' suffix so loan_term holds only the numeric part (still as text).
df['loan_term'] = df['loan_term'].str.replace(pat=' years', repl='')

In [626]:
# Strip the ' years' suffix so loan_term holds only the numeric part (still as text).
dt['loan_term'] = dt['loan_term'].str.replace(pat=' years', repl='')

In [572]:
# Relabel the '<5 Years' bucket with the short code 'LT5'.
df['job_experience'] = df['job_experience'].str.replace(pat='<5 Years', repl='LT5')

In [627]:
# Relabel the '<5 Years' bucket with the short code 'LT5'.
dt['job_experience'] = dt['job_experience'].str.replace(pat='<5 Years', repl='LT5')

In [578]:
# List the distinct job_experience labels in the training frame (NaN = missing).
df['job_experience'].unique()

array(['LT5', nan, 'GT10', 'LT10'], dtype=object)

In [632]:
# List the distinct job_experience labels in the test frame (NaN = missing).
dt['job_experience'].unique()

array(['LT5', 'GT10', 'LT10', nan], dtype=object)

In [574]:
# Remove the literal '+' character from the labels.
# '+' is a regex metacharacter and the default of `regex` differs between pandas
# versions, so request literal matching explicitly.
df['job_experience'] = df['job_experience'].str.replace('+', '', regex=False)

In [629]:
# Remove the literal '+' character from the labels.
# '+' is a regex metacharacter and the default of `regex` differs between pandas
# versions, so request literal matching explicitly.
dt['job_experience'] = dt['job_experience'].str.replace('+', '', regex=False)

In [575]:
# Replace every '10 years' substring with the short code 'GT10'.
df['job_experience'] = df['job_experience'].str.replace(pat='10 years', repl='GT10')

In [630]:
# Replace every '10 years' substring with the short code 'GT10'.
dt['job_experience'] = dt['job_experience'].str.replace(pat='10 years', repl='GT10')

In [576]:
# Collapse '6-GT10' into the short code 'LT10'.
# NOTE(review): '6-GT10' presumably arises from a '6-10 years' label after the
# '10 years' -> 'GT10' substitution has run, so cell order matters - confirm.
df['job_experience'] = df['job_experience'].str.replace(pat='6-GT10', repl='LT10')

In [631]:
# Collapse '6-GT10' into the short code 'LT10'.
# NOTE(review): '6-GT10' presumably arises from a '6-10 years' label after the
# '10 years' -> 'GT10' substitution has run, so cell order matters - confirm.
dt['job_experience'] = dt['job_experience'].str.replace(pat='6-GT10', repl='LT10')

In [443]:
#df['job_experience']=df['job_experience'].fillna('NONE')

In [143]:
#dt['job_experience']=dt['job_experience'].fillna('NONE')

In [589]:
# Separate the target column from the feature matrix.
y = df["default"]
X = df.drop(columns=["default"])

In [633]:
# Feature matrix for the test file: all rows and all columns of dt as it stands here.
# NOTE(review): iloc[:, :] may share data with dt rather than copy it - confirm this
# is intended, since dt is rebound later in the notebook.
Xt= dt.iloc[:,:]

In [590]:
# Hold out 30% of the labelled rows; stratify on y so both parts keep the same
# class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
    test_size=0.3,
)

In [67]:
#from sklearn.preprocessing import MinMaxScaler
# define min max scaler
#scaler = MinMaxScaler()
 
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled =scaler.fit_transform(X_test)

In [176]:
# Show (rows, columns) of the training split followed by the hold-out split.
print(*(split.shape for split in (X_train, X_test)))

(65221, 19) (27953, 19)


In [592]:
# Preview the first rows of the training features after cleaning.
X_train.head()

Unnamed: 0,loan_amnt,loan_term,interest_rate,loan_grade,job_experience,home_ownership,annual_income,income_verification_status,loan_purpose,debt_to_income,delinq_2yrs,public_records,revolving_balance,total_acc,interest_receive,application_type,last_week_pay,total_current_balance,total_revolving_limit
40883,18000,3,10.64,B,LT5,MORTGAGE,62000.0,Verified,debt_consolidation,20.11,1.0,0.0,4993,19.0,2922.63,INDIVIDUAL,122.0,168705.0,8300.0
70146,12000,5,13.33,C,GT10,MORTGAGE,65000.0,Not Verified,debt_consolidation,27.53,0.0,1.0,6539,16.0,377.39,INDIVIDUAL,13.0,58054.0,7300.0
84472,24000,3,7.9,A,LT5,RENT,102000.0,Verified,credit_card,11.6,0.0,0.0,14564,24.0,2684.27,INDIVIDUAL,113.0,28443.0,23600.0
81649,15000,3,14.31,C,GT10,MORTGAGE,76000.0,Source Verified,credit_card,32.95,0.0,0.0,28111,18.0,1738.33,INDIVIDUAL,52.0,210143.0,41900.0
47161,10000,3,8.18,B,GT10,OWN,61000.0,Source Verified,debt_consolidation,9.33,2.0,0.0,11382,44.0,253.47,INDIVIDUAL,17.0,196576.0,48400.0


In [116]:
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot a classifier's confusion matrix as an annotated heatmap.

    Each cell shows the raw count and, on a second line, that count as a
    percentage of all evaluated samples.

    model: fitted classifier exposing ``predict``
    predictors: independent variables passed to ``model.predict``
    target: true labels corresponding to ``predictors``

    Draws on a new 6x4 matplotlib figure; nothing is returned.
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)

    # The grand total is loop-invariant: compute it once, not once per cell.
    total = cm.sum()

    # Shape the annotations like `cm` itself instead of a hard-coded (2, 2), so the
    # helper also works when the matrix is 1x1 (single class) or KxK (multiclass).
    labels = np.asarray(
        [
            "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
            for count in cm.flatten()
        ]
    ).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

In [593]:
# Missing-data imputation and feature preprocessing

# Numeric columns: fill gaps with the column median, then standardise.
numerical_features = [
    'annual_income',
    'delinq_2yrs',
    'public_records',
    'total_acc',
    'last_week_pay',
    'total_current_balance',
    'total_revolving_limit',
]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("sc1", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories unseen during fit.
categorical_features = [
    'loan_grade',
    'job_experience',
    'home_ownership',
    'income_verification_status',
    'loan_purpose',
    'application_type',
]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer; columns listed in neither group are
# passed through unchanged (remainder="passthrough").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

In [681]:
# Final pipeline: shared preprocessing followed by a gradient-boosting classifier.
# NOTE: the step is named "XGB" but holds scikit-learn's GradientBoostingClassifier,
# not xgboost; the name is kept so any lookup by step name keeps working.
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "XGB",
            GradientBoostingClassifier(
                init=AdaBoostClassifier(random_state=1),
                n_estimators=3000,
                learning_rate=0.2,
                subsample=0.9,
                max_features=0.9,
                criterion='friedman_mse',
                max_depth=9,
                warm_start=True,
                # subsample < 1 and max_features < 1 make each tree draw random
                # rows/features; seeding makes the fit reproducible across runs.
                random_state=1,
            ),
        ),
    ]
)
# Fit the model on training data
model.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('sc1',
                                                                   StandardScaler())]),
                                                  ['annual_income',
                                                   'delinq_2yrs',
                                                   'public_records',
                                                   'total_acc', 'last_week_pay',
                                                   'total_current_balance',
                                                   'total_revolving_limit']),
                                                 ('cat',
                      

In [329]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Score a classifier and return its metrics as a one-row DataFrame.

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a DataFrame with index [0] and columns Accuracy, Recall, Precision, F1.
    """
    # Predict once and reuse the predictions for every metric.
    predictions = model.predict(predictors)

    # Column name -> scoring function, in the order the columns should appear.
    scorers = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }
    scores = {name: scorer(target, predictions) for name, scorer in scorers.items()}

    return pd.DataFrame(scores, index=[0])

In [73]:
# NOTE: this re-defines the helper of the same name declared earlier in the notebook;
# whichever definition was executed last is the one in effect.
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot a classifier's confusion matrix as an annotated heatmap.

    Each cell shows the raw count and, on a second line, that count as a
    percentage of all evaluated samples.

    model: fitted classifier exposing ``predict``
    predictors: independent variables passed to ``model.predict``
    target: true labels corresponding to ``predictors``

    Draws on a new 6x4 matplotlib figure; nothing is returned.
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)

    # The grand total is loop-invariant: compute it once, not once per cell.
    total = cm.sum()

    # Shape the annotations like `cm` itself instead of a hard-coded (2, 2), so the
    # helper also works when the matrix is 1x1 (single class) or KxK (multiclass).
    labels = np.asarray(
        [
            "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
            for count in cm.flatten()
        ]
    ).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

In [682]:
# Metrics on the training split (X_train / y_train), i.e. the rows the model was
# fitted on. The printed label previously said "Testing", which mislabelled them.
model_perf = model_performance_classification_sklearn(
    model, X_train, y_train
)
print("Training performance (train split of Train_set.csv):")
model_perf

Testing performance for Train.csv:


Unnamed: 0,Accuracy,Recall,Precision,F1
0,1.0,1.0,1.0,1.0


In [683]:
# Metrics on the hold-out split (X_test / y_test) produced by train_test_split,
# not on the separate test file.
model_perf = model_performance_classification_sklearn(
    model=model,
    predictors=X_test,
    target=y_test,
)
print("Testing performance for Train.csv:")
model_perf

Testing performance for Train.csv:


Unnamed: 0,Accuracy,Recall,Precision,F1
0,0.868887,0.580358,0.814243,0.677689


In [686]:
# Predict labels for the test-file features (Xt) with the fitted pipeline.
var= model.predict(Xt)

In [687]:
# Start an empty frame that will collect the output columns.
result = pd.DataFrame()

In [690]:
# Store the predicted labels in the 'default' column.
result['default']=var

In [688]:
# Reload the raw test file: 'ID' was dropped from dt during cleaning and is needed
# again to label the predictions. This rebinds dt to the uncleaned data.
dt = pd.read_csv(filepath_or_buffer="Test_set.csv")

In [689]:
# Attach the ID of each test row. Assigning a Series matches rows by index label,
# so this relies on dt and result sharing the same row index - TODO confirm.
result['ID']= dt['ID']

In [691]:
# Preview the first rows of the output frame.
result.head()

Unnamed: 0,ID,default
0,4855329,0
1,66862420,0
2,3637416,1
3,53682249,0
4,53937165,0


In [693]:
# Write the predictions to disk. index=False keeps the auto-generated RangeIndex out
# of the file; otherwise it is written as an extra unnamed first column.
result.to_csv('results9.csv', index=False)