In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, auc, roc_curve, roc_auc_score, balanced_accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Reference timestamp captured once at run time; the APPLICANT_AGE and
# DISBURSAL_AGE columns computed later are measured against it, so their
# values shift with the day the notebook is executed.
dt = datetime.today()


def removeOutlier(dataSet, feature):
    """Return the rows of ``dataSet`` whose ``feature`` value lies strictly
    inside the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    The input frame is not modified; a filtered frame is returned.
    """
    values = dataSet[feature]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    # Both comparisons are strict, so values sitting exactly on a fence are dropped.
    inside_fences = (values < high_fence) & (values > low_fence)
    return dataSet[inside_fences]

def modelEvaluation(model, X_test, y_train,y_test, y_pred,cols=None):
    """Print classification metrics for a fitted binary classifier and plot its ROC curve.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : DataFrame of held-out features.
    y_train : not used by this function; kept so existing positional calls still work.
    y_test : true labels matching ``X_test``.
    y_pred : hard class predictions for ``X_test``.
    cols : optional column labels selecting the features passed to ``model``;
        when omitted, every column of ``X_test`` is used.
    """
    # Indexing with None would raise, so fall back to the full feature frame.
    features = X_test if cols is None else X_test[cols]
    # Column 1 of predict_proba is the score for the second class (label 1 here).
    prediction_probabilities = model.predict_proba(features)[:,1]

    print("Accurancy: {:.3f}".format(accuracy_score(y_test, y_pred)))
    # ROC AUC is a ranking metric: it must be computed from scores, not from
    # thresholded labels (which would merely reproduce the balanced accuracy).
    print("ROC AUC Score: {:.3f}".format(roc_auc_score(y_test, prediction_probabilities)))
    print("F1 Score:: {:.3f} ".format(f1_score(y_test, y_pred)))
    print("Balanced Accurancy Score:: {:.3f} ".format(balanced_accuracy_score(y_test, y_pred)))
    print('\n clasification report:\n', classification_report(y_test,y_pred))

    fig = plt.figure(figsize=(10,6))
    ax = fig.add_subplot(111)
    fpr , tpr , thresholds = roc_curve(y_test,prediction_probabilities)
    # A single string label renders as one clean legend entry.
    ax.plot(fpr,tpr,label="Area under curve : {:.3f}".format(auc(fpr,tpr)),linewidth=2,linestyle="dotted")
    # Diagonal = performance of a random classifier.
    ax.plot([0,1],[0,1],linewidth=2,linestyle="dashed")
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.legend(loc="best")
    ax.set_title("ROC-CURVE and AREA UNDER CURVE")
    ax.set_facecolor("k")
    
def calculate_age(row):
    """Convert the 'AVERAGE_ACCT_AGE' and 'CREDIT_HISTORY_LENGTH' entries of
    ``row`` from text such as '1yrs 6mon' into fractional years (2 decimals).

    The row is updated in place and returned. On any parsing failure the row
    and the error are printed and the exception is re-raised.
    """
    def _as_years(text):
        # Expect exactly two space-separated tokens: '<n>yrs' and '<m>mon'.
        years_part, months_part = tuple(text.split(' '))
        years = float(years_part.replace('yrs', ''))
        months = float(months_part.replace('mon', ''))
        return round(years + months/12, 2)

    try:
        for field in ('AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH'):
            row[field] = _as_years(row[field])
    except Exception as e:
        print(row, e)
        raise e
    return row


def binning_by_depth_factor(df_column, factor):
    """Bucket a numeric Series into coarse integer bins.

    The number of divisions is ``round(sqrt(len(df_column)) / factor)`` (at
    least 1) and the bin width is ``(max - min) / divisions``. Each value is
    mapped to ``int(value / width) * divisions``.

    Parameters
    ----------
    df_column : numeric pandas Series.
    factor : divisor applied to ``sqrt(n)``; larger values give fewer bins.

    Returns
    -------
    Integer Series aligned with ``df_column``. An empty or constant column
    (bin width of 0 or NaN) maps every value to bin 0.
    """
    # Short columns would round to 0 divisions and divide by zero below.
    divs = max(1, int(round(np.sqrt(len(df_column)) / factor)))
    max_da, min_da = df_column.max(), df_column.min()
    step = (max_da - min_da) / divs
    # `not step > 0` also catches NaN (empty column); dividing by such a step
    # would give inf/NaN that cannot be cast to int.
    if not step > 0:
        return pd.Series(0, index=df_column.index, name=df_column.name, dtype=int)
    # NOTE(review): the bin number is scaled by `divs`, not by `step`, so the
    # result is not in the column's original units — confirm this is intended.
    return (df_column / step).astype(int) * divs


# Locations of the Kaggle-provided input files.
training_csv = '/kaggle/input/vehicle-loan-default-prediction/train.csv'
test_csv = '/kaggle/input/vehicle-loan-default-prediction/test.csv'

# Read the training file and keep only the rows that carry a target label.
train_df = pd.read_csv(training_csv).dropna(subset=['LOAN_DEFAULT'])
print(train_df.columns)

train_df.head()

# Data Exploration

In [None]:
# Summary statistics for the columns of the training frame.
train_df.describe()

In [None]:
# Count each target class once, draw the pie chart from those counts, then
# show the counts themselves as the cell output.
default_counts = train_df['LOAN_DEFAULT'].astype(bool).value_counts()
default_counts.plot.pie()
default_counts

> The pie chart of the target class shows that the data is imbalanced, skewed towards False (non-default); this imbalance needs to be accounted for.
>
> A resampling technique (upsampling the minority class) will be applied to balance the data.

In [None]:
print('Before Resampling:')
print(train_df['LOAN_DEFAULT'].value_counts())

# Split the frame by target value: 0 is treated as the majority class and 1 as
# the minority class.
target_values = train_df['LOAN_DEFAULT']
df_majority = train_df.loc[target_values == 0]
df_minority = train_df.loc[target_values == 1]

# Draw minority rows with replacement until both classes have the same size;
# the fixed random_state keeps the draw reproducible.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

print('After Resampling:')
print(df_upsampled['LOAN_DEFAULT'].value_counts())


In [None]:
flags = ['MOBILENO_AVL_FLAG', 'AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG', 'DRIVING_FLAG', 'PASSPORT_FLAG']
fig, axs = plt.subplots(len(flags), 2, figsize=(10, 10))

# One row of pies per flag, one pie per flag value, each split by LOAN_DEFAULT.
for _i, flag in enumerate(flags):
    # Group on the bare column name: a one-element list makes recent pandas
    # return 1-tuples as keys, which would show up as "(1,)" in the titles.
    flag_groups = list(train_df.groupby(flag))
    for i, (vals, group) in enumerate(flag_groups):
        group.groupby(['LOAN_DEFAULT']).size().plot.pie(ax=axs[_i][i])
        axs[_i][i].set_title(str(flag)+' '+str(vals))
    # Hide panels left unused when a flag takes fewer than two distinct values.
    for unused_ax in axs[_i][len(flag_groups):]:
        unused_ax.axis('off')
plt.tight_layout()

In [None]:

cols = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV']
fig, axs = plt.subplots(len(cols), figsize=(10, 5*len(cols)))

# For every column draw one line per target class: number of rows per bin.
for j, col in enumerate(cols):
    for i, (val, group) in enumerate(train_df.groupby('LOAN_DEFAULT')):
        column = binning_by_depth_factor(group[col], 10)
        # Name the index explicitly before resetting it: recent pandas names the
        # value_counts index after the source column, so relying on the
        # implicit 'index' column label raises a KeyError there.
        bin_counts = (
            column.value_counts()
                  .rename_axis('index')
                  .reset_index(name='counts')
                  .sort_values('index')
        )
        bin_counts.plot(x='index', y='counts', label=val, ax=axs[j])
    axs[j].set_xlabel(col)
    axs[j].set_ylabel('counts')


> Disbursed amount and asset cost show a roughly Gaussian distribution in both classes, but with different frequencies.
>
> This shows that the defaulters are defaulting mostly in lesser amounts of loan and for lesser asset costs.
>
> Additionally as expected, lower LTV value customers are also defaulting.

In [None]:
# Parse both date columns as day-month-year; values that do not match become NaT.
train_df['DOB'] = pd.to_datetime(train_df['DATE_OF_BIRTH'], format='%d-%m-%Y', errors='coerce')
train_df['DD'] = pd.to_datetime(train_df['DISBURSAL_DATE'], format='%d-%m-%Y', errors='coerce')

# Discard rows whose dates failed to parse; copy so the new columns below are
# written to an independent frame.
train_df = train_df.dropna(subset=['DOB', 'DD']).copy()

# Elapsed time up to `dt`, in fractional years (vectorised via the .dt accessor).
train_df['APPLICANT_AGE'] = (dt - train_df['DOB']).dt.days / 365.0
train_df['DISBURSAL_AGE'] = (dt - train_df['DD']).dt.days / 365.0

cols = ['APPLICANT_AGE', 'DISBURSAL_AGE']
class_groups = list(train_df.groupby('LOAN_DEFAULT'))

# One figure per age column with one box plot per target class. The panel count
# follows the number of classes rather than the number of plotted columns.
for col in cols:
    fig, axs = plt.subplots(1, len(class_groups), figsize=(20, 5), sharey=True, squeeze=False)
    for ax, (val, group) in zip(axs[0], class_groups):
        group[col].plot.box(ax=ax, label=val)
        ax.set_title(col)
        ax.set_xlabel("Loan Default: %s"%bool(val))
    plt.show()

> As seen from the box plots, Applicant Age seems to have almost no impact on the defaulting, similar behaviour is seen in Disbursal Age

In [None]:
# Minimum and maximum score per description label, ordered by the minimum.
# String aggregation names give stable 'min'/'max' column labels; the labels
# derived from np.min/np.max ('amin'/'amax') are not produced by current pandas.
(
    train_df.groupby('PERFORM_CNS_SCORE_DESCRIPTION')
            .agg({'PERFORM_CNS_SCORE': ['min', 'max']})
            .sort_values(('PERFORM_CNS_SCORE', 'min'))
)

> PERFORM_CNS_SCORE_DESCRIPTION seems to be a bucket label for PERFORM_CNS_SCORE

# Data Cleaning

In [None]:
# Keep only labelled rows.
train_df = train_df[~train_df['LOAN_DEFAULT'].isna()]

total_records = len(train_df)

analysis = []

# Convert the '<n>yrs <m>mon' text columns into fractional years.
train_df = train_df.apply(calculate_age, axis=1)
train_df['AVERAGE_ACCT_AGE'] = train_df['AVERAGE_ACCT_AGE'].astype(float)
train_df['CREDIT_HISTORY_LENGTH'] = train_df['CREDIT_HISTORY_LENGTH'].astype(float)

# Identity-document flags become booleans.
for flag_column in ['AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG', 'DRIVING_FLAG', 'PASSPORT_FLAG']:
    train_df[flag_column] = train_df[flag_column].astype(bool)

# The exploration cell parses these columns with format '%d-%m-%Y', i.e. the
# dates are day-first; without dayfirst=True an ambiguous value such as
# 03-01-1990 would be read month-first.
train_df['DATE_OF_BIRTH'] = pd.to_datetime(train_df['DATE_OF_BIRTH'], dayfirst=True)
train_df['DISBURSAL_DATE'] = pd.to_datetime(train_df['DISBURSAL_DATE'], dayfirst=True)

# Fractional years elapsed up to `dt`, computed the same way as in the
# exploration cell. Dividing the timedelta by 365 before taking `.days` would
# truncate every age to a whole number of years.
train_df['APPLICANT_AGE'] = (dt - train_df['DATE_OF_BIRTH']).dt.days / 365.0
train_df['DISBURSAL_AGE'] = (dt - train_df['DISBURSAL_DATE']).dt.days / 365.0

# Raw date columns, the score description label, the mobile flag and the row
# identifier are removed from the frame.
train_df = train_df.drop(
    ['DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE_DESCRIPTION', 'MOBILENO_AVL_FLAG', 'UNIQUEID'],
    axis=1,
)

train_df.info()

### Outlier Handling

In [None]:

# Apply the IQR filter to each monetary column in turn, then report the
# remaining (rows, columns).
for outlier_column in ('DISBURSED_AMOUNT', 'ASSET_COST'):
    train_df = removeOutlier(train_df, outlier_column)
train_df.shape

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Restrict to numeric/boolean columns: text and datetime columns cannot be
# correlated, and recent pandas raises instead of silently skipping them.
numeric_corr = train_df.select_dtypes(include=['number', 'bool']).corr()
# vmin=0 means every negative correlation is drawn with the lowest colour.
sns.heatmap(numeric_corr, ax=ax, vmin=0, vmax=1, cmap="YlGnBu")

In [None]:
# Correlate numeric/boolean columns only; text and datetime columns cannot be
# correlated and make recent pandas raise.
corr_df = train_df.select_dtypes(include=['number', 'bool']).corr()

# For every column, list the other columns whose absolute correlation exceeds 0.75.
for col in corr_df.columns:
    # Drop the column's correlation with itself BEFORE pairing names and
    # values, so each reported value belongs to the field printed next to it.
    others = corr_df[col].drop(labels=col)
    strong = others[others.abs() > .75]
    if not strong.empty:
        pairs = [(field, float(value)) for field, value in strong.items()]
        print(col, ':', ', '.join(map(str, pairs)), sep='\t')
    

> From the Correlation Graph and the code filter above, we see that the below fields are highly correlated
>
> -	PRI_NO_OF_ACCTS	:	PRI_ACTIVE_ACCTS
> -	PRI_ACTIVE_ACCTS	:	PRI_NO_OF_ACCTS
> -	PRI_CURRENT_BALANCE	:	PRI_SANCTIONED_AMOUNT, PRI_DISBURSED_AMOUNT
> -	PRI_SANCTIONED_AMOUNT	:	PRI_CURRENT_BALANCE, PRI_DISBURSED_AMOUNT
> -	PRI_DISBURSED_AMOUNT	:	PRI_CURRENT_BALANCE, PRI_SANCTIONED_AMOUNT
> -	SEC_NO_OF_ACCTS	:	SEC_ACTIVE_ACCTS
> -	SEC_ACTIVE_ACCTS	:	SEC_NO_OF_ACCTS
> -	SEC_CURRENT_BALANCE	:	SEC_SANCTIONED_AMOUNT, SEC_DISBURSED_AMOUNT
> -	SEC_SANCTIONED_AMOUNT	:	SEC_CURRENT_BALANCE, SEC_DISBURSED_AMOUNT
> -	SEC_DISBURSED_AMOUNT	:	SEC_CURRENT_BALANCE, SEC_SANCTIONED_AMOUNT
> -	AVERAGE_ACCT_AGE	:	CREDIT_HISTORY_LENGTH
> -	CREDIT_HISTORY_LENGTH	:	AVERAGE_ACCT_AGE
>
> From the above, we can remove the below columns from the feature set
> - PRI_NO_OF_ACCTS
> - PRI_SANCTIONED_AMOUNT
> - SEC_SANCTIONED_AMOUNT
> - AVERAGE_ACCT_AGE


In [None]:
# Remove one member of each highly correlated column group.
correlated_columns = ['PRI_NO_OF_ACCTS', 'PRI_SANCTIONED_AMOUNT', 'SEC_SANCTIONED_AMOUNT', 'AVERAGE_ACCT_AGE']
train_df = train_df.drop(columns=correlated_columns)

In [None]:
# Number of rows per (state, target class).
state_counts_df = train_df.groupby(['STATE_ID', 'LOAN_DEFAULT']).size().to_frame('counts').reset_index()

# Split by class. Grouping on the bare column name keeps each key a scalar; with
# a one-element list recent pandas yields 1-tuples, bool() of which is always
# True, so both count columns would end up named 'COUNT_True'.
states_counts = [
    (ld, group.rename(columns={'counts': 'COUNT_%s' % bool(ld)}))
    for ld, group in state_counts_df.groupby('LOAN_DEFAULT')
]

# Inner merge: only states present in both classes are kept.
states_counts_df = states_counts[0][1].merge(states_counts[1][1], on='STATE_ID')
states_counts_df = states_counts_df.drop(['LOAN_DEFAULT_x', 'LOAN_DEFAULT_y'], axis=1)

# Turn the per-class counts into shares of the state's total.
states_counts_df['total']= states_counts_df['COUNT_False']+ states_counts_df['COUNT_True']
states_counts_df['COUNT_False']= states_counts_df['COUNT_False']/ states_counts_df['total']
states_counts_df['COUNT_True']= states_counts_df['COUNT_True']/ states_counts_df['total']
states_counts_df

### Predicting and filling back in the EMPLOYMENT_TYPE Columns

In [None]:
# Number of rows with no employment type recorded.
'Employment Type Blanks: %d' % train_df['EMPLOYMENT_TYPE'].isna().sum()

---
### Using a RandomForestClassifier to fill in Employment Type Field for missing values

In [None]:
# Row count per employment type (value_counts skips missing values by default).
train_df['EMPLOYMENT_TYPE'].value_counts()

In [None]:
# Every column except the targets and the raw date helpers is used as a feature.
feature_columns = [column for column in train_df.columns if column not in ['LOAN_DEFAULT', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_code', 'DD', 'DOB']]
target_variable = 'EMPLOYMENT_TYPE'
code_column = '%s_code' % target_variable

# Rows with a known employment type form the training data for the imputer.
emp_type_df = pd.DataFrame(train_df[~train_df[target_variable].isna()])

df_majority = emp_type_df[emp_type_df['EMPLOYMENT_TYPE']=='Self employed']
df_minority = emp_type_df[emp_type_df['EMPLOYMENT_TYPE']=='Salaried']

df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123)              # reproducible results

emp_type_df = pd.concat([df_majority, df_minority_upsampled])

# Integer codes for the labels; employment_type_map maps a code back to its label.
emp_type_df[code_column], employment_type_map = emp_type_df[target_variable].factorize()

X_df = emp_type_df[feature_columns]
Y_df = emp_type_df[code_column]

# Seeded so the reported scores and the imputed values are reproducible.
# NOTE(review): the upsampling happens before this split, so copies of the same
# row can land on both sides and the printed scores are optimistic.
x_train, x_test, y_train, y_test = train_test_split(X_df, Y_df, random_state=123)
classifier = RandomForestClassifier(random_state=123)
classifier.fit(x_train, y_train)
y_predict = classifier.predict(x_test)
print(confusion_matrix(y_test, y_predict), accuracy_score(y_test, y_predict), f1_score(y_test, y_predict))


missing_mask = train_df[target_variable].isna()
to_pred_df = pd.DataFrame(train_df[missing_mask])
to_pred_df[code_column] = -1
# predict() rejects an empty frame, so only run it when something is missing.
if not to_pred_df.empty:
    to_pred_df[code_column] = classifier.predict(to_pred_df[feature_columns])
    # Write through .loc on the original frame: assigning to
    # train_df[mask][column] only modifies a temporary copy and is lost.
    train_df.loc[missing_mask, target_variable] = to_pred_df[code_column].apply(lambda x: employment_type_map[x])


### Final Transformation Function:


In [None]:
def predict_employement_type(df):
    """Fill missing 'EMPLOYMENT_TYPE' values with a random-forest prediction.

    A classifier is trained on the rows whose employment type is known (the
    'Salaried' class upsampled to the size of 'Self employed'), its hold-out
    confusion matrix, accuracy and F1 are printed, and the missing entries are
    replaced by the predicted labels.

    Parameters
    ----------
    df : DataFrame containing 'EMPLOYMENT_TYPE' plus the feature columns.
        # assumes all remaining columns are numeric and NaN-free — TODO confirm

    Returns
    -------
    A copy of ``df`` with the missing 'EMPLOYMENT_TYPE' values filled in; the
    caller's frame is left untouched.
    """
    # Work on a private copy so the .loc write below cannot leak into the input.
    df = pd.DataFrame(df).copy()
    target_variable = 'EMPLOYMENT_TYPE'
    code_column = '%s_code' % target_variable

    # Exclude the loan target, the raw date helpers and every column derived
    # from the employment type itself (code or dummy columns): the latter would
    # leak the answer into the features.
    feature_columns = [
        column for column in df.columns
        if column not in ['LOAN_DEFAULT', 'DD', 'DOB']
        and not str(column).startswith(target_variable)
    ]

    missing_mask = df[target_variable].isna()
    emp_type_df = df[~missing_mask]
    df_majority = emp_type_df[emp_type_df['EMPLOYMENT_TYPE']=='Self employed']
    df_minority = emp_type_df[emp_type_df['EMPLOYMENT_TYPE']=='Salaried']

    df_minority_upsampled = resample(df_minority,
                                     replace=True,                  # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=123)              # reproducible results

    emp_type_df = pd.concat([df_majority, df_minority_upsampled])
    # Integer codes for the labels; employment_type_map maps a code back to its label.
    emp_type_df[code_column], employment_type_map = emp_type_df[target_variable].factorize()

    X_df = emp_type_df[feature_columns]
    Y_df = emp_type_df[code_column]

    # Seeded so repeated runs print the same scores and impute the same values.
    x_train, x_test, y_train, y_test = train_test_split(X_df, Y_df, random_state=123)
    classifier = RandomForestClassifier(random_state=123)
    classifier.fit(x_train, y_train)
    y_predict = classifier.predict(x_test)
    print(confusion_matrix(y_test, y_predict), accuracy_score(y_test, y_predict), f1_score(y_test, y_predict))

    # predict() rejects an empty frame, so skip it when nothing is missing.
    if missing_mask.any():
        predicted_codes = classifier.predict(df.loc[missing_mask, feature_columns])
        # Assign through .loc: df[mask][column] = ... writes to a temporary copy.
        df.loc[missing_mask, target_variable] = np.asarray(employment_type_map)[predicted_codes]
    return df

def prepare(df):
    """Turn the raw loan frame into the model-ready feature frame.

    Steps: convert the account-age text columns to years, remove IQR outliers
    on the two amount columns, cast the flag columns to bool, parse the dates
    and derive the age columns, drop unused/correlated columns, impute the
    missing employment types and finally one-hot encode the employment type.

    Parameters
    ----------
    df : raw DataFrame as read from the training CSV.

    Returns
    -------
    A new DataFrame; outlier rows are removed, so it can be shorter than ``df``.
    """
    df = pd.DataFrame(df)
    df = df.apply(calculate_age, axis=1)
    df = removeOutlier(df, 'DISBURSED_AMOUNT')
    df = removeOutlier(df, 'ASSET_COST')

    # Set data types
    df['AVERAGE_ACCT_AGE'] = df['AVERAGE_ACCT_AGE'].astype(float)
    df['CREDIT_HISTORY_LENGTH'] = df['CREDIT_HISTORY_LENGTH'].astype(float)
    for flag_column in ['AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG', 'DRIVING_FLAG', 'PASSPORT_FLAG']:
        df[flag_column] = df[flag_column].astype(bool)

    # Parse Dates
    # The exploration cell reads these columns with format '%d-%m-%Y', i.e. the
    # dates are day-first; dayfirst=True keeps ambiguous values from being read
    # month-first.
    df['DATE_OF_BIRTH'] = pd.to_datetime(df['DATE_OF_BIRTH'], dayfirst=True)
    df['DISBURSAL_DATE'] = pd.to_datetime(df['DISBURSAL_DATE'], dayfirst=True)

    # Fractional years elapsed up to `dt`. Dividing the timedelta by 365 before
    # taking `.days` would truncate every age to a whole number of years.
    df['APPLICANT_AGE'] = (dt - df['DATE_OF_BIRTH']).dt.days / 365.0
    df['DISBURSAL_AGE'] = (dt - df['DISBURSAL_DATE']).dt.days / 365.0

    # Dropping Columns
    columns_to_drop = ['DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE_DESCRIPTION','MOBILENO_AVL_FLAG','UNIQUEID', 'PRI_NO_OF_ACCTS', 'PRI_SANCTIONED_AMOUNT', 'SEC_SANCTIONED_AMOUNT', 'AVERAGE_ACCT_AGE']
    df = df.drop(columns_to_drop, axis=1)

    #Fill back Employment Type
    # Impute BEFORE encoding: building the dummy column first would hand the
    # imputer a feature derived from its own target and would encode every
    # missing value as 0 regardless of the prediction.
    df = predict_employement_type(df)
    emp_type_df = pd.get_dummies(df['EMPLOYMENT_TYPE'], prefix='EMPLOYMENT_TYPE', drop_first=True)
    df[emp_type_df.columns.tolist()] = emp_type_df
    df = df.drop('EMPLOYMENT_TYPE', axis=1)

    return df

# Start again from the raw CSV and run the consolidated pipeline; this replaces
# the train_df that the exploration and cleaning cells above modified step by step.
train_df = pd.read_csv(training_csv)
train_df = prepare(train_df)
train_df.head()

# Training Models: LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

In [None]:
feature_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'BRANCH_ID', 'SUPPLIER_ID',
       'MANUFACTURER_ID', 'CURRENT_PINCODE_ID', 'STATE_ID',
       'EMPLOYEE_CODE_ID', 'AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG',
       'DRIVING_FLAG', 'PASSPORT_FLAG', 'PERFORM_CNS_SCORE',
       'PRI_ACTIVE_ACCTS', 'PRI_OVERDUE_ACCTS', 'PRI_CURRENT_BALANCE',
       'PRI_DISBURSED_AMOUNT', 'SEC_NO_OF_ACCTS', 'SEC_ACTIVE_ACCTS',
       'SEC_OVERDUE_ACCTS', 'SEC_CURRENT_BALANCE', 'SEC_DISBURSED_AMOUNT',
       'PRIMARY_INSTAL_AMT', 'SEC_INSTAL_AMT', 'NEW_ACCTS_IN_LAST_SIX_MONTHS',
       'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS', 'CREDIT_HISTORY_LENGTH',
       'NO_OF_INQUIRIES',  'APPLICANT_AGE', 'DISBURSAL_AGE',
       'EMPLOYMENT_TYPE_Self employed']
target_column = 'LOAN_DEFAULT'

X_df = train_df[feature_columns]
Y_df = train_df[target_column]

# Split FIRST, then balance only the training partition. Upsampling before the
# split places copies of the same minority row in both train and test, which
# leaks test rows into training and inflates every reported metric.
# The split is stratified and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_df, Y_df, stratify=Y_df, random_state=123)

print('Before Resampling:')
print(y_train.value_counts())

train_part = pd.concat([x_train, y_train], axis=1)
df_majority = train_part[train_part[target_column]==0]
df_minority = train_part[train_part[target_column]==1]


df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123)              # reproducible results
train_upsampled = pd.concat([df_majority, df_minority_upsampled])
print('After Resampling:')
print(train_upsampled[target_column].value_counts())

# Balanced training data; x_test / y_test keep the original class distribution.
x_train = train_upsampled[feature_columns]
y_train = train_upsampled[target_column]

### Using LogisticRegression

In [None]:
# Fit a default logistic regression (fit() returns the estimator itself),
# predict the held-out rows and report the metrics plus the ROC curve.
classifier = LogisticRegression().fit(x_train, y_train)
y_predict = classifier.predict(x_test)
modelEvaluation(classifier, x_test, y_train, y_test, y_predict, cols=x_train.columns)


### Using DecisionTreeClassifier

In [None]:
# Fit a default decision tree (fit() returns the estimator itself), predict the
# held-out rows and report the metrics plus the ROC curve.
classifier = DecisionTreeClassifier().fit(x_train, y_train)
y_predict = classifier.predict(x_test)
modelEvaluation(classifier, x_test, y_train, y_test, y_predict, cols=x_train.columns)


### Using RandomForestClassifier

In [None]:
# Fit a default random forest (fit() returns the estimator itself), predict the
# held-out rows and report the metrics plus the ROC curve.
classifier = RandomForestClassifier().fit(x_train, y_train)
y_predict = classifier.predict(x_test)
modelEvaluation(classifier, x_test, y_train, y_test, y_predict, cols=x_train.columns)
