In [12]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [13]:
train_dataset = pd.read_csv('../input/train_LTFS.csv')
train_dataset.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
       'loan_default'],
      dtype='object')

In [14]:
preprocessed_dataset = train_dataset

# will do some experiment and check which feature is affecting the loan defaults.
#it seems like few branches have more loan default than others. we include this in final training
'''
preprocessed_dataset.groupby('branch_id')['loan_default'].value_counts()
preprocessed_dataset.groupby('Employment.Type')['loan_default'].value_counts()
preprocessed_dataset.groupby('PERFORM_CNS.SCORE.DESCRIPTION')['loan_default'].value_counts()
preprocessed_dataset.groupby('NO.OF_INQUIRIES')['loan_default'].value_counts()
preprocessed_dataset.groupby('credit_history_duration')['loan_default'].value_counts()
'''

#creating one new column 'age' from date of birth column and drop the previous column

age_calculation = []
#In pandas, one issue is there. year before 68 will be counted as 2068, 2067, 2068....To fix this issue, re is used here
for x in list(preprocessed_dataset['Date.of.Birth']):
    if int(re.findall('\d+-\d+-(\d+)', x)[0]) <= 68 and int(re.findall('\d+-\d+-(\d+)', x)[0]) >= 20:
        age_calculation.append(pd.to_datetime('today').year - pd.to_datetime(x).year + 100)
    else:
        age_calculation.append(pd.to_datetime('today').year - pd.to_datetime(x).year)
    
preprocessed_dataset['age'] = age_calculation
preprocessed_dataset = preprocessed_dataset.drop('Date.of.Birth', axis = 1)


#converting the credit history column data into year wise. This value '1yrs 11mo' will be converted to 2.
#similarily '0yrs 3mon' will be converted to 0. if month value is less than 6 then same year, else next year value  

credit_history_length = []
for duration in list(preprocessed_dataset['CREDIT.HISTORY.LENGTH']):
    month_value = float(re.findall('yrs\s(\d+)\w+', duration)[0])
    year_value = float(re.findall('(\d+)yrs', duration)[0])
    cal = year_value + (month_value/12) 
    credit_history_length.append(cal)
        
preprocessed_dataset['credit_history_length'] = credit_history_length
preprocessed_dataset = preprocessed_dataset.drop('CREDIT.HISTORY.LENGTH', axis = 1)



#simple way to check different unique value in particular column and it's total count
'''
a = np.array(preprocessed_dataset['age'])
a, b = np.unique(a, return_counts=True)
dict(zip(a, b))
'''


preprocessed_dataset.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,NO.OF_INQUIRIES,loan_default,age,credit_history_length
0,420825,50578,58400,89.55,67,22807,45,1441,Salaried,03-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0,0,35,0.0
1,537409,47145,65550,73.23,67,22807,45,1502,Self employed,26-09-18,6,1998,1,1,0,0,0,0,598,I-Medium Risk,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,1yrs 11mon,0,1,34,1.916667
2,417566,53278,61360,89.63,67,22807,45,1497,Self employed,01-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0,0,34,0.0
3,624493,57513,66113,88.48,67,22807,45,1501,Self employed,26-10-18,6,1998,1,1,0,0,0,0,305,L-Very High Risk,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0yrs 8mon,1,1,26,1.25
4,539055,52378,60300,88.39,67,22807,45,1495,Self employed,26-09-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,1,1,42,0.0


In [15]:
preprocessed_dataset = preprocessed_dataset.drop('UniqueID', axis = 1)
preprocessed_dataset = preprocessed_dataset.drop('Current_pincode_ID', axis = 1)
preprocessed_dataset = preprocessed_dataset.drop('MobileNo_Avl_Flag', axis = 1)
preprocessed_dataset = preprocessed_dataset.drop('DisbursalDate', axis = 1)
preprocessed_dataset.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Employment.Type,State_ID,Employee_code_ID,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,NO.OF_INQUIRIES,loan_default,age,credit_history_length
0,50578,58400,89.55,67,22807,45,Salaried,6,1998,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0,0,35,0.0
1,47145,65550,73.23,67,22807,45,Self employed,6,1998,1,0,0,0,0,598,I-Medium Risk,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,1yrs 11mon,0,1,34,1.916667
2,53278,61360,89.63,67,22807,45,Self employed,6,1998,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0,0,34,0.0
3,57513,66113,88.48,67,22807,45,Self employed,6,1998,1,0,0,0,0,305,L-Very High Risk,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0yrs 8mon,1,1,26,1.25
4,52378,60300,88.39,67,22807,45,Self employed,6,1998,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,1,1,42,0.0


In [16]:
# for handling the AVERAGE.ACCT.AGE feature
average_account_age = []
for duration in list(preprocessed_dataset['AVERAGE.ACCT.AGE']):
    month_value = float(re.findall('yrs\s(\d+)\w+', duration)[0])
    year_value = float(re.findall('(\d+)yrs', duration)[0])
    cal = year_value + (month_value/12) 
    average_account_age.append(cal)
        
preprocessed_dataset['average_account_age'] = average_account_age
preprocessed_dataset = preprocessed_dataset.drop('AVERAGE.ACCT.AGE', axis = 1)
preprocessed_dataset.groupby('average_account_age')['loan_default'].value_counts()

average_account_age  loan_default
0.000000             0               91716
                     1               27657
0.083333             0                1679
                     1                 528
0.166667             0                2295
                     1                 638
0.250000             0                2761
                     1                 809
0.333333             0                3382
                     1                 887
0.416667             0                3518
                     1                 836
0.500000             0                4842
                     1                1186
0.583333             0                4332
                     1                1034
0.666667             0                3939
                     1                 953
0.750000             0                4058
                     1                 960
0.833333             0                4161
                     1                 982
0.916667            

In [17]:
preprocessed_dataset.groupby('PERFORM_CNS.SCORE.DESCRIPTION')['loan_default'].value_counts()
label_encoding = LabelEncoder()
preprocessed_dataset['PERFORM_CNS.SCORE.DESCRIPTION'] = label_encoding.fit_transform(preprocessed_dataset['PERFORM_CNS.SCORE.DESCRIPTION'])
preprocessed_dataset.groupby('PERFORM_CNS.SCORE.DESCRIPTION')['loan_default'].value_counts()

PERFORM_CNS.SCORE.DESCRIPTION  loan_default
0                              0               11783
                               1                2341
1                              0                7993
                               1                1208
2                              0               13275
                               1                2770
3                              0                9659
                               1                1699
4                              0                4821
                               1                1000
5                              0                6905
                               1                1580
6                              0                3202
                               1                 786
7                              0                5197
                               1                1658
8                              0                4042
                               1                1515
9 

In [19]:
#dealing with missing data in Employment Type by directly deleting those rows
preprocessed_dataset['Employment.Type'].unique()
preprocessed_dataset = preprocessed_dataset.dropna(subset = ['Employment.Type'])


#preprocessed_dataset['Employment.Type'] = preprocessed_dataset['Employment.Type'].astype('category')
#preprocessed_dataset.columns[preprocessed_dataset.isnull().any()]


#handling the employee Type feature and converting it into one hot encoding
label_encoding = LabelEncoder()
preprocessed_dataset['Employment.Type'] = label_encoding.fit_transform(preprocessed_dataset['Employment.Type'])

one_hot_encoder = OneHotEncoder(categorical_features=[6])
one_hot_encoder_matrix = one_hot_encoder.fit_transform(preprocessed_dataset).toarray()
employment_type_dataframe = pd.DataFrame(one_hot_encoder_matrix, columns = ['Employment_Type_Salaried', 'Employment_Type_Self_Employed','disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'State_ID', 'Employee_code_ID',
       'Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag',
       'Passport_flag', 'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS',
       'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT',
       'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS',
       'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT',
       'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES',
       'loan_default', 'age', 'credit_history_length', 'average_account_age'])

print(preprocessed_dataset.shape)
print(employment_type_dataframe.shape)
print(preprocessed_dataset.head())
print(employment_type_dataframe.head())
preprocessed_dataset = employment_type_dataframe


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(225493, 37)
(225493, 38)
   disbursed_amount         ...           average_account_age
0             50578         ...                      0.000000
1             47145         ...                      1.916667
2             53278         ...                      0.000000
3             57513         ...                      0.666667
4             52378         ...                      0.000000

[5 rows x 37 columns]
   Employment_Type_Salaried         ...           average_account_age
0                       1.0         ...                      0.000000
1                       0.0         ...                      1.916667
2                       0.0         ...                      0.000000
3                       0.0         ...                      0.666667
4                       0.0         ...                      0.000000

[5 rows x 38 columns]


In [20]:

#preprocessed_dataset.iloc[:,:-1].head()

#reordering the columns indices
preprocessed_dataset = preprocessed_dataset[['Employment_Type_Salaried', 'Employment_Type_Self_Employed',
       'disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'State_ID', 'Employee_code_ID', 'Aadhar_flag',
       'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
       'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES',
        'age', 'credit_history_length', 'average_account_age', 'loan_default']]

#preprocessed_dataset.iloc[:,:-1].head()

#normalization of data
min_max_scaler = MinMaxScaler()
preprocessed_dataset = min_max_scaler.fit_transform(preprocessed_dataset)
preprocessed_dataset[:, :-1]


X_Train, X_Test, Y_Train, Y_Test = train_test_split(preprocessed_dataset[:,:-1], preprocessed_dataset[:, -1],
                                                   test_size = 0.33, random_state = 42)

clf = RandomForestClassifier(n_estimators=75, criterion='gini', max_depth=12, verbose = 1, max_features='sqrt')
clf.fit(X_Train, Y_Train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   29.7s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [21]:
prediction_value = clf.predict(X_Test)
prediction_value

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.8s finished


array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
accuracy_score_ = accuracy_score(Y_Test, prediction_value, normalize=False)
balanced_accuracy_score_ = balanced_accuracy_score(Y_Test, prediction_value)
print(balanced_accuracy_score_)
print(accuracy_score_)

0.5018144897226565
58013
