**Upload train and test data**

In [1]:
from google.colab import files #upload the data
train = files.upload()
test = files.upload()

Saving train_indessa.csv to train_indessa.csv


Saving test_indessa.csv to test_indessa.csv


**Import all the required packages**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import io
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score



**Data Preprocessing**

Cleaing the training data

In [3]:
train = pd.read_csv(io.BytesIO(train['train_indessa.csv']))

print(train.dtypes)
print("")
print(train.shape)
print("")
print(train['loan_status'].value_counts())
print("")
print(train.isnull().sum())

member_id                        int64
loan_amnt                        int64
funded_amnt                      int64
funded_amnt_inv                float64
term                            object
batch_enrolled                  object
int_rate                       float64
grade                           object
sub_grade                       object
emp_title                       object
emp_length                      object
home_ownership                  object
annual_inc                     float64
verification_status             object
pymnt_plan                      object
desc                            object
purpose                         object
title                           object
zip_code                        object
addr_state                      object
dti                            float64
delinq_2yrs                    float64
inq_last_6mths                 float64
mths_since_last_delinq         float64
mths_since_last_record         float64
open_acc                 

In [4]:
half_count = len(train) / 2
t1 = train.dropna(thresh=half_count,axis=1) # Drop any column with more than 50% missing values
t1.isnull().sum()

member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
term                              0
batch_enrolled                85149
int_rate                          0
grade                             0
sub_grade                         0
emp_title                     30833
emp_length                    26891
home_ownership                    0
annual_inc                        3
verification_status               0
pymnt_plan                        0
purpose                           0
title                            90
zip_code                          0
addr_state                        0
dti                               0
delinq_2yrs                      16
inq_last_6mths                   16
open_acc                         16
pub_rec                          16
revol_bal                         0
revol_util                      287
total_acc                        16
initial_list_status         

In [5]:
drop_list = ['member_id','funded_amnt','funded_amnt_inv','batch_enrolled','int_rate','total_rec_int','total_rec_late_fee','last_week_pay','collections_12_mths_ex_med','emp_length','tot_coll_amt','tot_cur_bal','total_rev_hi_lim','sub_grade','pymnt_plan','emp_title','zip_code','application_type','recoveries','collection_recovery_fee']
t1 = t1.drop(drop_list,axis=1) # Drop columns that give futuristic information or are irrelevant to the model or give redundant information
print(t1.isnull().sum())
print("")
print(t1.shape)
print("")
print(t1['loan_status'].value_counts())
print("")

loan_amnt                0
term                     0
grade                    0
home_ownership           0
annual_inc               3
verification_status      0
purpose                  0
title                   90
addr_state               0
dti                      0
delinq_2yrs             16
inq_last_6mths          16
open_acc                16
pub_rec                 16
revol_bal                0
revol_util             287
total_acc               16
initial_list_status      0
acc_now_delinq          16
loan_status              0
dtype: int64

(532428, 20)

0    406601
1    125827
Name: loan_status, dtype: int64



In [6]:
t1 = t1.dropna() # Drop rows that have missing values

print(t1.isnull().sum())
print("")
print(t1.shape)
print("")
print(t1['loan_status'].value_counts())
print("")
print(t1.dtypes)
print("")

loan_amnt              0
term                   0
grade                  0
home_ownership         0
annual_inc             0
verification_status    0
purpose                0
title                  0
addr_state             0
dti                    0
delinq_2yrs            0
inq_last_6mths         0
open_acc               0
pub_rec                0
revol_bal              0
revol_util             0
total_acc              0
initial_list_status    0
acc_now_delinq         0
loan_status            0
dtype: int64

(532052, 20)

0    406328
1    125724
Name: loan_status, dtype: int64

loan_amnt                int64
term                    object
grade                   object
home_ownership          object
annual_inc             float64
verification_status     object
purpose                 object
title                   object
addr_state              object
dti                    float64
delinq_2yrs            float64
inq_last_6mths         float64
open_acc               float64
pub_rec     

Save the cleaned data to csv (if needed)

In [7]:
t1.to_csv("filtered.csv",index=False)
files.download("filtered.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Preparing train data for model training

In [7]:
filtered_loans = t1.copy()

drop_cols = ['addr_state','title']
filtered_loans = filtered_loans.drop(drop_cols,axis=1)

mapping_dict = {
"grade":{
"A": 1,
"B": 2,
"C": 3,
"D": 4,
"E": 5,
"F": 6,
"G": 7
},
"term":{
    "36 months":1,
    "60 months":2
},
"verification_status":{
    "Source Verified":1,
    "Verified":1,
    "Not Verified":2
},
"initial_list_status":{
    "f":0,
    "w":1
}
}
filtered_loans = filtered_loans.replace(mapping_dict)
filtered_loans[['grade']].head()

nominal_columns = ["home_ownership", "purpose"]
dummy_df = pd.get_dummies(filtered_loans[nominal_columns])
filtered_loans = pd.concat([filtered_loans, dummy_df], axis=1)
filtered_loans = filtered_loans.drop(nominal_columns, axis=1)

print(filtered_loans.shape)
print("")
print(filtered_loans['loan_status'].value_counts())
print("")
print(filtered_loans.dtypes)

(532052, 36)

0    406328
1    125724
Name: loan_status, dtype: int64

loan_amnt                       int64
term                            int64
grade                           int64
annual_inc                    float64
verification_status             int64
dti                           float64
delinq_2yrs                   float64
inq_last_6mths                float64
open_acc                      float64
pub_rec                       float64
revol_bal                     float64
revol_util                    float64
total_acc                     float64
initial_list_status             int64
acc_now_delinq                float64
loan_status                     int64
home_ownership_ANY              uint8
home_ownership_MORTGAGE         uint8
home_ownership_NONE             uint8
home_ownership_OTHER            uint8
home_ownership_OWN              uint8
home_ownership_RENT             uint8
purpose_car                     uint8
purpose_credit_card             uint8
purpose_debt_cons

Save the final data for training

In [8]:
filtered_loans.to_csv("train.csv",index=False)
files.download("train.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cleaning the test data

In [9]:
test = pd.read_csv(io.BytesIO(test['test_indessa.csv']))

print(test.dtypes)
print("")
print(test.shape)
print("")
print(test.isnull().sum())

member_id                        int64
loan_amnt                        int64
funded_amnt                      int64
funded_amnt_inv                float64
term                            object
batch_enrolled                  object
int_rate                       float64
grade                           object
sub_grade                       object
emp_title                       object
emp_length                      object
home_ownership                  object
annual_inc                     float64
verification_status             object
pymnt_plan                      object
desc                            object
purpose                         object
title                           object
zip_code                        object
addr_state                      object
dti                            float64
delinq_2yrs                    float64
inq_last_6mths                 float64
mths_since_last_delinq         float64
mths_since_last_record         float64
open_acc                 

In [10]:
drop_list = ['funded_amnt','funded_amnt_inv','batch_enrolled','int_rate','sub_grade','emp_title','pymnt_plan','desc','title','zip_code','addr_state','mths_since_last_delinq','mths_since_last_record','total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee','collections_12_mths_ex_med','mths_since_last_major_derog','application_type','verification_status_joint','last_week_pay','emp_length','tot_coll_amt','tot_cur_bal','total_rev_hi_lim']
test = test.drop(drop_list,axis=1)

test = test.dropna()

mapping_dict = {
"grade":{
"A": 1,
"B": 2,
"C": 3,
"D": 4,
"E": 5,
"F": 6,
"G": 7
},
"term":{
    "36 months":1,
    "60 months":2
},
"verification_status":{
    "Source Verified":1,
    "Verified":1,
    "Not Verified":2
},
"initial_list_status":{
    "f":0,
    "w":1
}
}
test = test.replace(mapping_dict)
test[['grade']].head()

nominal_columns = ["home_ownership", "purpose"]
dummy_df = pd.get_dummies(test[nominal_columns])
test = pd.concat([test, dummy_df], axis=1)
test = test.drop(nominal_columns, axis=1)

In [11]:
idx = 16
new_col = np.empty(354736)
new_col.fill(0)
test.insert(loc=idx, column= "home_ownership_ANY", value=new_col)

print(test.head)

<bound method NDFrame.head of         member_id  loan_amnt  ...  purpose_vacation  purpose_wedding
0        11937648      14000  ...                 0                0
1        38983318      16000  ...                 0                0
2        27999917      11050  ...                 0                0
3        61514932      35000  ...                 0                0
4        59622821       6500  ...                 0                0
...           ...        ...  ...               ...              ...
354946   19145105      15000  ...                 0                0
354947   46304777      35000  ...                 0                0
354948     903745      14000  ...                 0                0
354949   53032475      20000  ...                 0                0
354950     994245       2700  ...                 0                0

[354736 rows x 36 columns]>


Save the test data

In [12]:
test.to_csv("test.csv",index=False)
files.download("test.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Training the model**

In [13]:
filtered_loans.rename(columns={"loan_status": "appstat"}, inplace=True)

f1 = filtered_loans.copy()

y = filtered_loans.appstat

filtered_loans.drop(columns = 'appstat', inplace=True)

X = filtered_loans.copy()

Over Sampling the data 

In [14]:
smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

#print('Original dataset shape', Counter(y))
#print('Resample dataset shape', Counter(y_ros))

print(X.shape)
print(x_smote.shape)
print(y.shape)
print(y_smote.shape)



(532052, 35)
(812656, 35)
(532052,)
(812656,)


Model

In [15]:
x_train, x_cv, y_train, y_cv = train_test_split(x_smote,y_smote, test_size=0.2, random_state=4)

model = LogisticRegression()
model.fit(x_train, y_train)
#LogisticRegression()


print(roc_auc_score(y, model.predict_proba(X)[:, 1]))

print(roc_auc_score(y, model.decision_function(X)))

pred_cv = model.predict(x_cv)
print(accuracy_score(y_cv,pred_cv))

0.5973238861395661
0.5973238861395661
0.5703369182684025


Test data prediction

In [16]:
test.rename(columns={"member_id": "mid"}, inplace=True)

t2 = test.copy()

mid = test.mid

test.drop(columns = 'mid', inplace=True)

pred = model.predict(test)

for i in pred:
  print(i)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1
0
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
0
1
1
0
0
0
1
1
1
0
1
1
0
0
0
1
0
0
0
0
1
0
0
1
0
1
0
1
1
0
0
1
1
1
1
1
1
1
1
1
1
0
0
0
0
1
1
0
0
0
1
1
1
1
0
1
1
1
0
0
1
0
0
1
1
1
0
0
1
1
1
1
0
0
1
1
1
0
0
0
0
0
0
0
0
1
1
1
0
1
0
0
1
0
1
1
0
1
1
1
1
0
1
1
0
0
1
1
1
1
0
0
1
0
0
0
0
1
0
1
0
0
0
1
0
0
0
1
1
0
1
1
0
1
0
0
0
1
0
1
1
0
1
1
0
1
1
1
0
0
0
1
0
1
0
0
0
1
1
0
0
0
0
1
1
1
0
1
1
1
0
0
1
1
1
1
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
1
0
1
0
0
1
1
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
1
0
1
1
1
0
0
0
1
0
1
0
1
0
0
0
1
1
0
0
0
0
0
0
0
1
1
1
0
0
0
0
1
0
1
0
1
1
1
1
1
1
1
1
1
0
0
0
1
0
0
1
0
1
0
1
0
0
1
1
1
0
0
0
0
0
0
0
1
0
1
0
1
1
0
0
0
1
1
0
0
1
1
0
0
0
1
0
0
1
1
0
0
1
0
0
1
1
0
1
1
1
0
1
0
0
0
0
0
0
0
1
1
1
1
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
1
1
0
1
1
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
1
0
1
1
1
0
1
0
0
1
0
0
1
0
0
0
0
1
0
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
0
0
1
0
0
1
0
0
1
0
0
0
0
1
1
1
1
0
0
0
1
0
1
1
0
1
0
0
0
0
1
1
0
1
1
1
0
1
1
1
1
0
1
0

**Creating the output file**

In [17]:
n1 = 'member_id'
n2 = 'loan_status'

data = {n1 : mid, n2 : pred}

df = pd.DataFrame(data)

df

Unnamed: 0,member_id,loan_status
0,11937648,1
1,38983318,1
2,27999917,0
3,61514932,0
4,59622821,1
...,...,...
354946,19145105,0
354947,46304777,0
354948,903745,1
354949,53032475,0


In [18]:
df.to_csv("output.csv",index=False)
files.download("output.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
print(df['loan_status'].value_counts())

0    211767
1    142969
Name: loan_status, dtype: int64
