In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,accuracy_score,classification_report

In [6]:
df = pd.read_excel('loan-data.xlsx')

In [8]:
df = df.drop(columns=['Unnamed: 0','id','member_id','zip_code','addr_state','last_credit_pull_d','next_pymnt_d','last_pymnt_d','earliest_cr_line','issue_d','funded_amnt_inv','dti','delinq_2yrs','inq_last_6mths','mths_since_last_delinq','open_acc','pub_rec','revol_bal','revol_util','total_acc','total_pymnt_inv'])

In [9]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,total_pymnt,total_rec_prncp,total_rec_int,last_pymnt_amnt,repay_fail
0,0.0,0.0,36 months,0.0,0.0,< 1 year,RENT,0.0,Not Verified,Charged Off,major_purchase,0.0,0.0,0.0,0.0,1
1,2500.0,2500.0,36 months,13.98,85.42,4 years,RENT,20004.0,Not Verified,Does not meet the credit policy. Status:Fully ...,other,3075.291779,2500.0,575.29,90.85,0
2,5000.0,5000.0,36 months,15.95,175.67,4 years,RENT,59000.0,Not Verified,Charged Off,debt_consolidation,2948.76,1909.02,873.81,175.67,1
3,7000.0,7000.0,36 months,9.91,225.58,10+ years,MORTGAGE,53796.0,Not Verified,Fully Paid,other,8082.39188,7000.0,1082.39,1550.27,0
4,2000.0,2000.0,36 months,5.42,60.32,10+ years,RENT,30000.0,Not Verified,Fully Paid,debt_consolidation,2161.663244,2000.0,161.66,53.12,0


In [10]:
df.isna().sum()

loan_amnt                1
funded_amnt              1
term                     0
int_rate                 0
installment              1
emp_length             993
home_ownership           0
annual_inc               2
verification_status      0
loan_status              0
purpose                  0
total_pymnt              1
total_rec_prncp          1
total_rec_int            1
last_pymnt_amnt          1
repay_fail               0
dtype: int64

In [11]:
df.dropna(inplace= True)

In [13]:
df.isna().sum()

loan_amnt              0
funded_amnt            0
term                   0
int_rate               0
installment            0
emp_length             0
home_ownership         0
annual_inc             0
verification_status    0
loan_status            0
purpose                0
total_pymnt            0
total_rec_prncp        0
total_rec_int          0
last_pymnt_amnt        0
repay_fail             0
dtype: int64

In [21]:
df.term = df.term.str.split('').str[:-1].str.join('')
df.term = df.term.str.replace(",","").str.replace("months","").astype('int64')

In [22]:
df.term

0        36
1        36
2        36
3        36
4        36
         ..
38475    36
38476    36
38477    60
38478    36
38479    36
Name: term, Length: 37485, dtype: int64

In [23]:
df.head(3)

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,total_pymnt,total_rec_prncp,total_rec_int,last_pymnt_amnt,repay_fail
0,0.0,0.0,36,0.0,0.0,< 1 year,RENT,0.0,Not Verified,Charged Off,major_purchase,0.0,0.0,0.0,0.0,1
1,2500.0,2500.0,36,13.98,85.42,4 years,RENT,20004.0,Not Verified,Does not meet the credit policy. Status:Fully ...,other,3075.291779,2500.0,575.29,90.85,0
2,5000.0,5000.0,36,15.95,175.67,4 years,RENT,59000.0,Not Verified,Charged Off,debt_consolidation,2948.76,1909.02,873.81,175.67,1


In [25]:
df.emp_length.str.split('').str[:-1].str.join('')

0         < 1 year
1          4 years
2          4 years
3        10+ years
4        10+ years
           ...    
38475      3 years
38476      9 years
38477    10+ years
38478       1 year
38479      7 years
Name: emp_length, Length: 37485, dtype: object

In [27]:
df.emp_length = df.emp_length.str.replace('<','').str.replace('+','').str.replace('year','').str.replace('s','').astype('int')

In [28]:
df.emp_length

0         1
1         4
2         4
3        10
4        10
         ..
38475     3
38476     9
38477    10
38478     1
38479     7
Name: emp_length, Length: 37485, dtype: int32

In [30]:
verification_status_mapping = {
    'Not Verified': 0,
    'Verified': 1,
    'Source Verified': 1
}

df['verification_status'] = df['verification_status'].replace(verification_status_mapping)

In [32]:
df['home_ownership'] = df['home_ownership'].apply(lambda x: 'OTHER' if x=='NONE' else x)

In [34]:
X=df.drop(columns='repay_fail')
y = df['repay_fail']

In [35]:
X=pd.get_dummies(X)

In [36]:
X

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,emp_length,annual_inc,verification_status,total_pymnt,total_rec_prncp,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,0.0,0.0,36,0.00,0.00,1,0.0,0,0.000000,0.00,...,0,0,1,0,0,0,0,0,0,0
1,2500.0,2500.0,36,13.98,85.42,4,20004.0,0,3075.291779,2500.00,...,0,0,0,0,0,1,0,0,0,0
2,5000.0,5000.0,36,15.95,175.67,4,59000.0,0,2948.760000,1909.02,...,0,0,0,0,0,0,0,0,0,0
3,7000.0,7000.0,36,9.91,225.58,10,53796.0,0,8082.391880,7000.00,...,0,0,0,0,0,1,0,0,0,0
4,2000.0,2000.0,36,5.42,60.32,10,30000.0,0,2161.663244,2000.00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38475,3000.0,3000.0,36,11.99,99.63,3,74250.0,1,3586.619764,3000.00,...,0,0,0,0,0,0,0,0,0,0
38476,10400.0,10400.0,36,13.49,352.88,9,62000.0,1,12703.534030,10400.00,...,0,0,0,0,0,0,0,0,0,0
38477,16000.0,10550.0,60,14.96,250.77,10,95088.0,1,14202.267530,10550.00,...,0,0,0,0,0,0,0,0,0,0
38478,10000.0,10000.0,36,16.89,355.99,1,48720.0,0,12815.178320,10000.00,...,0,0,0,0,0,0,0,0,0,0


In [37]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=0)

In [38]:
log_reg = LogisticRegression()

In [39]:
log_reg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
log_reg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
pred=log_reg.predict(X_test)
score=r2_score(y_test,pred)
score

0.962465709537254

In [44]:
acc=accuracy_score(y_test,pred)
acc

0.9954654574553214