In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler


In [2]:
# Read the raw training and test CSV files.
data_test = pd.read_csv('test_lAUu6dG.csv')
data_train = pd.read_csv('train_ctrUa4K.csv')
# The target is taken positionally as the last training column
# (assumed to be 'Loan_Status', which is then removed by name).
y = data_train.iloc[:, -1]
data_train = data_train.drop(columns=['Loan_Status'])

In [3]:
# Preview the last rows of the training features (the target has already been split off into y).
data_train.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban


In [4]:
# Shapes of the training features, the test features, and the target.
data_train.shape,data_test.shape,y.shape

((614, 12), (367, 12), (614,))

In [5]:
# Stack train on top of test (row-wise) so both get identical preprocessing.
# Row labels are NOT reset, so labels from the two frames repeat; later cells
# therefore rely on positional (.iloc) access only.
frames_to_stack = [data_train, data_test]
final_data = pd.concat(frames_to_stack)

In [6]:
# Row/column count of the combined train+test frame.
final_data.shape

(981, 12)

In [7]:
# Column labels of the combined frame.
final_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [8]:
# Percentage of missing entries per column of the combined frame.
final_data.isnull().sum().div(len(final_data)).mul(100)

Loan_ID              0.000000
Gender               2.446483
Married              0.305810
Dependents           2.548420
Education            0.000000
Self_Employed        5.606524
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           2.752294
Loan_Amount_Term     2.038736
Credit_History       8.053007
Property_Area        0.000000
dtype: float64

In [9]:
# Fill missing values on the combined frame.
# Category-like columns receive their most frequent value ...
for col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    most_frequent = final_data[col].mode()[0]
    final_data[col] = final_data[col].fillna(most_frequent)
# ... and numeric columns receive their column mean.
# Note: Credit_History is mean-filled here too, which yields a fractional value.
for col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    column_mean = final_data[col].mean()
    final_data[col] = final_data[col].fillna(column_mean)

In [10]:
# Number of distinct values in each column.
final_data.nunique()

Loan_ID              981
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      752
CoapplicantIncome    437
LoanAmount           233
Loan_Amount_Term      13
Credit_History         3
Property_Area          3
dtype: int64

In [11]:
# Integer codes for the string-valued categorical columns (per-column mapping).
category_codes = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 1, 'Semiurban': 2, 'Urban': 3},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
}
final_data = final_data.replace(category_codes)

In [12]:
# Preview the first rows of the combined frame.
final_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,1,0,0,1,0,5849,0.0,142.51153,360.0,1.0,3
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,1
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,3
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,3
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,3


In [13]:
# Binary flag: 0 when CoapplicantIncome is exactly 0.0, otherwise 1.
# Converted to a plain array so assignment is positional (row labels repeat in this frame).
has_coapplicant = final_data['CoapplicantIncome'].ne(0.0).astype('int64')
final_data['Coapplicant'] = has_coapplicant.to_numpy()

In [14]:
# Preview the first rows of the combined frame, including the Coapplicant column.
final_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Coapplicant
0,LP001002,1,0,0,1,0,5849,0.0,142.51153,360.0,1.0,3,0
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,1,1
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,3,0
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,3,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,3,0


In [15]:
# Round Credit_History to whole numbers (the mean-imputed entries are fractional).
credit_history = final_data['Credit_History']
final_data['Credit_History'] = credit_history.round()


In [16]:
# Encode the open-ended '3+' bucket of Dependents as 4 and make the column numeric.
# The original frame-wide replace left Dependents as an object column holding the
# strings '0'/'1'/'2' next to the integer 4; restricting the replacement to this
# column and converting with pd.to_numeric gives a single integer dtype.
dependents_encoded = final_data['Dependents'].replace({'3+': 4})
final_data = final_data.assign(Dependents=pd.to_numeric(dependents_encoded))


In [17]:
# Wrap the target in a one-column frame and encode it: 'N' -> 0, 'Y' -> 1.
loan_status_codes = {'N': 0, 'Y': 1}
y = pd.DataFrame(y).replace({'Loan_Status': loan_status_codes})

In [18]:
# Print the frequency table of each encoded categorical column.
# 'Credit_History' appears twice, exactly as in the original cell, so the
# printed output is unchanged.
columns_to_inspect = (
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
    'Credit_History',
)
for column_name in columns_to_inspect:
    print(final_data[column_name].value_counts())

0    570
2    160
1    160
4     91
Name: Dependents, dtype: int64
1    799
0    182
Name: Gender, dtype: int64
1    634
0    347
Name: Married, dtype: int64
1    763
0    218
Name: Education, dtype: int64
0    862
1    119
Name: Self_Employed, dtype: int64
1.0    833
0.0    148
Name: Credit_History, dtype: int64
2    349
3    342
1    290
Name: Property_Area, dtype: int64
1.0    833
0.0    148
Name: Credit_History, dtype: int64


In [19]:
# Remove the Loan_ID column from the feature frame before modelling.
final_data = final_data.drop(columns=['Loan_ID'])

In [20]:
# Split the combined frame back into its train and test parts by position.
# The number of training rows is taken from the target instead of the
# hard-coded 614, so the split stays correct if the input files change.
n_train_rows = len(y)
# .copy() makes both parts independent frames rather than slices of final_data.
data_train = final_data.iloc[:n_train_rows, :].copy()
data_test = final_data.iloc[n_train_rows:, :].copy()

In [21]:
# Shapes of the re-split training and test feature frames.
data_train.shape,data_test.shape

((614, 12), (367, 12))

In [22]:
# Hold out 20% of the labelled rows for validation; the seed is fixed at 10.
x_train, x_test, y_train, y_test = train_test_split(
    data_train,
    y,
    test_size=0.2,
    random_state=10,
)

In [23]:
# Min-max scale the features.
# The scaler is fitted on the training split ONLY. Fitting on all of data_train
# (as before) also used the validation rows and leaked their min/max into
# preprocessing. The fitted scaler is then applied unchanged to the validation
# split and to the competition test set.
norm = MinMaxScaler().fit(x_train)
x_train = norm.transform(x_train)
x_test = norm.transform(x_test)
# NOTE: data_test is overwritten with the scaled array, so this cell is not
# idempotent; re-run the re-split cell before re-running this one.
data_test = norm.transform(data_test)


In [24]:
# Fit a decision tree on the training split and score it on the validation split.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run, which makes the reported accuracy reproducible.
dt = DecisionTreeClassifier(random_state=10)
dt = dt.fit(x_train,y_train)
y_prd = dt.predict(x_test)
# Accuracy on the held-out 20% validation rows (not on the training rows).
ac_dt = accuracy_score(y_test,y_prd)
print('accuracy in training test:',ac_dt)

accuracy in training test: 0.7235772357723578


In [26]:
  # Predict on the scaled competition test set and write the submission file.
  output=dt.predict(data_test).astype(int)
  # Loan_ID was dropped from the feature frame, so it is re-read from the raw test CSV.
  aux=pd.read_csv('test_lAUu6dG.csv')
  df_output=pd.DataFrame()
  df_output['Loan_ID']=aux['Loan_ID']
  # Decode 0/1 back to 'N'/'Y' and assign the result to the column.
  # The previous `df_output['Loan_Status'].replace(..., inplace=True)` mutated a
  # column selection in place, which warns in pandas 2.x and has no effect
  # under Copy-on-Write.
  status_labels={0:'N',1:'Y'}
  df_output['Loan_Status']=pd.Series(output,index=df_output.index).map(status_labels)
  df_output[['Loan_ID','Loan_Status']].to_csv('output.csv',index=False)