In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score,accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
trainData = pd.read_csv('/Users/pragati/OneDrive - Northeastern University/Nirma_Python_Project/ML_LoanPrediction-master/Data/train_u6lujuX_CVtuZ9i.csv')

In [3]:
trainData.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [4]:
trainData.shape

(614, 13)

In [5]:
trainData.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
trainData.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [7]:
#Handling with missing data

In [8]:
trainData.Gender.fillna(trainData.Gender.dropna().max(),inplace =True)
trainData.Gender.fillna(trainData.Married.dropna().max(),inplace =True)

In [9]:
trainData.Credit_History.fillna(trainData.Credit_History.max(),inplace=True)

In [10]:
trainData.LoanAmount.fillna(trainData.LoanAmount.mean(),inplace=True)

In [11]:
trainData.Loan_Amount_Term.fillna(trainData.Loan_Amount_Term.mean(),inplace=True)

In [12]:
trainData.Self_Employed.fillna(trainData.Self_Employed.dropna().max(),inplace=True)

In [13]:
trainData.Dependents.fillna(0,inplace=True)

# Convert string values to numerical values because to algorithm can understand only numerical value not string values

In [14]:
trainData.Gender.value_counts()
gender_cat = pd.get_dummies(trainData.Gender,prefix='gender').gender_Female

In [15]:
trainData.Married.value_counts()
married_category = pd.get_dummies(trainData.Married,prefix='marriage').marriage_Yes

In [16]:
trainData.Education.value_counts()
graduate_category = pd.get_dummies(trainData.Education,prefix='education').education_Graduate

In [17]:
trainData.Self_Employed.value_counts()
self_emp_category = pd.get_dummies(trainData.Self_Employed,prefix='employed').employed_Yes

In [18]:
loan_status = pd.get_dummies(trainData.Loan_Status,prefix='status').status_Y

In [19]:
property_category = pd.get_dummies(trainData.Property_Area,prefix='property')

In [20]:
trainData.shape

(614, 13)

In [21]:
trainNew = pd.concat([trainData,gender_cat,married_category,graduate_category,self_emp_category,loan_status,property_category],axis=1)

In [22]:
trainNew.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Property_Area,Loan_Status,gender_Female,marriage_Yes,education_Graduate,employed_Yes,status_Y,property_Rural,property_Semiurban,property_Urban
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,...,Urban,Y,0,0,1,0,1,0,0,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,...,Rural,N,0,1,1,0,0,1,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,...,Urban,Y,0,1,1,1,1,0,0,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,...,Urban,Y,0,1,0,0,1,0,0,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,...,Urban,Y,0,0,1,0,1,0,0,1


In [23]:
trainNew.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'gender_Female', 'marriage_Yes', 'education_Graduate', 'employed_Yes',
       'status_Y', 'property_Rural', 'property_Semiurban', 'property_Urban'],
      dtype='object')

In [24]:
feature_columns = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','gender_Female','marriage_Yes','education_Graduate','employed_Yes','property_Rural','property_Semiurban','property_Urban']

In [25]:
X = trainNew[feature_columns]

In [26]:
y =  trainNew['status_Y']
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: status_Y, Length: 614, dtype: uint8

In [27]:
from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.01,random_state=42)

In [29]:
X_train.shape

(607, 12)

In [30]:
X_test.shape

(7, 12)

In [31]:
randForest = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
randForest.fit(X_train,y_train)
y_pred_class  = randForest.predict(X_test)
randForestScore = accuracy_score(y_test,y_pred_class)
%time print("Random forest accuraccy score",randForestScore)

Random forest accuraccy score 1.0
CPU times: user 80 µs, sys: 48 µs, total: 128 µs
Wall time: 123 µs


# Import test data and do real test of our model

In [32]:
randForestNew = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
randForestNew.fit(X,y)

RandomForestClassifier(max_depth=7, max_features=1, min_samples_split=25,
                       n_estimators=25)

In [33]:
testData = pd.read_csv('/Users/pragati/OneDrive - Northeastern University/Nirma_Python_Project/ML_LoanPrediction-master/Data/test_Y3wMUE5_7gLdaTN.csv')

In [34]:
testData.shape

(367, 12)

In [35]:
testData.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [36]:
testData.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [37]:
testData.Gender.fillna(testData.Gender.dropna().max(),inplace =True)
testData.Married.fillna(testData.Married.dropna().max(),inplace=True)
testData.Credit_History.fillna(testData.Credit_History.dropna().max(),inplace=True)
testData.LoanAmount.fillna(testData.LoanAmount.mean(),inplace=True)
testData.Loan_Amount_Term.fillna(testData.Loan_Amount_Term.mean(),inplace=True)
testData.Self_Employed.fillna(testData.Self_Employed.dropna().max(),inplace=True)
testData.Dependents.fillna(0,inplace=True)

In [38]:
gender_cat = pd.get_dummies(testData.Gender,prefix='gender').gender_Female
married_category = pd.get_dummies(testData.Married,prefix='marriage').marriage_Yes
graduate_category = pd.get_dummies(testData.Education,prefix='education').education_Graduate
self_emp_category = pd.get_dummies(testData.Self_Employed,prefix='employed').employed_Yes
property_category = pd.get_dummies(testData.Property_Area,prefix='property')

In [39]:
testDataNew = pd.concat([testData,gender_cat,married_category,graduate_category,self_emp_category,property_category],axis=1)

In [40]:
X_testData = testDataNew[feature_columns]

In [41]:
X_testData.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Female,marriage_Yes,education_Graduate,employed_Yes,property_Rural,property_Semiurban,property_Urban
0,5720,0,110.0,360.0,1.0,0,1,1,0,0,0,1
1,3076,1500,126.0,360.0,1.0,0,1,1,0,0,0,1
2,5000,1800,208.0,360.0,1.0,0,1,1,0,0,0,1
3,2340,2546,100.0,360.0,1.0,0,1,1,0,0,0,1
4,3276,0,78.0,360.0,1.0,0,0,0,0,0,0,1


In [42]:
y_test_pread_class = randForestNew.predict(X_testData)

In [43]:
randForestFormat = ["Y" if i == 1 else "N" for i in y_test_pread_class ]

In [44]:
pd.DataFrame({'Loan_ID':testData.Loan_ID,'Loan_Status':randForestFormat}).to_csv('radom_forest_submission.csv',index=False)

# Solve using logistic regression

In [45]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_train,y_train)
logREg_predict =logReg.predict(X_test)
accuracy_score(y_test,logREg_predict)

1.0

In [46]:
logReg_y_prediction_class = logReg.predict(X_testData)

In [47]:
logRegPredictionFormat = ["Y" if i == 1 else "N" for i in logReg_y_prediction_class ]

# zip(logRegPredictionFormat,logReg_y_prediction_class)

In [48]:
pd.DataFrame({'Loan_ID':testData.Loan_ID,'Loan_Status':logRegPredictionFormat}).to_csv('logReg_submission.csv',index=False)

In [49]:
Prediction = pd.DataFrame({'Loan_ID':testData.Loan_ID,'LoanAmount':testData.LoanAmount,'Credit_History':testData.Credit_History,'Loan_Status':logRegPredictionFormat})

In [50]:
prediction_dummy_df = pd.DataFrame(logReg.predict(X_testData), columns = {'Prediction'}).set_index([pd.Index(X_testData.index)])
probability_dummy_df = pd.DataFrame(logReg.predict_proba(X_testData)).set_index([pd.Index(X_testData.index)])
all_dummy_df = X_testData.join(prediction_dummy_df).join(probability_dummy_df)
all_dummy_df

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Female,marriage_Yes,education_Graduate,employed_Yes,property_Rural,property_Semiurban,property_Urban,Prediction,0,1
0,5720,0,110.0,360.0,1.0,0,1,1,0,0,0,1,1,0.195186,0.804814
1,3076,1500,126.0,360.0,1.0,0,1,1,0,0,0,1,1,0.216342,0.783658
2,5000,1800,208.0,360.0,1.0,0,1,1,0,0,0,1,1,0.249533,0.750467
3,2340,2546,100.0,360.0,1.0,0,1,1,0,0,0,1,1,0.217819,0.782181
4,3276,0,78.0,360.0,1.0,0,0,0,0,0,0,1,1,0.334721,0.665279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,4009,1777,113.0,360.0,1.0,0,1,0,1,0,0,1,1,0.285172,0.714828
363,4158,709,115.0,360.0,1.0,0,1,1,0,0,0,1,1,0.204241,0.795759
364,3250,1993,126.0,360.0,1.0,0,0,1,0,0,1,0,1,0.179140,0.820860
365,5000,2393,158.0,360.0,1.0,0,1,1,0,1,0,0,1,0.238437,0.761563
