# Applying Logistic Regression (in place of K-Nearest Neighbors) for Loan Prediction

In [393]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [394]:
# Read the training and the test CSV files (paths are relative to the working directory).
loans, test_data = (pd.read_csv(csv_path)
                    for csv_path in ('train_u6lujuX.csv', 'test_Y3wMUE5.csv'))
# Preview the first rows of the training frame.
loans.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


### Features and Target
**Note:** The feature 'Dependents' contains the non-numeric value '3+', which cannot be used as a number directly. It is kept in the feature list, and '3+' is encoded as 3 in the encoding step further down.

In [395]:
# Predictor columns, listed in the order in which they are handed to the model.
features = [
    # applicant profile
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    # income and loan figures
    'ApplicantIncome', 'CoapplicantIncome',
    'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
    # property location
    'Property_Area',
]
# Name of the label column.
target = 'Loan_Status'

## Preparing the data

* **Is target column fully populated with proper data?**
* **Replace the target values with 1 and -1**

In [396]:
# Sanity check: every label has to be exactly 'Y' or 'N' before it is recoded.
target_is_clean = bool(loans[target].isin(['Y', 'N']).all())
# One pre-formatted string keeps the printed line identical on Python 2 and Python 3.
print("Is target column fully populated with proper data:  " + str(target_is_clean))

# Recode the label: 'Y' -> 1, anything else -> -1.
# NOTE: this cell is not idempotent. Running it a second time would turn every
# label into -1 because the values are no longer 'Y'; reload the data first.
loans[target] = loans[target].apply(lambda x: 1 if x == 'Y' else -1)

Is target column fully populated with proper data:  True


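* **Subsample dataset to make sure classes are balanced** (currently disabled; the full filtered dataset is used)
* **Remove rows with null values in the income, loan amount, loan term, credit history and property area columns; rows with nulls in the other categorical columns are kept**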

In [397]:
# Drop rows that have a missing value in any of the columns below, one column
# at a time, and print the number of remaining rows after each step.
# Gender, Married, Education and Self_Employed are intentionally not filtered,
# so rows with missing values in those columns stay in the dataset.
for column in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
               'Loan_Amount_Term', 'Credit_History', 'Property_Area']:
    loans = loans[pd.notnull(loans[column])]
    print(len(loans))


614
614
592
578
529
529


In [398]:
# Class balancing (undersampling the safe loans by the risky/safe size ratio so
# that both classes have the same number of rows) is currently disabled; the
# filtered frame is used as-is.
#
# Take an explicit copy: `loans` is a filtered selection of the loaded frame and
# the following cells assign encoded columns into `loans_data`. Writing into a
# copy avoids pandas' SettingWithCopyWarning and leaves `loans` unmodified.
loans_data = loans.copy()

### Changing categorical values to numerical values

**Gender**

In [399]:
def fun_Gender(x):
    """Encode a Gender value as an integer code.

    Returns 1 for 'Male', 2 for 'Female' and 0 for every other value
    (for example a missing entry).
    """
    for label, code in (('Male', 1), ('Female', 2)):
        if x == label:
            return code
    return 0
# Encode Gender in both frames with the same mapping, then preview the training column.
loans_data['Gender'] = loans_data['Gender'].apply(fun_Gender)
test_data['Gender'] = test_data['Gender'].apply(fun_Gender)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Gender'].head())

1    1
2    1
3    1
4    1
5    1
Name: Gender, dtype: int64


**Married**

In [400]:
def fun_Married(x):
    """Encode a Married value as an integer code.

    Returns 1 for 'Yes', 2 for 'No' and 0 for every other value
    (for example a missing entry).
    """
    for label, code in (('Yes', 1), ('No', 2)):
        if x == label:
            return code
    return 0
# Encode Married in both frames with the same mapping, then preview the training column.
loans_data['Married'] = loans_data['Married'].apply(fun_Married)
test_data['Married'] = test_data['Married'].apply(fun_Married)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Married'].head())

1    1
2    1
3    1
4    2
5    1
Name: Married, dtype: int64


**Dependents**

In [401]:
import numbers

def fun_Dependents(x):
    """Encode a Dependents value (given as a string) as an integer.

    '0', '1' and '2' map to 0, 1 and 2; the open-ended bucket '3+' maps to 3.
    Every other value (for example a missing entry) maps to 0.
    """
    for label, code in (('0', 0), ('1', 1), ('2', 2), ('3+', 3)):
        if x == label:
            return code
    return 0

# Encode Dependents in both frames with the same mapping, then show the training column.
loans_data['Dependents'] = loans_data['Dependents'].apply(fun_Dependents)
test_data['Dependents'] = test_data['Dependents'].apply(fun_Dependents)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Dependents'])

1      1
2      0
3      0
4      0
5      2
6      0
7      3
8      2
9      1
10     2
11     2
12     2
13     0
14     2
15     0
17     0
18     0
20     0
21     1
22     0
23     2
25     0
26     0
27     2
28     0
29     2
31     0
32     1
33     0
34     3
      ..
581    0
582    0
584    1
585    1
586    0
587    0
588    0
589    2
590    0
591    2
592    3
593    0
594    0
595    0
596    2
597    0
598    0
599    2
601    0
602    3
603    0
604    1
606    1
607    2
608    0
609    0
610    3
611    1
612    2
613    0
Name: Dependents, dtype: int64


**Education**

In [402]:
def fun_Education(x):
    """Encode an Education value as an integer code.

    Returns 1 for 'Graduate', 2 for 'Not Graduate' and 0 for every other
    value (for example a missing entry).
    """
    for label, code in (('Graduate', 1), ('Not Graduate', 2)):
        if x == label:
            return code
    return 0
# Encode Education in both frames with the same mapping, then preview the training column.
loans_data['Education'] = loans_data['Education'].apply(fun_Education)
test_data['Education'] = test_data['Education'].apply(fun_Education)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Education'].head())

1    1
2    1
3    2
4    1
5    1
Name: Education, dtype: int64


**Self_Employed**

In [403]:
def fun_Self_Employed(x):
    """Encode a Self_Employed value as an integer code.

    Returns 1 for 'Yes', 2 for 'No' and 0 for every other value
    (for example a missing entry).
    """
    for label, code in (('Yes', 1), ('No', 2)):
        if x == label:
            return code
    return 0
# Encode Self_Employed in both frames with the same mapping, then preview the training column.
loans_data['Self_Employed'] = loans_data['Self_Employed'].apply(fun_Self_Employed)
test_data['Self_Employed'] = test_data['Self_Employed'].apply(fun_Self_Employed)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Self_Employed'].head())

1    2
2    1
3    2
4    2
5    1
Name: Self_Employed, dtype: int64


**ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term & Credit_History**

In [404]:
def _clamp_non_negative(x):
    """Return x when `x >= 0` holds, otherwise 0.

    NaN fails the comparison, so missing values are replaced by 0 as well.
    """
    return x if x >= 0 else 0


# Numeric columns that get the same clean-up in the training and the test frame.
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                   'Loan_Amount_Term', 'Credit_History']

for column in numeric_columns:
    loans_data[column] = loans_data[column].apply(_clamp_non_negative)
    test_data[column] = test_data[column].apply(_clamp_non_negative)

# Preview each cleaned training column.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for column in numeric_columns:
    print(loans_data[column].head())

1    4583
2    3000
3    2583
4    6000
5    5417
Name: ApplicantIncome, dtype: int64
1    1508
2       0
3    2358
4       0
5    4196
Name: CoapplicantIncome, dtype: float64
1    128
2     66
3    120
4    141
5    267
Name: LoanAmount, dtype: float64
1    360
2    360
3    360
4    360
5    360
Name: Loan_Amount_Term, dtype: float64
1    1
2    1
3    1
4    1
5    1
Name: Credit_History, dtype: float64


**Property_Area**

In [405]:
def fun_Property_Area(x):
    """Encode a Property_Area value as an integer code.

    Returns 1 for 'Urban', 2 for 'Rural', 3 for 'Semiurban' and 0 for every
    other value (for example a missing entry).
    """
    for label, code in (('Urban', 1), ('Rural', 2), ('Semiurban', 3)):
        if x == label:
            return code
    return 0
# Encode Property_Area in both frames with the same mapping, then preview the training column.
loans_data['Property_Area'] = loans_data['Property_Area'].apply(fun_Property_Area)
test_data['Property_Area'] = test_data['Property_Area'].apply(fun_Property_Area)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(loans_data['Property_Area'].head())

1    2
2    1
3    1
4    1
5    1
Name: Property_Area, dtype: int64


## Training the model

In [406]:
# Training inputs (the selected feature columns) and the matching label column.
X, y = loans_data[features], loans_data[target]

In [407]:
# Logistic-regression classifier created with its default constructor arguments
# and fitted on the prepared training data.
model = LogisticRegression()
model.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

Test Data

## Making predictions using the model

In [408]:
# Select the same feature columns, in the same order, that were used for training.
X_test = test_data[features]
# Predicted labels use the training encoding: 1 stands for Loan_Status 'Y', -1 for 'N'.
# (model.predict_proba(X_test) can be used instead to inspect class probabilities.)
predictions = model.predict(X_test)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(predictions)

[ 1  1  1 -1  1  1  1 -1  1  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1
 -1 -1  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1
  1  1  1  1  1 -1  1  1 -1  1  1  1  1 -1  1  1 -1 -1  1 -1  1  1  1  1  1
  1  1  1  1  1 -1  1 -1  1 -1  1  1  1  1  1 -1  1  1  1 -1  1  1  1  1 -1
  1 -1  1  1 -1  1 -1  1  1  1  1  1  1  1  1 -1  1 -1 -1 -1  1  1  1 -1 -1
  1 -1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1  1 -1 -1  1  1  1 -1  1  1
  1  1  1 -1  1  1  1  1  1  1  1 -1  1  1 -1 -1 -1  1 -1  1  1  1  1 -1 -1
  1  1 -1  1 -1  1  1  1  1  1 -1  1  1  1  1  1  1 -1 -1  1  1 -1  1 -1  1
  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1 -1
  1  1  1  1 -1  1  1  1  1  1 -1 -1  1  1  1  1 -1  1 -1  1 -1  1  1  1  1
 -1  1  1  1  1 -1  1  1  1 -1  1  1 -1  1  1 -1 -1  1 -1  1  1  1  1 -1 -1
  1  1  1 -1  1  1  1 -1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1  1
  1 -1  1  1  1 -1  1  1  1  1  1 -1  1  1  1  1  1 -1  1  1  1  1  1  1  1
 -1  1  1  1

## Writing the submission file

In [409]:
# Wrap the predicted labels in a Series and translate them back to the
# submission format: 1 -> 'Y', every other value -> 'N'.
predictionsS = pd.Series(predictions)
loan_status_predictions = predictionsS.map(lambda label: 'Y' if label == 1 else 'N')
# Loan identifiers of the test rows, taken from the test frame.
loan_id = test_data['Loan_ID']

In [410]:
# Build the two-column submission table and write it with pandas.
# The values are paired by position (via list(...)), so the result does not
# depend on the index labels of `loan_id` or `loan_status_predictions`.
# DataFrame.to_csv works on both Python 2 and Python 3, unlike the previous
# combination of open(..., 'wb'), csv.writer and xrange.
submission = pd.DataFrame({'Loan_ID': list(loan_id),
                           'Loan_Status': list(loan_status_predictions)},
                          columns=['Loan_ID', 'Loan_Status'])
submission.to_csv('submission.csv', index=False)

In [411]:
#print len(loans)

#x = np.arange(0, len(loans['ApplicantIncome'])) 
#x = loans['CoapplicantIncome']
#y = loans['Dependents']

#ll = plt.plot(x,y)
#plt.show()