In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import itertools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the loan payments CSV into the working frame and preview the first rows.
loan_csv_path = '../input/loandata/Loan payments data.csv'
df = pd.read_csv(loan_csv_path)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show the column labels available in the frame.
df.columns

In [None]:
# Per column: True if it contains at least one missing value.
df.isna().any()

In [None]:
# Display the rows that contain no missing values.
# NOTE(review): dropna() returns a new frame and the result is not assigned here,
# so `df` itself is left unchanged and still contains any missing values.
df.dropna()

In [None]:
# Convert both date columns to datetime so the `.dt` accessor can be used on them.
for date_col in ('due_date', 'effective_date'):
    df[date_col] = pd.to_datetime(df[date_col])
df.head()

In [None]:
# Number of rows per loan_status value (loan_status is the column used as the target `y` below).
df['loan_status'].value_counts()

In [None]:
# Principal histograms: one panel per Gender, bars coloured by loan_status.
# Ten evenly spaced edges between the smallest and largest Principal give nine bins.
principal_min, principal_max = df['Principal'].min(), df['Principal'].max()
bins = np.linspace(principal_min, principal_max, 10)
g = sns.FacetGrid(df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2)
g.map(plt.hist, 'Principal', bins=bins, ec="k")
g.axes[-1].legend()  # legend is attached to the last panel only
plt.show()

In [None]:
# Age histograms: one panel per Gender, bars coloured by loan_status.
# Ten evenly spaced edges between the youngest and oldest age give nine bins.
age_min, age_max = df['age'].min(), df['age'].max()
bins = np.linspace(age_min, age_max, 10)
g = sns.FacetGrid(df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2)
g.map(plt.hist, 'age', bins=bins, ec="k")
g.axes[-1].legend()  # legend is attached to the last panel only
plt.show()

In [None]:
# Derive the weekday of the loan's effective date (pandas numbers Monday as 0 ... Sunday as 6).
df['dayofweek'] = df['effective_date'].dt.dayofweek

# Weekday histograms: one panel per Gender, bars coloured by loan_status.
day_min, day_max = df['dayofweek'].min(), df['dayofweek'].max()
bins = np.linspace(day_min, day_max, 10)
g = sns.FacetGrid(df, col="Gender", hue="loan_status", palette="Set1", col_wrap=2)
g.map(plt.hist, 'dayofweek', bins=bins, ec="k")
g.axes[-1].legend()  # legend is attached to the last panel only
plt.show()

In [None]:
# Proportion of each loan_status within every Gender group (normalize=True yields shares, not counts).
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

In [None]:
# Binary flag: 1 when dayofweek is greater than 3, else 0.
# With pandas' Monday=0 numbering that marks Friday, Saturday and Sunday.
df['weekend'] = df['dayofweek'].gt(3).astype('int64')
df.head()

In [None]:
# Encode Gender numerically: 'male' -> 0, 'female' -> 1 (any other value is left as-is).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['Gender'] selection: that chained in-place
# form is deprecated in pandas 2.x and does not update `df` under Copy-on-Write.
df['Gender'] = df['Gender'].replace(to_replace=['male', 'female'], value=[0, 1])
df.head()

In [None]:
# Proportion of each loan_status within every education group (normalize=True yields shares, not counts).
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

In [None]:
# Preview the first rows of a subset of columns (education is still in its original, un-encoded form here).
df[['Principal','terms','age','Gender','education']].head()

In [None]:
# Assemble the model input frame: five existing columns plus one-hot education
# indicators, leaving out the 'Master or Above' indicator column.
education_dummies = pd.get_dummies(df['education']).drop(columns=['Master or Above'])
base_columns = df[['Principal', 'terms', 'age', 'Gender', 'weekend']]
Feature = pd.concat([base_columns, education_dummies], axis=1)
Feature.head()

In [None]:
# Use the assembled feature frame as the model input and keep its column
# labels for later reference.
feature_names = Feature.columns
X = Feature
print(feature_names)


In [None]:
# Target vector: the loan_status label of every row, taken via `.values`.
y = df['loan_status'].values


In [None]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows here, before any train/validation split.
X = preprocessing.StandardScaler().fit_transform(X)


In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=1)

# Re-standardize using statistics from the training rows only, so the validation
# rows no longer influence the scaling (avoids train/validation leakage from
# scaling the full data set before splitting). Standardization is unaffected by an
# earlier per-feature shift/rescale, so this matches fitting the scaler on the raw
# training rows.
train_scaler = preprocessing.StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_val = train_scaler.transform(X_val)

**1. K-Nearest Neighbors (KNN)**

In [None]:
# Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Validation accuracy of a uniform-weight KNN classifier for every K from 1 to 49
# (range(1, 50) excludes 50). `accuracy[j]` holds the score for K = j + 1.
k_values = range(1, 50)
accuracy = [
    accuracy_score(
        y_val,
        KNeighborsClassifier(n_neighbors=k, weights='uniform')
        .fit(X_train, y_train)
        .predict(X_val),
    )
    for k in k_values
]

In [None]:
# Accuracy vs K-value plot for the sweep stored in `accuracy` (K = 1..49).
k_axis = list(range(1, 50))
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(k_axis, accuracy)
ax.set_xticks(k_axis)                       # one tick per K value
ax.tick_params(axis='x', labelrotation=45)  # rotate the tick labels
ax.set_title("Accuracy vs K-value")
ax.set_xlabel('Number of Neighbors (K)')
ax.set_ylabel('Accuracy')
ax.grid()

In [None]:
# Fit the final KNN model and report its accuracy on the validation split.
# NOTE(review): K = 19 is hard-coded, presumably read off the accuracy-vs-K plot — confirm.
knn_clf = KNeighborsClassifier(n_neighbors = 19)
knn_clf.fit(X_train, y_train)
pred_knn = knn_clf.predict(X_val)

# Stored under its own name so the `accuracy` list from the K sweep is not
# overwritten with a scalar (which would break re-running the plot cell).
knn_accuracy = accuracy_score(y_val, pred_knn)
print("accuracy: ", knn_accuracy*100,'%')

**2. Decision Tree**

In [None]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 10-fold cross-validated search over decision-tree settings on the
# training split. Note that the random seed is included as a searched parameter.
par_dt = {
    'max_depth': list(range(1, 11)),
    'max_features': list(range(1, 9)),
    'random_state': [0, 10, 20, 40, 60, 100],
    'criterion': ['gini', 'entropy'],
}

grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid=par_dt, cv=10)
grid_dt.fit(X_train, y_train)

# Report the best mean cross-validation score and the setting that achieved it.
print(f"Best Score: {grid_dt.best_score_}")
print(f"Best parameters: {grid_dt.best_params_}")

In [None]:
# Fit the final decision tree and report its accuracy on the validation split.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from an
# earlier grid-search run — confirm they still match grid_dt.best_params_.
dt_clf = DecisionTreeClassifier(max_depth = 6, criterion = 'entropy', max_features = 4, random_state = 20)
dt_clf.fit(X_train, y_train)
pred_dt = dt_clf.predict(X_val)

# Stored under its own name so the `accuracy` list from the KNN K sweep is not
# overwritten with a scalar (which would break re-running the plot cell).
dt_accuracy = accuracy_score(y_val, pred_dt)
print("accuracy: ", dt_accuracy*100,'%')

**3. Support Vector Machine**

In [None]:
# Import Support Vector Machine Classifier
from sklearn.svm import SVC

In [None]:
# Exhaustive 10-fold cross-validated search over SVM settings on the training split.
par_svm = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['linear', 'rbf', 'poly'],
)

grid_svm = GridSearchCV(SVC(), param_grid=par_svm, cv=10)
grid_svm.fit(X_train, y_train)

# Report the best mean cross-validation score and the setting that achieved it.
print(f"Best Score: {grid_svm.best_score_}")
print(f"Best parameters: {grid_svm.best_params_}")

In [None]:
# Fit the final SVM and report its accuracy on the validation split.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from an
# earlier grid-search run — confirm they still match grid_svm.best_params_.
svm_clf = SVC(C = 0.1, gamma = 1, kernel = 'linear')
svm_clf.fit(X_train, y_train)

# Predict with the SVM itself. Previously this called dt_clf.predict, so the
# accuracy printed here was the decision tree's, not the SVM's.
pred_svm = svm_clf.predict(X_val)

# Stored under its own name so the `accuracy` list from the KNN K sweep is not
# overwritten with a scalar (which would break re-running the plot cell).
svm_accuracy = accuracy_score(y_val, pred_svm)
print("accuracy: ", svm_accuracy*100,'%')

**4. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Binary targets for logistic regression: 1 where the label is 'PAIDOFF', 0 for
# every other status. The builtin `int` is used as the dtype because the
# `np.int` alias was removed from NumPy (1.24+) and raises AttributeError there.
y_train_binary = (y_train == 'PAIDOFF').astype(int)
y_val_binary = (y_val == 'PAIDOFF').astype(int)
y_train_binary[0:5]

In [None]:
# Fit logistic regression on the binary (PAIDOFF vs. rest) target and report
# its accuracy on the validation split.
LR_clf = LogisticRegression(random_state = 0)
LR_clf.fit(X_train, y_train_binary)

pred_LR = LR_clf.predict(X_val)

# Stored under its own name so the `accuracy` list from the KNN K sweep is not
# overwritten with a scalar (which would break re-running the plot cell).
lr_accuracy = accuracy_score(y_val_binary, pred_LR)
print(pred_LR)
print("accuracy: ", lr_accuracy*100,'%')