## Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the labelled training table and split it by position:
# every column except the last is a feature, the last column is the target.
data = pd.read_csv('train.csv')
x_train = data.iloc[:, :-1]
y_train = data.iloc[:, -1]

# The evaluation features and labels are stored in two separate files.
x_test, y_test = (pd.read_csv(path) for path in ('x_test.csv', 'y_test.csv'))

In [None]:
# Preview the first five rows of the raw training table.
data.head(5)

Unnamed: 0,loan_id,age,education,proof_submitted,loan_amount,asset_cost,no_of_loans,no_of_curr_loans,last_delinq_none,loan_default
0,1,27,1.0,Aadhar,504264,820920,2,2,0,0
1,2,48,1.0,Aadhar,728556,831444,6,2,0,0
2,3,30,2.0,VoterID,642936,826092,0,0,0,1
3,4,28,1.0,Aadhar,746556,930924,0,0,0,0
4,5,29,1.0,Aadhar,1139880,1902000,0,0,0,0


In [None]:
# Column names of the training table as a plain Python list.
list(data.columns)

['loan_id',
 'age',
 'education',
 'proof_submitted',
 'loan_amount',
 'asset_cost',
 'no_of_loans',
 'no_of_curr_loans',
 'last_delinq_none',
 'loan_default']

## Pre-processing

### Null values

In [None]:
# Count missing values per training feature column.
x_train.isna().sum()

loan_id               0
age                   0
education           245
proof_submitted       0
loan_amount           0
asset_cost            0
no_of_loans           0
no_of_curr_loans      0
last_delinq_none      0
dtype: int64

In [None]:
# Fill missing numeric values with the training-set column means.
# numeric_only=True skips the string column 'proof_submitted'; without it,
# mean() emits a warning on older pandas and raises TypeError on pandas 2.x.
x_train = x_train.fillna(x_train.mean(numeric_only=True))
# Alternative: drop incomplete rows instead of imputing. x_train and y_train
# must then be filtered with the same row mask so they stay aligned.

  x_train=x_train.fillna(data.mean())


In [None]:
# Re-check the training features: every column should now report zero missing values.
x_train.isna().sum()

loan_id             0
age                 0
education           0
proof_submitted     0
loan_amount         0
asset_cost          0
no_of_loans         0
no_of_curr_loans    0
last_delinq_none    0
dtype: int64

In [None]:
# Count missing values per test feature column.
x_test.isna().sum()

loan_id              0
age                  0
education           92
proof_submitted      0
loan_amount          0
asset_cost           0
no_of_loans          0
no_of_curr_loans     0
last_delinq_none     0
dtype: int64

In [None]:
# Impute the test features with the TRAINING column means, so no statistic is
# learned from the test set (avoids leakage) and both sets are filled with the
# same value. numeric_only=True skips the string column 'proof_submitted',
# which would otherwise make mean() warn or raise depending on the pandas version.
x_test = x_test.fillna(x_train.mean(numeric_only=True))

  x_test = x_test.fillna(x_test.mean())


### Duplicates

In [None]:
# Number of training rows that repeat an earlier row exactly.
data.duplicated(keep='first').sum()

0

In [None]:
# Number of test rows that repeat an earlier row exactly.
x_test.duplicated(keep='first').sum()

0

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical 'proof_submitted' column as integer codes.
# The encoder is fitted on the training data ONLY and then reused for the test
# data, so a given category maps to the same integer in both sets. Re-fitting
# on the test set could assign different codes to the same category.
lb = LabelEncoder()
x_train['proof_submitted'] = lb.fit_transform(x_train['proof_submitted'])
# transform() raises ValueError if the test set contains a category that was
# never seen during fitting.
x_test['proof_submitted'] = lb.transform(x_test['proof_submitted'])

In [None]:
# Preview the training features after label encoding.
x_train.head(5)

Unnamed: 0,loan_id,age,education,proof_submitted,loan_amount,asset_cost,no_of_loans,no_of_curr_loans,last_delinq_none
0,1,27,1.0,0,504264,820920,2,2,0
1,2,48,1.0,0,728556,831444,6,2,0
2,3,30,2.0,4,642936,826092,0,0,0
3,4,28,1.0,0,746556,930924,0,0,0
4,5,29,1.0,0,1139880,1902000,0,0,0


### Standardization
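The feature values are on very different scales (they are not confined to the range 0 to 1), so each feature is standardized to zero mean and unit variance.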

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance.
# The scaler is fitted on the training data only and then applied to the test
# data. The results are assigned back: previously the scaled arrays were
# discarded, so every model below was trained on the unscaled features.
# Wrapping in DataFrame keeps the column names and row index.
sc = StandardScaler()
x_train = pd.DataFrame(sc.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(sc.transform(x_test), columns=x_test.columns, index=x_test.index)

# NOTE(review): 'loan_id' is still among the features at this point; it looks
# like a row identifier, so consider dropping it before modeling.
x_test.head()

array([[ 1.73229826, -0.93533993, -1.15299378, ..., -0.1559498 ,
         0.28726461, -0.11603707],
       [ 1.73279313, -1.0671416 , -1.15299378, ...,  0.02681407,
         0.74406886, -0.11603707],
       [ 1.733288  , -1.0671416 , -1.15299378, ..., -0.52147754,
        -0.62634388, -0.11603707],
       ...,
       [ 3.21542864,  0.90988338, -1.15299378, ..., -0.1559498 ,
         0.28726461, -0.11603707],
       [ 3.21592351, -0.53993494,  0.89876412, ..., -0.52147754,
        -0.62634388, -0.11603707],
       [ 3.21641838,  0.77808172,  0.89876412, ...,  0.39234181,
         1.2008731 , -0.11603707]])

## Creating a model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=5; fit() returns the estimator itself,
# so construction and training are chained.
lc = LogisticRegression(C=5).fit(x_train, y_train)
y_predict = lc.predict(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 120 entropy-split trees.
# random_state pins the bootstrap sampling and feature selection so that a
# Restart & Run All reproduces the same model and predictions.
rf = RandomForestClassifier(
    n_estimators=120,
    criterion='entropy',
    max_depth=100,
    min_samples_split=7,
    random_state=42,
)
rf.fit(x_train, y_train)
y_predict = rf.predict(x_test)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings.
svc_model = SVC()
y_predict = svc_model.fit(x_train, y_train).predict(x_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 4-nearest-neighbours vote using brute-force search, uniform weights and
# Chebyshev distance.
knn = KNeighborsClassifier(
    n_neighbors=4,
    algorithm='brute',
    weights='uniform',
    metric='chebyshev',
).fit(x_train, y_train)
y_predict = knn.predict(x_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator.
nb = GaussianNB().fit(x_train, y_train)
y_predict = nb.predict(x_test)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (100 and 50 units).
# random_state fixes the weight initialisation and batch shuffling so the
# result is reproducible across runs.
nn = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42)
nn.fit(x_train, y_train)
y_predict = nn.predict(x_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with 300 boosting stages.
# Fit and predict with `gbc` itself: the cell previously created `gbc` but then
# called fit/predict on the MLP (`nn`), so the boosting model was never used.
gbc = GradientBoostingClassifier(n_estimators=300, random_state=42)
gbc.fit(x_train, y_train)
y_predict = gbc.predict(x_test)

In [None]:
# Reduce the loaded y_test table to its last column, giving a 1-D label Series.
# The isinstance guard keeps the cell re-runnable: once y_test is already a
# Series, a second `.iloc[:, -1]` would raise an IndexingError.
# NOTE(review): the recorded evaluation (precision 1.000, recall 0.005,
# accuracy 0.005) implies almost every value in this column is the positive
# class - confirm that the last column of y_test.csv really is the label.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.iloc[:, -1]

## Evaluation

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Score the current `y_predict` against the test labels. Each model cell
# reassigns `y_predict`, so this reports whichever model was run last.
n_error = (y_predict != y_test).sum()
acc = accuracy_score(y_test, y_predict)
f1 = f1_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
prec = precision_score(y_test, y_predict)

print("No. of errors in the predicted values are {} out of {} values".format(n_error, y_test.shape[0]))

# One (template, value) pair per metric, printed in a fixed order.
metric_lines = (
    ("The accuracy is {:.3f}", acc),
    ("The f1 score is {:.3f}", f1),
    ("The Precision is {:.3f}", prec),
    ("The recall is {:.3f}", recall),
)
for template, value in metric_lines:
    print(template.format(value))

No. of errors in the predicted values are 2984 out of 3000 values
The accuracy is 0.005
The f1 score is 0.011
The Precision is 1.000
The recall is 0.005
