In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the loan-prediction training data from the Kaggle input directory

train_csv_path = "/kaggle/input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"
df = pd.read_csv(filepath_or_buffer=train_csv_path)

# Preview the first rows of the loaded frame
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

So there are 614 rows and 13 columns in the dataset.



In [None]:
# Describing the dataset: summary statistics of the columns

df.describe()

In [None]:
# Getting info about the dataset: column names, non-null counts and dtypes

df.info()

In [None]:
# Count the null (missing) values in each column

df.isnull().sum()

We can see that the data has some missing values.

In [None]:
# Distinct values present in the Gender column (missing values included)
df['Gender'].unique()

In [None]:
# Frequency of each Gender category (used to choose a fill value for the gaps)
df['Gender'].value_counts()

In [None]:
# Impute missing Gender entries with the constant "Female", then preview the frame.
# NOTE(review): the fill value is hard-coded rather than derived from the data
# (e.g. the mode) - confirm this is the intended choice.
gender_missing = df['Gender'].isnull()
df.loc[gender_missing, 'Gender'] = "Female"
df.head()

In [None]:
# Number of missing Gender values remaining after the fill
df['Gender'].isnull().sum()

In [None]:
# Frequency of each Married category
df['Married'].value_counts()

In [None]:
# Impute missing Married entries with the constant "No".
# NOTE(review): hard-coded fill value - confirm it is the intended choice.
married_missing = df['Married'].isnull()
df.loc[married_missing, 'Married'] = "No"

In [None]:
# Frequency of each Dependents category
df['Dependents'].value_counts()

In [None]:
# Impute missing Dependents entries with the constant "3+".
# NOTE(review): hard-coded fill value - confirm it is the intended choice.
dependents_missing = df['Dependents'].isnull()
df.loc[dependents_missing, 'Dependents'] = "3+"

In [None]:
# Frequency of each Self_Employed category
df['Self_Employed'].value_counts()

In [None]:
# Impute missing Self_Employed entries with the constant "Yes".
# NOTE(review): hard-coded fill value - confirm it is the intended choice.
self_employed_missing = df['Self_Employed'].isnull()
df.loc[self_employed_missing, 'Self_Employed'] = "Yes"

In [None]:
# Impute missing LoanAmount entries with the column mean
loan_amount_mean = df['LoanAmount'].mean()
df['LoanAmount'] = df['LoanAmount'].fillna(value=loan_amount_mean)

In [None]:
# Impute missing Loan_Amount_Term entries with the column mean
loan_term_mean = df['Loan_Amount_Term'].mean()
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(value=loan_term_mean)

In [None]:
# Frequency of each Credit_History value
df['Credit_History'].value_counts()

In [None]:
# Impute missing Credit_History entries with the constant 0.0.
# NOTE(review): hard-coded fill value - confirm it is the intended choice.
credit_history_missing = df['Credit_History'].isnull()
df.loc[credit_history_missing, 'Credit_History'] = 0.0

In [None]:
# Re-count the missing values per column after the fills above
df.isnull().sum()

After handling the missing data column by column, we can see that there are no null values left, so we can carry out the remaining analysis smoothly.

### Preprocessing the Data

In [None]:
# Let's separate the categorical and numerical columns into two lists of Series:
#   num_data -> columns with a numeric dtype
#   cat_data -> every other column (text / object columns)
# Testing "is numeric" instead of `dtype == object` keeps the split correct when
# pandas stores text in a dedicated string dtype rather than in object.

cat_data = []
num_data = []

for position, dtype in enumerate(df.dtypes):
    column = df.iloc[:, position]
    if pd.api.types.is_numeric_dtype(dtype):
        num_data.append(column)
    else:
        cat_data.append(column)

In [None]:
# Converting the two lists of Series into DataFrames, one column per Series.
# pd.concat(axis=1) keeps each column's own dtype; building a row-wise frame and
# transposing it coerces the columns to one common dtype (e.g. int -> float).

cat_data = pd.concat(cat_data, axis=1)
num_data = pd.concat(num_data, axis=1)

In [None]:
# Numerical data: preview the first rows of the numeric columns

num_data.head()

In [None]:
# True if any column of num_data still contains a missing value
num_data.isnull().sum().any()

Since we have already handled all the missing values, the check returns False.

In [None]:
# Categorical data: preview the first rows of the non-numeric columns

cat_data.head()

In [None]:
# True if any column of cat_data still contains a missing value
cat_data.isnull().sum().any()

Since we have already handled all the missing values, the check returns False.

In [None]:
# Dropping the ID column from cat_data.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a KeyError.
cat_data = cat_data.drop(columns='Loan_ID', errors='ignore')

Using Label Encoder for the Categorical Columns

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encoder instance used to turn category labels into integer codes
le = LabelEncoder()

In [None]:
# Transforming the categorical columns into integer codes.
# Every column gets its own fitted LabelEncoder, kept in `label_encoders`, so the
# codes can later be mapped back with label_encoders[col].inverse_transform(...).
# Refitting one shared encoder for each column would keep only the classes of the
# last column encoded.

label_encoders = {}
for col in cat_data.columns:
    label_encoders[col] = LabelEncoder()
    cat_data[col] = label_encoders[col].fit_transform(cat_data[col])

In [None]:
# Preview the categorical columns after label encoding
cat_data.head()

In [None]:
# Lookup table for recoding a float flag (1.0 / 0.0) as an integer flag (1 / 0)
mapping = {flag: int(flag) for flag in (1.0, 0.0)}

In [None]:
# Recode Credit_History from 1.0 / 0.0 to the integers 1 / 0 via `mapping`.
# The column is replaced with plain [] assignment: in pandas >= 2.0,
# `.loc[:, col] = ...` writes into the existing float column in place, which leaves
# the dtype as float and turns the recoding into a no-op.
num_data["Credit_History"] = num_data["Credit_History"].map(mapping)

In [None]:
# Preview the numeric columns after recoding Credit_History
num_data.head()

In [None]:
# Re-assemble the full frame column-wise: encoded categorical columns first,
# numeric columns after them

df = pd.concat(objs=[cat_data, num_data], axis='columns')

In [None]:
# Preview the recombined, fully processed frame
df.head()

## Training

In [None]:
# Target is the Loan_Status column; every other column is used as a feature
y = df.loc[:, 'Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [None]:
# Splitting the Data into train-test split

from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows as a test split (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of every split
for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(split_name + ' shape', split.shape)


In [None]:
from sklearn import preprocessing

# Standardise the features with a StandardScaler.
# The scaler is fitted on the training split only; the same fitted scaler is then
# applied to both splits so that X_test ends up on the same scale as X_train.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Using the various model for training

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# Candidate classifiers keyed by display name; seeds are fixed where the estimator
# accepts one so that runs are repeatable
models = dict(
    LogisticRegression=LogisticRegression(random_state=42),
    KNeighborsClassifier=KNeighborsClassifier(),
    SVC=SVC(random_state=42),
    DecisionTreeClassifier=DecisionTreeClassifier(max_depth=5, random_state=42),
)

In [None]:
# Building the Functions

from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, accuracy_score

def loss(y_true, y_pred, retu=False):
    """Compute precision, recall, F1, log loss and accuracy for predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; expected to be binary and encoded as 0 / 1.
    y_pred : array-like
        Predicted labels (0 / 1), e.g. the output of ``model.predict``.
    retu : bool, default False
        When True the metrics are returned; otherwise they are printed.

    Returns
    -------
    tuple or None
        ``(precision, recall, f1, log_loss, accuracy)`` if ``retu`` is True,
        otherwise ``None`` (the metrics are printed instead).

    Notes
    -----
    The log loss is computed from the values in ``y_pred`` as given. When those are
    hard class labels rather than probabilities, the figure mainly reflects the
    error rate and should not be read as a calibrated probabilistic loss.
    """
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Pass the label set explicitly so log_loss does not raise when y_true happens
    # to contain a single class. The result is stored under a name that does not
    # shadow this function.
    logloss = log_loss(y_true, y_pred, labels=[0, 1])
    acc = accuracy_score(y_true, y_pred)

    if retu:
        return pre, rec, f1, logloss, acc
    else:
        print('  pre: %.4f\n  rec: %.4f\n  f1: %.4f\n  loss: %.4f\n  acc: %.4f' % (pre, rec, f1, logloss, acc))

In [None]:
# Evaluating the model

def train_eval(models, X, y):
    """Fit every model on (X, y) and print its metrics on that same data.

    ``models`` maps a display name to an estimator. The metrics are printed by
    ``loss`` and are therefore computed on the data each model was fitted on.
    """
    separator = '#' * 40
    for label in models:
        estimator = models[label]
        print(label, ':')
        estimator.fit(X, y)
        fitted_predictions = estimator.predict(X)
        loss(y, fitted_predictions)
        print(separator)

In [None]:
# Fit each model on the training split and print its metrics on that same split
train_eval(models, X_train, y_train)

In [None]:
# Using Stratified K-Fold to split the training data into 10 shuffled folds
# (seeded so the fold assignment is reproducible)

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

def train_eval(models, X, y, folds):
    """Cross-validate every model and print its mean metrics over the folds.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    X : array-like or DataFrame
        Feature matrix.
    y : array-like, Series or single-column DataFrame
        Target labels; flattened to one dimension before use.
    folds : splitter
        Object whose ``split(X, y)`` yields ``(train_indices, test_indices)`` pairs.

    Returns
    -------
    None
        For each model, the fold-averaged precision, recall, F1, log loss and
        accuracy (as returned by ``loss``) are printed.
    """
    # The fold indices are positional, so wrap the inputs in pandas objects and
    # index them with iloc (a NumPy array has no iloc).
    X = pd.DataFrame(X)
    # Flatten y to a 1-D Series: estimators expect a 1-D target and emit a
    # DataConversionWarning when they receive a single-column DataFrame.
    y = pd.Series(np.asarray(y).ravel())

    idx = [' pre', ' rec', ' f1', ' loss', ' acc']
    for name, model in models.items():
        fold_scores = []
        print(name, ':')

        for train, test in folds.split(X, y):
            model.fit(X.iloc[train], y.iloc[train])
            y_pred = model.predict(X.iloc[test])
            fold_scores.append(loss(y.iloc[test], y_pred, retu = True))

        # Average each metric across the folds and print them as a labelled column
        print(pd.DataFrame(np.array(fold_scores).mean(axis = 0), index = idx)[0])

        print('#'*40)

In [None]:
# Cross-validate every model on the training split using the stratified folds
train_eval(models, X_train, y_train, skf)


If you liked this notebook, please upvote.