In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Logistic Regression on Personal Loan Dataset**

In this kernel, I try to find out whether customers who hold a securities account will take a personal loan.

The first step is understanding the dataset. Therefore, I need to do EDA on the data and try to find the relationships between the variables. During the EDA I may come across inconsistencies in the data; I will try to solve these problems and eliminate the inconsistencies.

The second step is creating the classification model. I will use logistic regression for that part. In this kernel, I will not use the sklearn library to build the model; instead, I will implement logistic regression myself, using gradient descent on the cost function. (sklearn is only used at the end to check the result.)

That's all; let's start with the first step.

In [None]:
# Import the necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the Personal Loan CSV file into a pandas DataFrame
DATA_PATH = "../input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the table: first five rows, a blank separator line, then the last five rows
first_rows, last_rows = df.head(), df.tail()
display(first_rows)
print()
display(last_rows)

***Columns Description:***

There are no descriptions of the columns. Still, I have tried to explain the meaning of each column based on my own understanding.

Columns Description : 
* ID : Customer account number
* Age : Age of the Customers
* Experience : I think that is the working experience
* Income : I think daily income for the customers
* ZIP Code : ZIP Code
* Family : Number of households
* CCAvg : I think that is the average daily spending from credit card
* Education : Education level
* Mortgage : I think that is the monthly mortgage payments
* Personal Loan : Personal loan ownership
* Securities Account : Securities account ownership
* CD account : ?
* Online : Does he or she have an online transaction? ( I am not confident that one)
* CreditCard : Credit card ownership

Before diving in, I need to exclude some variables that I do not need.

In [None]:
# Work on a copy of the data without the identifier-like columns, then show what is left
unused_columns = ["ID", "ZIP Code"]
df_v1 = df.drop(columns=unused_columns)
df_v1.columns

In [None]:
# Print the dtype / non-null overview of each column, then show the
# summary statistics as the cell's output
df_v1.info()
df_v1.describe()

**Data Types :**

When I looked at the dataset information, I saw that the types of some variables need to be converted to the correct form. For example, the Personal Loan type should be boolean. I will make the necessary changes in the dataset.

In [None]:
# Cast the 0/1 flag columns to booleans and the education level to a categorical
flag_columns = ["Personal Loan", "Securities Account", "CD Account", "Online", "CreditCard"]
for flag_column in flag_columns:
    df_v1[flag_column] = df_v1[flag_column].astype("bool")

df_v1["Education"] = df_v1["Education"].astype("category")

In [None]:
# Re-inspect the column dtypes to confirm the conversions made in the previous cell
df_v1.info()

**Inconsistency :**

In the data description, I saw that there are negative values in the experience column. That is not possible. Therefore, I will set the negative values in the experience column to 0.

After making the changes, I think there are still some outliers in the columns. However, I will not do anything about them in this kernel.

In [None]:
# Negative experience values are not possible, so set them to 0.
# A single .loc assignment is used instead of chained indexing
# (df_v1.Experience[mask] = 0): chained assignment raises SettingWithCopyWarning
# and does not modify the frame when pandas Copy-on-Write is active.
df_v1.loc[df_v1["Experience"] < 0, "Experience"] = 0
df_v1.describe()

**Graphs :** 

The best way to look at the relationships between variables is to draw their graphs. In this part, I will make some graphs that can explain the relations.

In [None]:
# Number of customers per education level, split by personal-loan ownership.
# Variables are passed by keyword: seaborn deprecated positional x/y arguments
# in 0.11, and from 0.12 the only valid positional argument is `data`.
sns.countplot(data=df_v1, x="Education", hue="Personal Loan")
plt.show()

In [None]:
# Correlation heatmap restricted to the columns that describe() reports on
described_columns = df_v1.describe().columns
correlation_matrix = df_v1[described_columns].corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

**Multicollinearity :**

In the heatmap, I saw that most of the variables are not related to each other. However, the Income and CCAvg columns look related. Therefore, when we build a model, we need to select only one of the variables Income and CCAvg.

**Creating the Logistic Regression**

Before creating the model, I need to explain some concepts. In essence, logistic regression and linear regression are not very different: their gradient descent update rules have almost the same form. The main difference between them is the sigmoid function that is applied to the linear output.

Let's explain the hypothesis function. It is h_q(X) = g(Q.T * X), where g is the sigmoid (logistic) function. We want h_q(x) to be between 0 and 1, and the sigmoid function gives us this result.

    Sigmoid Function :
        g(z) = 1 / (1 + e^(-z))
    
    h_q(x) = P(y=1|x;Q) that is probability that y = 1, given X, parameterized by Q.

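We have explained the main concept. Now, let's look at the cost function. For logistic regression it is the cross-entropy (log-loss), which is also what the code below computes:

J(Q) = -(1 / m) * sum(y * log(h_q(x)) + (1 - y) * log(1 - h_q(x))) where m is the size of the dataset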

When we use the gradient descent algorithm, we need to take the derivative of the cost function and, after that, update all the parameters simultaneously.

    The simplified version of the algorithm : 
        Q_j = Q_j - alpha * sum((h_q(x) - y) * x)


In [None]:
# Pick the model variables: Income is the single feature, Personal Loan is the target
X = df_v1["Income"]
y = df_v1["Personal Loan"]

# Scale the feature by its range (max - min) so gradient descent behaves better.
# Only a division is applied: the minimum is not subtracted first.
income_range = X.max() - X.min()
x_scale = X / income_range
x_scale.describe()

In [None]:
# Batch gradient descent for a one-feature logistic regression:
#   h_q(x) = sigmoid(constant + coef_1 * x)
# Both parameters start at 0 and are updated `number_iter` times with step `alpha`.
constant = 0
coef_1 = 0
alpha = 0.001
number_iter = 10

# Cost recorded at every iteration, used afterwards to plot the convergence
costs_list = []
for _ in range(number_iter):
    # Linear predictor followed by the sigmoid
    z = constant + coef_1 * x_scale
    h_q = 1 / (1 + np.exp(-z))

    # Log-loss of the parameters that produced h_q (i.e. before this step's update)
    cost = -(y * np.log(h_q) + (1 - y) * np.log(1 - h_q)).sum() / y.size
    costs_list.append(cost)

    # Simultaneous update: both gradients are built from the same residual.
    # Vectorised .sum() replaces the builtin sum(), which walks the Series
    # element by element in Python.
    residual = h_q - y
    constant = constant - alpha * residual.sum()
    coef_1 = coef_1 - alpha * (residual * x_scale).sum()

In [None]:
# Plot the cost recorded at each gradient-descent iteration.
# x and y are passed by keyword: seaborn deprecated positional x/y arguments
# in 0.11, and from 0.12 the only valid positional argument is `data`.
iterations = list(range(len(costs_list)))
ax = sns.lineplot(x=iterations, y=costs_list, marker="o")
ax.set(xlabel="Iteration", ylabel="Cost")
plt.show()

print("\nThe last value of the cost is :{:.3f}".format(costs_list[-1]))

Let's check our results against the sklearn library.

In [None]:
# Turn the scaled feature and the target into (n_samples, 1) column arrays for sklearn
x_scale_reshape = x_scale.to_numpy().reshape(-1, 1)
y_reshape = y.to_numpy().reshape(-1, 1)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit sklearn's logistic regression on the same scaled feature as a reference.
# NOTE(review): sklearn's default settings include regularisation, so its parameters
# and cost are not expected to match the hand-written gradient descent exactly - confirm.
log_reg = LogisticRegression()

# fit() expects a 1-D target; passing the (n, 1) column vector triggers a
# DataConversionWarning, so the target is flattened here.
log_reg.fit(x_scale_reshape, y_reshape.ravel())

print("Logistic regression coefficient parameter : {:.2f}".format(log_reg.coef_[0][0]))
print("\nLogistic regression constant parameter : {:.2f}".format(log_reg.intercept_[0]))

# Predicted probability of the positive class (second column of predict_proba)
h_q = log_reg.predict_proba(x_scale_reshape)[:, 1]

# Same log-loss formula as in the gradient-descent cell, computed with a
# vectorised sum instead of the builtin sum()
log_loss_terms = y * np.log(h_q) + (1 - y) * np.log(1 - h_q)
cost = -log_loss_terms.sum() / y.size

print("Cost of the sklearn result : {:.3f}".format(cost))