#### Import essential libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

#### Read data from csv

In [None]:
# Relative path of the source CSV file.
DATA_PATH = '../input/bank-personal-loan-modelling/Bank_Personal_Loan_Modelling.csv'
# Load the whole file into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(DATA_PATH)

#### Check for occurrences of null values

In [None]:
# Print column dtypes and non-null counts.
df.info()
# Number of missing values in each column (displayed as the cell output).
df.isna().sum()

In [None]:
# Look at ten randomly drawn rows to get a feel for the raw values.
df.sample(n=10)

#### Check distribution of data

In [None]:
# Row count for each value of the target column.
# Only a few people (around 9.6%) accepted the personal loan in the past year.
df.groupby('Personal Loan').size()

In [None]:
# Bar chart of the target class counts.
# The column is named through x=/data= keywords: passing the Series as the
# first positional argument is not interpreted as `x` by newer seaborn
# releases, so the keyword form is the one that works across versions.
sns.countplot(x='Personal Loan', data=df);

In [None]:
# Summary statistics, transposed so that each original column becomes one row.
df.describe().T

In [None]:
# Experience contains negative entries, which are not valid values;
# replace every value with its magnitude.
df['Experience'] = abs(df['Experience'])

#### Data analysis and insights
1. Age groups have approximately the same size.
2. The Experience feature has -3 as its minimum value, which is invalid.
3. Maximum people have income in the range between $98 and $234. Q4 is the biggest quantile.
4. There is a high standard deviation for the Income feature.
5. Very low expenditure from credit cards is clearly visible.
6. Education is a categorical variable.
7. Mortgage is highly right-skewed.
8. Almost 60% of users use online banking.
9. Almost 30% of users use a credit card from Universal bank.
10. Very low number of CD account holders.


In [None]:
# Pairwise correlation between the columns of the data set.
correlations = df.corr()
correlations.T

#### Analysis from correlations
1. Usage of credit card is highly positively correlated to the Income of a person (64%).
2. Whether a person will take a personal loan or not is also highly positively correlated to the Income of the person (50%), credit card expenditure per month (37%), and whether or not they have a CD account with the bank (32%).
3. We can remove ZIP Code and ID.
4. Bigger families tend to have lower expenditure from credit cards and lower income.
5. Certificate of deposit accounts tend to be more common if the person has a Securities account with the bank.
6. Having a credit card from Universal bank is related to whether the person has a CD account with Thara Bank.
7. We can remove the ID, Age, Experience, ZIP Code, Family, Securities Account, Online, and Credit Card columns as these have very low correlations.


#### Visualizing data using seaborn

In [None]:
# ID and ZIP Code are identifiers without predictive value, so they are removed
# before plotting and modelling.
# The frame is rebound instead of mutated in place, and errors='ignore' keeps the
# cell re-runnable: a second execution no longer fails with a KeyError because
# the columns are already gone.
df = df.drop(columns=['ID', 'ZIP Code'], errors='ignore')
# Pairwise scatter plots of the remaining columns (lower triangle only).
sns.pairplot(df, corner=True);

In [None]:
# Count plots for four discrete columns on a 2x2 grid.
# Columns are passed via x=/data= keywords; handing the Series over as the first
# positional argument is not interpreted as `x` by newer seaborn releases.
fig, axes = plt.subplots(2, 2, figsize=(14, 7))
categorical_columns = ['Family', 'Education', 'Securities Account', 'CD Account']
# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for column, ax in zip(categorical_columns, axes.flat):
    sns.countplot(x=column, data=df, ax=ax)
fig.tight_layout()

In [None]:
# Distribution plots for four numeric columns on a 2x2 grid, filled row by row.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the minimum seaborn version is confirmed.
fig, axes = plt.subplots(2, 2, figsize=(14, 7))
for column, ax in zip(('Income', 'Mortgage', 'Age', 'Experience'), axes.ravel()):
    sns.distplot(df[column], ax=ax)
fig.tight_layout()

#### Split data in test and training

In [None]:

# Feature matrix (everything except the target) and target vector.
X = df.drop(columns=['Personal Loan']).copy()
y = df['Personal Loan']
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=1
)

#### Scaling Data

In [None]:

# Min-max scale the features.
# The scaler learns its per-column min/max from the training data only and the
# same transformation is then applied to the validation data. Re-fitting on the
# validation set would scale the two sets with different ranges and leak
# validation statistics into preprocessing.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_valid = scaler.transform(X_valid)

## Modeling Section

## Helper Functions

In [None]:
def confusion_heatmap_metrics(predictions):
    """Plot the confusion matrix of `predictions` against `y_valid` and print metrics.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels (0/1) for the validation rows, in the same
        order as the module-level `y_valid`.

    The matrix is built with the positive class (1, "Accept") first on both
    axes. It is drawn as an annotated heatmap and then handed to
    `all_metrics`, which prints the summary scores.
    """
    # `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
    df_table = confusion_matrix(y_valid, predictions, labels=[1, 0])
    # Cells are integer counts, so they are annotated as integers.
    sns.heatmap(df_table, annot=True, fmt='d', xticklabels=['Accept', 'Reject'], yticklabels=['Accept', 'Reject'])
    # scikit-learn puts the true classes on the rows (y axis) and the
    # predicted classes on the columns (x axis).
    plt.ylabel('Actual Values')
    plt.xlabel('Predicted Values')
    plt.show()
    all_metrics(df_table)
    
def all_metrics(df_table):
    """Print accuracy, precision, recall and F1 for a 2x2 confusion matrix.

    Parameters
    ----------
    df_table : 2x2 array-like supporting `table[i, j]` indexing
        Confusion matrix indexed as [actual, predicted] with the positive
        class first on both axes, i.e. the layout produced by
        `confusion_matrix(y_true, y_pred, labels=[1, 0])`:

            [[TP, FN],
             [FP, TN]]

    Any score whose denominator is zero is reported as 0.0 instead of
    producing NaN or a division error.
    """
    # Row = actual class, column = predicted class.
    TP = float(df_table[0, 0])
    FN = float(df_table[0, 1])  # actual positive, predicted negative
    FP = float(df_table[1, 0])  # actual negative, predicted positive
    TN = float(df_table[1, 1])

    def _ratio(numerator, denominator):
        # Safe division: an empty denominator yields 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(TP + TN, TP + FN + FP + TN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall: 2PR/(P+R) = 2TP/(2TP+FP+FN).
    f1 = _ratio(2 * TP, 2 * TP + FP + FN)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1: {}'.format(accuracy,precision,recall,f1))
    print('\n')

### Random Forest

In [None]:
#Model: RandomForest

# Fixed seed so repeated runs build the same forest.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)
# The regressor outputs fractional values between the 0/1 labels.
rf_scores = rf_model.predict(X_valid)
mae = mean_absolute_error(y_valid, rf_scores)
# Report the regression diagnostics instead of computing and discarding them.
print('Random Forest MAE: {}, R^2 score: {}'.format(mae, rf_model.score(X_valid, y_valid)))
# Convert the fractional outputs to class labels with a 0.5 threshold.
# A plain astype(int) truncates, turning every value below 1.0 into class 0.
predictions = (rf_scores >= 0.5).astype(int)
confusion_heatmap_metrics(predictions)

### Logistic Regression

In [None]:
# Model: Logistic Regression

# Baseline: fit on the raw (unscaled) features.
lr_model = LogisticRegression(max_iter=5000)
lr_model.fit(X_train, y_train)
lr_predicts = lr_model.predict(X_valid)
print('Logistic Regression score with training data :{} '.format(lr_model.score(X_train,y_train)))
print('Logistic Regression score with test data :{} '.format(lr_model.score(X_valid,y_valid)))
confusion_heatmap_metrics(lr_predicts)

# Same model family fitted on the min-max scaled features.
# Predictions and scores must come from sc_lr_model: evaluating the unscaled
# model (lr_model) on scaled inputs gives meaningless numbers.
sc_lr_model = LogisticRegression(max_iter=5000)
sc_lr_model.fit(scaled_X_train, y_train)
sc_lr_predicts = sc_lr_model.predict(scaled_X_valid)
print('Logistic Regression score with scaled training data :{} '.format(sc_lr_model.score(scaled_X_train,y_train)))
print('Logistic Regression score with scaled test data :{} '.format(sc_lr_model.score(scaled_X_valid,y_valid)))
confusion_heatmap_metrics(sc_lr_predicts)

### KNN

In [None]:
# Model: KNN
def knn_model(n):
    """Fit a k-nearest-neighbours classifier with `n` neighbours and return
    its accuracy on the validation split.

    Uses the module-level X_train / y_train for fitting and
    X_valid / y_valid for scoring.
    """
    KNN_model = KNeighborsClassifier(n_neighbors=n)
    KNN_model.fit(X_train, y_train)
    # Only the validation accuracy is needed; the training-set score was never
    # used and cost an extra pass over the training data for every k.
    return KNN_model.score(X_valid, y_valid)


# Candidate neighbour counts: odd values from 1 to 29.
neighbors = list(np.arange(1, 30, 2))
# Despite the name, MSE holds the misclassification error (1 - accuracy) for
# each candidate k; the name is kept because later cells read it.
MSE = [1 - knn_model(k) for k in neighbors]
    

In [None]:
# Plot the misclassification error for every candidate k, then report the k
# with the lowest error (the first one if several tie).
plt.plot(neighbors, MSE)
lowest_error = min(MSE)
best_neighbors_val = neighbors[MSE.index(lowest_error)]
print('Best Neighbor is', best_neighbors_val)

In [None]:
# Refit KNN with the selected neighbour count and evaluate it on the validation split.
best_KNN_model = KNeighborsClassifier(n_neighbors=best_neighbors_val).fit(X_train, y_train)
knn_valid_accuracy = best_KNN_model.score(X_valid, y_valid)
predictions = best_KNN_model.predict(X_valid)
print('KNN modle score: ', knn_valid_accuracy)
confusion_heatmap_metrics(predictions)

### Naive Bayes

In [None]:
# Model: Multinomial Naive Bayes

In [None]:
# Fit multinomial Naive Bayes on the unscaled training features.
nb_model = MultinomialNB().fit(X_train, y_train)
# Accuracy on the training and validation splits.
train_score, test_score = (
    nb_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)
print('MultinomialNB score with training data :{} '.format(train_score))
print('MultinomialNB score with test data :{} '.format(test_score))
# Confusion matrix and derived metrics for the validation predictions.
nb_predictions = nb_model.predict(X_valid)
confusion_heatmap_metrics(nb_predictions)

#### Conclusion

1. Which model performs better?
    For this dataset (personal loan classifier), Logistic Regression performs better than the other models given in the question.
2. Why does this model perform better?
    Logistic Regression is mostly used for classification problems in which the outcome is binary.