In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
plt.style.use(['fivethirtyeight'])

%matplotlib inline

#EDA
#pip install -U pandas-profiling[notebook]
from pandas_profiling import ProfileReport
# how to use it
#profile = ProfileReport(df, title='Pandas Profiling Report')


#default theme
# NOTE(review): this runs after plt.style.use(['fivethirtyeight']) earlier in this cell;
# sns.set() also writes matplotlib rcParams, so it presumably overrides that style —
# confirm which of the two looks is intended.
sns.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=False, rc=None)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#Cross-validation
from sklearn.model_selection import StratifiedKFold, KFold

#Ensembling
from sklearn.ensemble import VotingClassifier
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform


# <span style="color:green"> Objective: </span>
### <span style="color:green"> Predict which of the customers will have their loan approved. </span>

### Let's get the data

In [None]:
# Read the loan-prediction train and test CSV files into DataFrames.
train_csv = r"../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"
test_csv = r"../input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### Briefly check the data

In [None]:
# DataFrame.size is rows * columns, which hides the actual dimensions;
# report (rows, columns) for each frame instead.
print("train:", train.shape, "\ntest:", test.shape)

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Print column dtypes and non-null counts for the test frame.
test.info()

#### We have up to 7! object type features. We will have to deal with them later.

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test.head(10)

In [None]:
# Summary statistics for every training column, numeric and non-numeric alike (include="all").
train.describe(include="all")

In [None]:
# Summary statistics for every test column, numeric and non-numeric alike (include="all").
test.describe(include= "all")

## (quick) Data Visualization

In [None]:
# ProfileReport is imported in the first cell (under the "#EDA" heading).
# Build the profiling report for the training frame; leaving `profile` as the
# cell's last expression displays it in the notebook output.
profile = ProfileReport(train, title = "Train data")
profile

The great thing about this tool is that it quickly lets you see all the analysis we did before.

It presents it to you in a simple and easy-to-read way. Of course, it gives you an overall report, not a specific relationship you can find by making your own graphs. 

In [None]:
# Number of missing (NaN) values per column in the training frame.
train.isna().sum()

In [None]:
# Number of missing (NaN) values per column in the test frame.
test.isna().sum()

#### Okay! There are some NaN values and some object-type columns.  That just means one thing:

## Data cleaning!

### Strings

From the `.head()` output, we can see four string columns: Gender, Married, Education and Property_Area. Let's check their values.

In [None]:
# Category frequencies for four of the string columns in the training data.
# Each label is printed on its own line so the value_counts() table that
# follows stays aligned (previously only "Gender:" was followed by a newline).
print("TRAIN DATA")
for column in ["Gender", "Married", "Education", "Property_Area"]:
    print(f"\n{column}:\n{train[column].value_counts()}")

In [None]:
# Category frequencies for four of the string columns in the test data.
# Each label is printed on its own line so the value_counts() table that
# follows stays aligned (previously only "Gender:" was followed by a newline).
print("TEST DATA")
for column in ["Gender", "Married", "Education", "Property_Area"]:
    print(f"\n{column}:\n{test[column].value_counts()}")

We will deal with them with the excellent code from Yonatan Rabinovich, on his "Loan Prediction Dataset ML Project" notebook.


check it here: https://www.kaggle.com/yonatanrabinovich/loan-prediction-dataset-ml-project

We could use pandas' `pd.get_dummies` to create dummies out of the categorical values. We may make a comparison between that method and assigning a number to the categories.

In [None]:
# Convert the categorical string values to numbers.
# Every cell whose value is a key of `to_numeric` is replaced by the mapped
# number; every other cell (numbers, NaN, unmapped strings) is left untouched.
to_numeric = {'Male': 1, 'Female': 2,
'Yes': 1, 'No': 2,
'Graduate': 1, 'Not Graduate': 2,
'Urban': 3, 'Semiurban': 2,'Rural': 1,
'Y': 1, 'N': 0,
'3+': 3}

# Apply the mapping cell by cell to both datasets.
train = train.applymap(lambda label: to_numeric.get(label, label))
test = test.applymap(lambda label: to_numeric.get(label, label))

# Dependents still holds digit strings next to the mapped 3, so cast it to a
# numeric dtype. The column is dropped and re-added, which places it last in
# both frames (the same position the previous drop + concat produced).
train = train.drop(columns="Dependents").assign(Dependents=pd.to_numeric(train["Dependents"]))
test = test.drop(columns="Dependents").assign(Dependents=pd.to_numeric(test["Dependents"]))

# Validate the manipulated datasets. DataFrame.info() prints its report itself
# and returns None, so it is called directly instead of being passed to print().
print(f"training set (row, col): {train.shape}\n\ntesting set (row, col): {test.shape}\n")
train.info()
print("\n")
test.info()

### Nan

For this, we can: 

1. Get rid of the corresponding nan values.
2. Get rid of the whole feature.
3. Set the values to some value (zero, the mean, the median, etc.).

Remember:
    
    Theoretically, 25 to 30% is the maximum share of missing values allowed; beyond that, we might want to drop the variable from the analysis. 
    
 In this case it is not a problem, but it is a nice reminder.

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill the gaps in the numeric training columns with each column's median.
numeric_cols = ["LoanAmount", "Loan_Amount_Term", "Credit_History"]
for_numeric = SimpleImputer(strategy='median')
train[numeric_cols] = pd.DataFrame(
    for_numeric.fit_transform(train[numeric_cols]),
    columns=numeric_cols,
)

In [None]:
# Re-check the per-column NaN counts after the median imputation.
train.isna().sum()

In [None]:
# Fill the gaps in the categorical training columns with each column's most frequent value.
categorical_cols = ["Gender", 'Married', "Dependents", "Self_Employed"]
for_object = SimpleImputer(strategy="most_frequent")
train[categorical_cols] = pd.DataFrame(
    for_object.fit_transform(train[categorical_cols]),
    columns=categorical_cols,
)

In [None]:
# Re-check the per-column NaN counts after the most-frequent imputation.
train.isna().sum()

Now, the same but with the TEST dataset

In [None]:
# Fill the numeric gaps in the test frame with the medians learned from the
# training data: `for_numeric` was fitted on `train` in an earlier cell, so it
# is only applied here (transform) rather than re-fitted on the test set.
numeric_cols = ["LoanAmount", "Loan_Amount_Term", "Credit_History"]
test[numeric_cols] = pd.DataFrame(
    for_numeric.transform(test[numeric_cols]),
    columns=numeric_cols,
    index=test.index,  # keep row alignment whatever index `test` carries
)

In [None]:
# Fill the categorical gaps in the test frame with the most frequent values
# learned from the training data: `for_object` was fitted on `train` in an
# earlier cell, so it is only applied here (transform) rather than re-fitted.
categorical_cols = ["Gender", 'Married', "Dependents", "Self_Employed"]
test[categorical_cols] = pd.DataFrame(
    for_object.transform(test[categorical_cols]),
    columns=categorical_cols,
    index=test.index,  # keep row alignment whatever index `test` carries
)

In [None]:
# Re-check the per-column NaN counts of the test frame after imputation.
test.isna().sum()

In [None]:
# "Loan_ID" is not needed for modelling, so remove that column from the training frame.
train = train.drop(columns=["Loan_ID"])
train

In [None]:
# "Loan_ID" is not needed for modelling, so remove that column from the test frame.
test = test.drop(columns=["Loan_ID"])
test

#### okay, we are set, we can go to the 
## Models!

In [None]:
# Feature matrix: every training column except the target "Loan_Status".
X = train.drop(columns=["Loan_Status"])
X

In [None]:
# Target vector. Selecting with a single label gives a 1-D Series, which is
# the shape classifiers expect for y; the double-bracket form produced an
# (n, 1) DataFrame instead.
y = train["Loan_Status"]
y

In [None]:
#Divide the train data set into train and test to teach and test the models
# NOTE(review): test_size=0.03 holds out only 3% of the rows, so every score computed
# on x_test / y_test in later cells rests on very few samples — confirm this was not
# meant to be 0.3.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size =0.03, random_state = 42)

Let's try many different models first

In [None]:
# Candidate models for the randomized search. `clfs` is a list of
# (estimator, hyperparameter search space) pairs; four entries are active and
# two more are kept commented out.

knn_space = {
    'n_neighbors': [5, 6, 7],
    'weights': ['uniform', 'distance'],
}

# NOTE(review): 'metric_period' looks like a CatBoost option rather than an
# XGBoost one, and its candidate list repeats 50 — confirm it has any effect.
xgb_space = {
    'learning_rate': [0.002, 0.001, 0.01],
    'max_depth': [5, 10, 15, 20],
    'n_estimators': [7000, 6500, 7500],
    'reg_alpha': [0.9, 0.8, 1],
    'reg_lambda': [0.9, 0.8, 1],
    'subsample': [0.9, 0.8, 1],
    'metric_period': [50, 100, 50],
}

# NOTE(review): 'num_iteration' appears to be a LightGBM alias for the number
# of boosting rounds, which 'n_estimators' also controls — confirm which of
# the two is meant to apply.
lgbm_space = {
    'learning_rate': [0.002, 0.0045, 0.02],
    'num_iteration': [10000, 9000, 11000],
    'n_estimators': [50, 100, 150, 200],
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'lambda_l1': [4.6, 5, 6],
    'lambda_l2': [1.9, 2, 3],
    'num_leaves': [50, 102, 150],
    'min_child_samples': [10, 20, 30],
}

mlp_space = {
    'hidden_layer_sizes': [50, 100, 200],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.002, 0.0001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.002, 0.005, 0.01, 0.1],
    'max_iter': [100, 500, 1000],
    'momentum': [0.7, 0.64, 0.8, 0.9],
}

clfs = [
    (KNeighborsClassifier(n_jobs=-1), knn_space),
    (XGBClassifier(n_jobs=-1, random_state=42), xgb_space),
    (LGBMClassifier(n_jobs=-1, random_state=42), lgbm_space),
    # Left out for time reasons:
    # (LogisticRegression(random_state=0),
    #  {'C': np.arange(0.1, 1.1, 0.1),
    #   'penalty': ['l1', 'l2']}),
    # (RandomForestClassifier(random_state=0),
    #  {'n_estimators': [100, 200, 300],
    #   'max_depth': [3, 4, 5],
    #   'max_features': (np.arange(0.5, 1.0, 0.1))}),
    (MLPClassifier(random_state=42), mlp_space),
]

In [None]:
# Stratified 5-fold cross-validation for the hyperparameter search.
# random_state only has a meaning when the rows are shuffled before splitting,
# and scikit-learn rejects a seed passed with shuffle=False, so shuffling is
# enabled to make the seeded split valid and reproducible.
stra = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Run a randomized hyperparameter search for every (estimator, search space)
# pair in `clfs` and collect (model name, best parameters, hold-out ROC AUC)
# tuples in `clfs_tuned`.
N_ITER_DEFAULT = 1  # parameter combinations sampled per model
N_ITER_MLP = 1      # MLP takes more time to compute, so it gets its own budget

clfs_tuned = []
for clf, param_grid in tqdm(clfs):
    start = time.time()
    clf_name = type(clf).__name__
    # Choose the budget from the estimator's class name; testing the whole
    # `clfs` list against a list of names is always False.
    iterations = N_ITER_MLP if clf_name == 'MLPClassifier' else N_ITER_DEFAULT
    rand_search = RandomizedSearchCV(clf, param_grid, n_iter=iterations, random_state=42,
                                     scoring='roc_auc', return_train_score=True,
                                     cv=stra, n_jobs=-1)
    rand_search.fit(x_train, y_train)
    # score() uses the search's `scoring`, i.e. ROC AUC on the held-out rows.
    clf_score = rand_search.score(x_test, y_test)
    print('{:30s} {:30f} {:.1f}'.format(clf_name, clf_score, time.time() - start))
    # Store the model name, its best hyperparameters and its score.
    clfs_tuned.append((clf_name, rand_search.best_params_, clf_score))

In [None]:
# The results table holds long parameter dictionaries, so lift pandas' display
# limits: show every column and do not truncate cell contents.
pd.set_option('display.max_columns', None)
# None means "no limit"; a negative width is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
# One row per tuned model; name the columns after the tuple fields stored in
# `clfs_tuned` (the score is the ROC AUC measured on x_test / y_test).
pd.DataFrame(clfs_tuned, columns=["model", "best_params", "test_roc_auc"])

    Now that we know the best models, we can construct and fit the best model
    

In [None]:
# Refit the selected XGBoost configuration on the training split and predict
# the held-out rows. 'metric_period' is left out because it is not an XGBoost
# parameter; n_jobs and random_state match the XGBClassifier used in the search.
best_model = XGBClassifier(subsample=1, reg_lambda=0.8, reg_alpha=1, n_estimators=6500,
                           max_depth=20, learning_rate=0.002,
                           n_jobs=-1, random_state=42)
best_model.fit(x_train, y_train)
prediction = best_model.predict(x_test)

#### Again, I found the kernel from Rabinovich quite useful with this little piece of code that I didn't know of:

In [None]:
# Per-class report for the hold-out predictions, followed by the overall accuracy.
print(classification_report(y_test, prediction))
XGB_report = accuracy_score(y_test, prediction)
accuracy_pct = round(XGB_report*100,2)
print(f"{accuracy_pct}% Accurate")

#### Thanks for this small piece of code to get a classification report and accuracy so easily!

## Check Feature importance

In [None]:
# Depth-5 decision tree fitted on the training split; the cell output is its
# accuracy on the held-out rows.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
# Train vs. hold-out accuracy of a decision tree for depths 1..9.
# Each depth is fitted once and scored on both splits (the seed is fixed, so
# this gives the same scores as fitting a separate tree per split).
x = range(1,10)
fitted_trees = [DecisionTreeClassifier(max_depth=i, random_state=42).fit(x_train, y_train) for i in x]
y1 = [tree.score(x_train, y_train) for tree in fitted_trees]
y2 = [tree.score(x_test, y_test) for tree in fitted_trees]
plt.plot(x,y1,label='train')
plt.plot(x,y2,label='test')
plt.title('Decision tree accuracy by depth')
plt.xlabel('depth of tree')
plt.ylabel('accuracy')
plt.legend()
plt.show()

### Now, let's use the .feature_importances_ attribute

In [None]:
# Bar chart of the decision tree's feature importances.
# The labels come from x_train, the frame `clf` was fitted on, so each bar is
# paired with its own feature. train.columns[1:] still contains "Loan_Status"
# and is shifted by one position relative to the fitted columns.
plt.figure(figsize=(10,10))
sns.barplot(x=clf.feature_importances_, y=list(x_train.columns))
plt.xlabel('importance')
plt.ylabel('feature')
plt.show()

## It appears we are doing some over-fitting with our model. There are many ways to correct this:

### 1.Cross-validation (did it) ✔
### 2.Train with more data (That is true. Our dataset doesn't have that much data) ✔
### 3.Remove features by checking relevance. (Done it. We observed that Property area seems to be the most relevant feature)  
### 4.Early Stopping (This could be done with XGboost with the parameter early_stopping_rounds= #)
### 5.Regularization (tuning hyperparameters. Done it with Randomized Search CV)
### 6.Ensembling (Will update this later)

### For now, we have an almost 90% accuracy model. 

## Export

In [None]:
# Write one row per hold-out sample with its true label and the predicted label.
# np.ravel flattens y_test whether it is a Series or a single-column frame;
# wrapping both objects in a one-element list produced a single row whose
# cells held the whole objects instead of the individual values.
output = pd.DataFrame({'y_test': np.ravel(y_test), 'prediction': np.ravel(prediction)})
output.to_csv('prediction.csv', index=False)

# <span style="color: green"> NOTE: </span>
    
I will update the notebook to give a more complete analysis of the dataset. 

For now, I hope you learned to do some easy Exploratory Data Analysis and Randomized Search with multiple models at once. 

If you found the notebook useful, please upvote.

And if you have any questions or find any errors, let me know! Thanks for reading!!