In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline 
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import xgboost as xgb


In [None]:
# Load the two CSV files from the relative ../input directory. df_train carries the
# Loan_Status label used below; df_test is not referenced again in the cells shown here.
df_train=pd.read_csv(r'../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')
df_test= pd.read_csv(r'../input/loan-prediction-problem-dataset/test_Y3wMUE5_7gLdaTN.csv')
                     

In [None]:
df_train.head()

In [None]:
df_train.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 10-bin histogram per numeric column, drawn on a single large figure.
df_train.hist(bins=10, figsize=(20,15))
plt.show()


There are a bunch of outliers that need to be taken care of.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the labelled rows, keeping the Loan_Status class proportions
# the same in both parts. The seed is fixed so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df_train, df_train['Loan_Status']))

# split() returns *positional* row numbers, so select with .iloc; .loc would
# interpret them as index labels and pick wrong rows on a non-default index.
strat_train_set = df_train.iloc[train_index]
strat_test_set = df_train.iloc[test_index]

In [None]:
strat_test_set['Loan_Status'].value_counts()/ len(strat_test_set)

In [None]:
df_train['Loan_Status'].value_counts()/len(df_train)

In [None]:
# Scatter of loan amount against applicant income, coloured by loan outcome.
grid = sns.FacetGrid(df_train, hue="Loan_Status", height=5)
grid.map(plt.scatter, "ApplicantIncome", "LoanAmount")
grid.add_legend()
plt.show()

In [None]:
sns.set(style="white")

# Correlate only the numeric/boolean columns: non-numeric columns cannot be
# correlated and make DataFrame.corr() raise on recent pandas versions.
corr = df_train.select_dtypes(include=["number", "bool"]).corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(15, 10))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

ax.set_title('Correlation Matrix', fontsize=18)

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True,
            ax=ax)

plt.show()

In [None]:
df_train.columns

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the two income columns and the loan amount.
attributes = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',]
scatter_matrix(df_train[attributes], figsize=(10, 6))


In [None]:
strat_train_set.isnull().sum()

In [None]:
strat_train_set.dtypes

In [None]:
# Separate the label from the features. The label is copied so that later
# in-place edits to y_train cannot reach strat_train_set.
y_train = strat_train_set['Loan_Status'].copy()
X_train = strat_train_set.drop(columns="Loan_Status")

In [None]:
df_train['Loan_Amount_Term'].value_counts()

In [None]:
# Bucket the loan term into four intervals delimited by 6/119/239/359/480.
# Terms outside those edges, and missing terms, end up as NaN in the new column.
X_train['Loan_Amount_Term_Cat'] = pd.cut(x=X_train['Loan_Amount_Term'], bins=[6,119,239,359,480])

In [None]:
X_train['Loan_Amount_Term_Cat'].value_counts()

In [None]:
X_train.head()

In [None]:
X_train.columns

In [None]:
# Split the features into the continuous columns and the columns handled as
# categories. Credit_History is grouped with the categories, and the raw
# Loan_Amount_Term is in neither list (only its binned version is kept).
X_train_Num = X_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']]
X_train_Cat = X_train[['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Credit_History', 'Property_Area',
       'Loan_Amount_Term_Cat']]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Learn the medians and scaling statistics from the training features and apply them.
X_train_Num_tr = num_pipeline.fit_transform(X=X_train_Num)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder used by the pipeline below (previously this instance was
# created but never used, and the pipeline built a second one).
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising, so the fitted encoder can be applied to other data.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 — switch
# the keyword if the environment is upgraded.
cat_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Categorical preprocessing: fill gaps with the most frequent value, then one-hot encode.
Cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('cat_encoder', cat_encoder),
     ])
X_train_cat_tr = Cat_pipeline.fit_transform(X_train_Cat)

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each sub-pipeline.
num_attribs = X_train_Num.columns.tolist()
cat_attribs = X_train_Cat.columns.tolist()

# Run the numeric and categorical pipelines on their own columns and
# concatenate the results into one feature matrix.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", Cat_pipeline, cat_attribs),
])

X_train_prepared = full_pipeline.fit_transform(X_train)

In [None]:
X_train_prepared.shape

In [None]:
# Encode the label in place: "Y" -> 1, "N" -> 0.
y_train.replace({"Y": 1, "N": 0}, inplace=True)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a 100-tree random forest with a fixed seed, fitted on the
# prepared training matrix.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train_prepared, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# In-sample check: score the baseline forest on the same rows it was fitted on.
p1 = forest_clf.predict(X_train_prepared)
print(confusion_matrix(y_train, p1))
print(f"Accuracy Score {accuracy_score(y_train, p1)}")
print(f"Classification report: {classification_report(y_train, p1)}")

## Preparing Test Data

In [None]:
# Separate label and features for the held-out split; the label is copied so it
# can be re-encoded without touching strat_test_set.
y_test = strat_test_set['Loan_Status'].copy()
X_test = strat_test_set.drop(columns="Loan_Status")

In [None]:
# Same loan-term bucketing as for the training features (identical bin edges),
# so both splits share one set of interval categories.
X_test['Loan_Amount_Term_Cat'] = pd.cut(x=X_test['Loan_Amount_Term'], bins=[6,119,239,359,480])

In [None]:
# Same numeric / categorical column grouping as used for the training features.
X_test_Num = X_test[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']]
X_test_Cat = X_test[['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed','Credit_History', 'Property_Area',
       'Loan_Amount_Term_Cat']]

In [None]:
# Reuse the numeric pipeline fitted on the training split. Fitting a fresh
# imputer/scaler here would take the median, mean and standard deviation from the
# held-out rows, so the model would be evaluated on differently scaled inputs.
num_pipeline2 = num_pipeline

# transform only: nothing is learned from the held-out rows.
X_test_Num_tr = num_pipeline2.transform(X_test_Num)

In [None]:
# Reuse the categorical pipeline fitted on the training split. A freshly fitted
# encoder would derive its category list from the held-out rows, so the one-hot
# columns could differ in number or order from what the model was trained on.
Cat_pipeline2 = Cat_pipeline

# transform only. With OneHotEncoder's default handle_unknown="error" this raises
# if the held-out rows contain a category that was absent during fit.
X_test_cat_tr = Cat_pipeline2.transform(X_test_Cat)

In [None]:
# Apply the ColumnTransformer that was fitted on the training split instead of
# building and fitting a new one on the held-out rows. This keeps the imputation
# values, scaling statistics and one-hot column layout identical to what the models
# were trained on, and no longer overwrites the train-fitted `full_pipeline`.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
X_test_prepared.shape

In [None]:
X_test.shape

In [None]:
# Encode the held-out label in place with the same mapping as the training label.
y_test.replace({"Y": 1, "N": 0}, inplace=True)

_____________________________________________________________________________________________

In [None]:
# Score the baseline forest on the held-out split.
p2 = forest_clf.predict(X_test_prepared)
print(confusion_matrix(y_test, p2))
print(f"Accuracy Score {accuracy_score(y_test, p2)}")
print(f"Classification report: {classification_report(y_test, p2)}")

## Randomized SearchCV

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is left out: for RandomForestClassifier it was an alias of 'sqrt' (so it
# only duplicated a candidate) and scikit-learn 1.3 removed it.
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

In [None]:
# Randomised search: 100 sampled configurations from random_grid, each scored
# with 3-fold cross-validation, using all available cores.
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=100,
    n_jobs=-1,
)
### fit the randomized model
rf_randomcv.fit(X_train_prepared, y_train)

In [None]:
rf_randomcv.best_params_

In [None]:
rf_randomcv.best_estimator_

In [None]:
best_random_grid=rf_randomcv.best_estimator_

In [None]:
# Score the best estimator from the randomised search on the held-out split.
p3 = best_random_grid.predict(X_test_prepared)
print(confusion_matrix(y_test, p3))
print(f"Accuracy Score {accuracy_score(y_test, p3)}")
print(f"Classification report: {classification_report(y_test, p3)}")

#### We see an increase of 5% in accuracy! Let's take a range around the best parameters, feed it into GridSearchCV, and see if it helps.

In [None]:
rf_randomcv.best_params_

## GridSearch CV

In [None]:
## Defining  a range around the best parameters from the Random Search CV
from sklearn.model_selection import GridSearchCV

best_params = rf_randomcv.best_params_


def _values_around(center, offsets, minimum):
    """Return the sorted, de-duplicated candidates ``center + offset`` that are >= ``minimum``."""
    return sorted({center + offset for offset in offsets if center + offset >= minimum})


# Narrow grid centred on the best randomised-search result. Candidates that fall
# below what RandomForestClassifier accepts are dropped instead of being passed
# to GridSearchCV, where every fit using them would fail.
param_grid1 = {
    'criterion': [best_params['criterion']],
    'max_depth': [best_params['max_depth']],
    'max_features': [best_params['max_features']],
    # A leaf needs at least one sample.
    'min_samples_leaf': _values_around(best_params['min_samples_leaf'], (0, 2, 4), minimum=1),
    # An integer min_samples_split must be >= 2; the smallest value searched
    # above is 2, so "- 2" / "- 1" could otherwise yield 0 or 1.
    'min_samples_split': _values_around(best_params['min_samples_split'], (-2, -1, 0, 1, 2), minimum=2),
    # The smallest n_estimators searched above is 200, so "- 200" could
    # otherwise ask for a forest with zero trees.
    'n_estimators': _values_around(best_params['n_estimators'], (-200, -100, 0, 100, 200), minimum=1),
}

print(param_grid1)

In [None]:
# Exhaustive search over param_grid1 with 10-fold cross-validation on all cores.
# The forest is created without random_state, so repeated runs can pick different winners.
rfc=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rfc,param_grid=param_grid1,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train_prepared,y_train)

In [None]:
best_grid=grid_search.best_estimator_

In [None]:
# Score the best estimator from the grid search on the held-out split.
p4 = best_grid.predict(X_test_prepared)
print(confusion_matrix(y_test, p4))
print(f"Accuracy Score {accuracy_score(y_test, p4)}")
print(f"Classification report: {classification_report(y_test, p4)}")

In [None]:
import pickle

# Persist the best grid-search estimator. The context manager closes the file
# (flushing buffered bytes to disk) even if pickling raises; previously the
# handle was left open.
with open('RFbestGridLat.pkl', 'wb') as file:
    pickle.dump(best_grid, file)

 ## Bayesian Optimization HyperOpt

In [None]:

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

# Bare dict literal: it is not assigned to anything, so it is only echoed as the cell output.
# NOTE(review): presumably the best parameters recorded from an earlier search run — confirm.
{'n_estimators': 1400,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': 'sqrt',
 'max_depth': 120,
 'criterion': 'gini'}

In [None]:
# Search space sampled by hyperopt for the random-forest hyperparameters.
# The results of hp.choice entries are decoded later through index-lookup dicts
# (crit / feat / est), so the option order here must stay in sync with those dicts.
# NOTE(review): hp.quniform yields float values (e.g. 120.0) — confirm the estimator
# accepts a float max_depth or cast it to int where it is used.
# NOTE(review): 'auto' was removed from RandomForestClassifier in scikit-learn 1.3 —
# confirm the target version before keeping it as an option.
# min_samples_leaf / min_samples_split are sampled as floats here, unlike the
# integer counts used in the grids above.
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [None]:
space

In [None]:
def objective(space):
    """Score one hyperparameter sample for hyperopt's ``fmin``.

    Parameters
    ----------
    space : dict
        One sampled point with the keys ``criterion``, ``max_depth``,
        ``max_features``, ``min_samples_leaf``, ``min_samples_split`` and
        ``n_estimators``.

    Returns
    -------
    dict
        ``{'loss': -score, 'status': STATUS_OK}`` where ``score`` is the mean
        5-fold cross-validation score of a random forest built from the sample,
        evaluated on ``X_train_prepared`` / ``y_train``.
    """
    # Imported locally so the function does not rely on a later cell having run.
    from sklearn.model_selection import cross_val_score

    model = RandomForestClassifier(criterion = space['criterion'],
                                 # The depth is sampled with hp.quniform, which yields
                                 # floats such as 120.0; tree depth must be an integer.
                                 max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'],
                                 )

    accuracy = cross_val_score(model, X_train_prepared, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score
# Trials collects the record of every evaluation made during the search.
trials = Trials()
# Minimise `objective` over `space` with the TPE algorithm for 80 evaluations.
# The result is decoded through index-lookup dicts in a later cell for the
# hp.choice parameters (criterion, max_features, n_estimators).
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

In [None]:
# Lookup tables that map an option's position back to the option itself, in the
# same order as the hp.choice lists of the search space.
crit = dict(enumerate(['entropy', 'gini']))
feat = dict(enumerate(['auto', 'sqrt', 'log2', None]))
est = dict(enumerate([10, 50, 300, 750, 1200, 1300, 1500]))

# Show the decoded winners, one per line.
print(crit[best['criterion']], feat[best['max_features']], est[best['n_estimators']], sep="\n")

In [None]:
best['min_samples_leaf']

In [None]:
# Refit a forest with the decoded best hyperparameters and evaluate it on the held-out split.
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']],
                                       # The depth comes from an hp.quniform sample, i.e. a
                                       # float such as 120.0; tree depth must be an integer.
                                       max_depth = int(best['max_depth']),
                                       max_features = feat[best['max_features']],
                                       min_samples_leaf = best['min_samples_leaf'],
                                       min_samples_split = best['min_samples_split'],
                                       n_estimators = est[best['n_estimators']]).fit(X_train_prepared,y_train)
predictionforest = trainedforest.predict(X_test_prepared)

# Compute the accuracy once and reuse it for the printout and for acc5.
acc5 = accuracy_score(y_test,predictionforest)
print(confusion_matrix(y_test,predictionforest))
print(acc5)
print(classification_report(y_test,predictionforest))

### Genetic Algorithms
Genetic algorithms try to apply natural selection mechanisms to machine learning contexts.

Let's imagine we create a population of N machine learning models with some predefined hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that perform best). We can now generate some offspring with hyperparameters similar to those of the best models, so that we again get a population of N models. At this point we can again calculate the accuracy of each model and repeat the cycle for a defined number of generations. In this way, only the best models will survive at the end of the process.

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is left out: for RandomForestClassifier it was an alias of 'sqrt' (so it
# only duplicated a candidate) and scikit-learn 1.3 removed it.
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

In [None]:
from tpot import TPOTClassifier

In [None]:
# Evolutionary search with TPOT, restricted through config_dict to
# RandomForestClassifier with the `param` grid: 5 generations, population 24,
# 12 offspring per generation, 4-fold cross-validated accuracy as the score.
# NOTE(review): early_stop=12 is larger than generations=5, so early stopping
# presumably never triggers — confirm the intended values.
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train_prepared,y_train)

In [None]:
# Score of the fitted TPOT classifier on the held-out split
# (presumably accuracy, since scoring='accuracy' was requested — confirm).
accuracy = tpot_classifier.score(X_test_prepared, y_test)
print(accuracy)

### Best Accuracy ~ 86%