In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import skew, norm 
from warnings import filterwarnings as filt 

filt('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Folder inside the Kaggle input mount that holds the competition files.
base_dir = '/kaggle/input/loan-prediction-based-on-customer-behavior/'
# Read both CSVs in one pass; the order (train, test) matches the unpacking targets.
traindf, testdf = (pd.read_csv(base_dir + fname) for fname in ('Training Data.csv', 'Test Data.csv'))
traindf.shape, testdf.shape

In [None]:
traindf.head()

In [None]:
testdf.head()

In [None]:
# Number of training rows: used later to cut the combined frame back into train / test.
trainId = len(traindf)
target = traindf['Risk_Flag']
# Remove the label and the row identifiers (the two files spell the id column differently).
# NOTE: this cell reassigns traindf/testdf, so running it twice without reloading fails.
traindf = traindf.drop(columns = ['Risk_Flag', 'Id'])
testdf = testdf.drop(columns = ['ID'])
# Stack train on top of test and renumber the rows 0..n-1.
df = pd.concat([traindf, testdf], ignore_index = True)
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Number of distinct values per column (largest first) and that number as a percentage of all rows.
uni = df.nunique().to_frame('unique').sort_values(by = 'unique', ascending = False)
uni['unique %'] = (uni['unique'] / len(df) * 100).round(2)
uni

In [None]:
# Count plots for the low-cardinality columns (at most 5 distinct values), one per panel.
counts = [c for c in df.columns if df[c].nunique() <= 5]
fig, ax = plt.subplots(2, 2)
# Pair columns with panels; zip stops at the shorter side, so fewer than four qualifying
# columns no longer raises IndexError (columns beyond the fourth are still ignored).
for col, panel in zip(counts, ax.flat):
    # Pass the data by keyword: recent seaborn releases do not accept it as a positional `x`.
    sns.countplot(x = df[col], ax = panel)
# Lay out after drawing so that tick and axis labels are taken into account.
fig.tight_layout()

### Data cleaning

In [None]:
categorical_feats = df.select_dtypes(include = 'object').columns
numerical_feats = df.select_dtypes(exclude = 'object').columns
df[categorical_feats].head()

In [None]:
df.Profession.unique()

In [None]:
# Join the whitespace-separated words of each profession with underscores.
df['Profession'] = df['Profession'].map(lambda name : '_'.join(name.split()))

In [None]:
df.CITY.unique().shape

In [None]:
import re
# Keep the text before the first '[' and turn every hyphen or space into an underscore.
df['CITY'] = df['CITY'].map(lambda name : re.sub(r'-| ', '_', name.split('[')[0]))

In [None]:
 # Keep the text before the first '[' and join its whitespace-separated words with underscores.
 df['STATE'] = df['STATE'].map(lambda state : '_'.join(state.split('[')[0].split()))

In [None]:
# Rename the 'norent_noown' category to 'broke'; every other value is left as it is.
df['House_Ownership'] = df['House_Ownership'].replace('norent_noown', 'broke')

In [None]:
df = df.rename(columns = {'Married/Single' : 'Single'})
df.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder as oe

# One-hot encode House_Ownership and swap the original column for the indicator columns.
dummy = pd.get_dummies(df['House_Ownership'], prefix = 'House_Ownership')
house_ownership = df['House_Ownership']  # keep a handle on the raw labels
df = pd.concat([df.drop(columns = ['House_Ownership']), dummy], axis = 1)

# Replace the remaining text columns with the numeric codes produced by OrdinalEncoder.
feats = ['Single', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
encoder = oe()
df[feats] = encoder.fit_transform(df[feats])
df.head()

In [None]:
# Distribution plot (left) and box plot (right) for every column with more than 5 distinct values.
feats_to_plot = [c for c in df.columns if df[c].nunique() > 5]
# squeeze = False keeps `ax` two-dimensional even when only one column qualifies,
# so the ax[ind, 0] / ax[ind, 1] indexing below always works.
fig, ax = plt.subplots(len(feats_to_plot), 2, figsize = (18,13), squeeze = False)
for ind, col in enumerate(feats_to_plot):
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept to preserve the current figure.
    sns.distplot(df[col], ax = ax[ind, 0])
    # Pass the data by keyword: recent seaborn releases do not accept it as a positional `x`.
    sns.boxplot(x = df[col], ax = ax[ind, 1])
# Lay out after drawing so that tick and axis labels are taken into account.
fig.tight_layout()

### Feature engineering

In [None]:
def train_val(x, y, test_size = 0.2, random_state = None):
    """Randomly split features and target into a training and a validation part.

    A fraction `test_size` of the rows of `x` is sampled (without stratification) as the
    validation set; the rows with the same index labels are taken from `y`.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Target values; must carry the same index labels as `x`.
    test_size : float
        Fraction of rows that goes into the validation set.
    random_state : int or None
        Seed forwarded to `DataFrame.sample`. The default (None) keeps the previous
        unseeded behaviour; pass an int to make the split reproducible.

    Returns
    -------
    (train_x, val_x, train_y, val_y)
    """
    # Sample the validation labels once and reuse them for both x and y so the rows stay aligned.
    idx = x.sample(frac = test_size, random_state = random_state).index
    train_x, val_x = x.drop(idx), x.loc[idx]
    train_y, val_y = y.drop(idx), y.loc[idx]
    return train_x, val_x, train_y, val_y

In [None]:
traindf.shape[0], traindf.shape[0] * 0.05 

In [None]:
# The first `trainId` rows of the combined frame are the labelled training rows
# (.loc slicing includes its end label, hence the -1).
x = df.loc[:trainId - 1]
# drop = True: a plain reset_index() would insert the old row labels as an extra 'index'
# column, so the test features would no longer match the training features.
testdf = df.loc[trainId : ].reset_index(drop = True)
# Hold out 5% of the training rows for validation.
train_x, val_x, train_y, val_y = train_val(x, target, 0.05)
train_x.shape, train_y.shape, val_x.shape, val_y.shape

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

def permImp(x, y):
    """Show eli5 permutation importances for a default random forest.

    The forest is fitted on (x, y) and the permutation importance is computed on that
    same data. Returns the eli5 weights display, labelled with the column names of `x`.
    """
    forest = rfc()
    forest.fit(x, y)
    importance = PermutationImportance(forest)
    importance.fit(x, y)
    return eli5.show_weights(importance, feature_names = x.columns.tolist())

def isolate(x, y, col):
    """Draw the pdpbox partial-dependence plot of a single feature.

    A default random forest is fitted on (x, y); `col` names the feature whose
    partial dependence is computed over `x`. Returns the result of `pdp.pdp_plot`.
    """
    forest = rfc()
    forest.fit(x, y)
    isolated = pdp.pdp_isolate(
        forest,
        dataset = x,
        model_features = x.columns,
        feature = col,
    )
    return pdp.pdp_plot(isolated, feature_name = col)

def interact(x, y, col):
    """Draw the pdpbox interaction plot for the features listed in `col`.

    A default random forest is fitted on (x, y); `col` is passed on unchanged as the
    `features` / `feature_names` arguments. Returns the result of `pdp.pdp_interact_plot`.
    """
    forest = rfc()
    forest.fit(x, y)
    interaction = pdp.pdp_interact(
        forest,
        dataset = x,
        model_features = x.columns,
        features = col,
    )
    return pdp.pdp_interact_plot(interaction, feature_names = col)

def forceplot(x, y, n_class = 0):
    """Build a SHAP force plot for one randomly chosen row of `x`.

    A default random forest is fitted on (x, y) and explained with `shap.TreeExplainer`.
    `n_class` indexes both the SHAP values and the expected values returned by the
    explainer (assumes they are organised per class — TODO confirm for the installed shap version).
    """
    forest = rfc().fit(x, y)
    explainer = shap.TreeExplainer(forest)
    # Explain a single random observation.
    row = x.sample(n = 1)
    row_shap = explainer.shap_values(row)[n_class]
    base_value = explainer.expected_value[n_class]
    return shap.force_plot(base_value, row_shap, feature_names = row.columns)

def plot_mi(score):
    """Horizontal bar chart of the `mi_score` column, smallest score drawn first.

    `score` is a DataFrame indexed by feature name. Returns the container from `plt.barh`.
    """
    ordered = score.sort_values(by = 'mi_score')
    return plt.barh(ordered.index, ordered['mi_score'])

def mi_score(x, y):
    """Compute, plot and return the mutual information between each feature and `y`.

    The features are standardised first and all treated as continuous
    (`discrete_features = False`). A bar chart is drawn via `plot_mi`; the returned
    DataFrame has one `mi_score` column, indexed by feature name, highest score first.
    """
    feature_names = x.columns
    standardised = pd.DataFrame(StandardScaler().fit_transform(x), columns = feature_names)
    mi_values = mutual_info_classif(standardised, y, discrete_features = False)
    score = pd.DataFrame({'mi_score' : mi_values}, index = feature_names)
    plot_mi(score)
    return score.sort_values(by = 'mi_score', ascending = False)

In [None]:
mscore = mi_score(val_x, val_y)

In [None]:
permImp(val_x, val_y)

Judging from the mutual information scores and the permutation importance, income, city and profession are the most important features for the model.

In [None]:
plt.figure(figsize = (12,6))
isolate(val_x, val_y, 'Income')

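Hmm — according to the partial dependence plot, the higher the income, the lower the predicted probability of the positive class (`Risk_Flag` = 1), described here as the chance of getting a loan.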

In [None]:
interact(val_x, val_y, ['Income', 'CITY'])

In almost every city, the predicted probability of the positive class (`Risk_Flag` = 1, described here as the chance of getting a loan) is higher when income is low and lower when income is high.

In [None]:
shap.initjs()
forceplot(val_x, val_y, 1)

In [None]:
train_x.head()

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRFClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate, KFold, train_test_split, GridSearchCV

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler , RobustScaler, MinMaxScaler

In [None]:
#         idx = x.sample(frac = frac).index
#         x = x.loc[idx]
#         y = y.loc[idx]

def best_model(x, y, frac = 0):
    """Compare several classifiers, each with and without feature scaling, by cross-validated F1.

    Parameters
    ----------
    x, y : features and target (`y` must support `value_counts`).
    frac : float
        If > 0, only a stratified random sample of this fraction of the rows is used.

    Returns
    -------
    pandas.DataFrame
        One row per model, one column per scaling option ('None', 'std', 'robust',
        'minmax'); each cell is the mean test F1 over a shuffled 5-fold split.
    """
    if frac > 0:
        print(f'Taking the sample size of :===> {x.shape[0] * frac}')
        # The "test" part of the split is the stratified sample we keep; the rest is discarded.
        _, x, _, y = train_test_split(x, y, test_size = frac, stratify = y)
        print('target variable split %')
        print(y.value_counts()/ y.shape[0])

    xgb.set_config(verbosity=0)
    models = [LogisticRegression(), SVC(), KNeighborsClassifier(), GaussianNB(), rfc(), XGBRFClassifier(), LGBMClassifier()]
    names = ['logistic regg', 'svm', 'knn', 'naive bayes', 'random forest', 'xgboost', 'lightgb']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    scores = [[] for _ in scalers]
    # Fixed seed: every model / scaler combination is scored on the same folds.
    cv = KFold(5, shuffle = True, random_state = 123)
    for model in models:
        for idx, scaler in enumerate(scalers):
            # Build a fresh estimator for each scaler. Reassigning `model` here (as the
            # previous version did) nested the pipelines, so 'robust' was really
            # standard + robust scaling and 'minmax' applied all three scalers.
            if scaler is None:
                estimator = model
            else:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            score = cross_validate(estimator, X = x, y = y, cv = cv, scoring = 'f1', verbose = 0)['test_score'].mean()
            scores[idx].append(score)
    return pd.DataFrame(scores, columns = names, index = ['None', 'std', 'robust', 'minmax']).T

def clf_report(yt, pred):
    """Print the classification report for true labels `yt` and predictions `pred`,
    with one blank line before and after it."""
    report = classification_report(yt, pred)
    print(f"\n{report}\n")
    
def get_score(xt, yt, xtest, ytest, model, scaler = None):
    """Fit `model` on the training data and report how it does on the test data.

    Prints the `score` of the fitted estimator on both sets and the classification
    report for the test predictions, then draws the test confusion matrix as a heatmap.
    When `scaler` is given, the model is wrapped in a scaler -> model pipeline first;
    otherwise the estimator that was passed in is the object that gets fitted.
    Returns nothing.
    """
    estimator = model
    if scaler:
        estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
    estimator.fit(xt, yt)
    pred = estimator.predict(xtest)

    print(' Report '.center(60,'='))
    print()
    print(f"training score  :===>  {estimator.score(xt, yt)}")
    print(f"testing score   :===>  {estimator.score(xtest, ytest)}")
    clf_report(ytest, pred)

    matrix = confusion_matrix(ytest, pred)
    sns.heatmap(matrix, fmt = '.1f', annot = True)
    
    
def gridcv(xt, yt, model, params, scaler = None, frac = 0, scoring = 'f1_micro'):
    """Grid-search `params` for `model` with a shuffled 5-fold cross-validation.

    Parameters
    ----------
    xt, yt : training features and target (`yt` must support `value_counts`).
    model : estimator to tune.
    params : dict
        Parameter grid. When `scaler` is given the estimator is a pipeline whose steps
        are named 'scaler' and 'model', so the keys must then be prefixed with 'model__'.
    scaler : optional transformer placed in front of the model.
    frac : float
        If > 0, only a stratified random sample of this fraction of the rows is searched.
    scoring : str
        Metric passed to GridSearchCV. The default 'f1_micro' is the value that used to
        be hard-coded; for a single-label target micro-averaged F1 equals accuracy, so
        pass 'f1' to rank candidates by the positive-class F1 instead.

    Returns
    -------
    (best_estimator, best_params, results)
        `results` holds mean train score, mean test score and params for every
        candidate, best mean test score first.
    """
    if frac > 0:
        print(f'Taking the sample size of :===> {xt.shape[0] * frac}')
        # The "test" part of the split is the stratified sample we keep; the rest is discarded.
        _, xt, _, yt = train_test_split(xt, yt, test_size = frac, stratify = yt)
        print('target variable split %')
        print(yt.value_counts()/ yt.shape[0])

    if scaler:
        model = Pipeline(steps = [('scaler', scaler), ('model', model)])
    cv = KFold(5, shuffle = True, random_state = 123)
    clf = GridSearchCV(model, param_grid = params, cv = cv, scoring = scoring, return_train_score = True, verbose = 1)
    clf.fit(xt, yt)
    res = pd.DataFrame(clf.cv_results_).sort_values('mean_test_score', ascending = False)
    return clf.best_estimator_, clf.best_params_, res[['mean_train_score','mean_test_score','params']]
    

In [None]:
# Class balance of the training target.
# Pass the data by keyword: recent seaborn releases do not accept it as a positional `x`.
sns.countplot(x = train_y)
# Use the per-class counts as legend labels. sort_index() orders them by class value,
# which is presumably the order in which countplot draws the bars — verify on the figure.
plt.legend(train_y.value_counts().sort_index())

In [None]:
train_y.value_counts()

In [None]:
# The training set is large, so try random undersampling to deal with the class imbalance.
# Disadvantage: rows are thrown away, i.e. information is lost.
from imblearn.under_sampling import RandomUnderSampler

# NOTE: despite the variable name this is a RandomUnderSampler, not SMOTE.
# No random_state is set, so the selected rows differ from run to run.
smot = RandomUnderSampler()
# Resample the training split only; us_x / us_y are the undersampled features / target.
us_x, us_y = smot.fit_resample(train_x, train_y)
us_x.shape, us_y.shape

In [None]:
# Class balance after undersampling.
# Pass the data by keyword: recent seaborn releases do not accept it as a positional `x`.
sns.countplot(x = us_y)

In [None]:
58940 * 0.4

In [None]:
# score = f1
best_model(us_x, us_y, frac = 0.4)

In [None]:
# Random-forest hyper-parameter grid.
params = dict(
    n_estimators = [100,200,300],
    max_depth = [8,12,16,20,None],
    criterion = ['gini', 'entropy'],
    bootstrap = [True, False],
    class_weight = [None, 'balanced'],
)
# Same grid with keys addressed to the 'model' step of the pipeline that gridcv
# builds when a scaler is supplied (not needed for the call below, which passes no scaler).
pipeline_params = {'model__' + key : value for key, value in params.items()}

# Search on a 4% stratified sample of the undersampled training data.
clf, best_param, results = gridcv(us_x, us_y, rfc(), params, scaler = None, frac = 0.04)

In [None]:
# Mean train and test score of every grid candidate; `results` is ordered by
# mean test score (best first), so the x axis is the candidate's rank.
rank = np.arange(len(results))
for column in ('mean_train_score', 'mean_test_score'):
    sns.lineplot(x = rank, y = results[column])
plt.title('f1 score comparision for train and test')
plt.legend(['training score', 'testing score'])

In [None]:
results.head()

In [None]:
best_param

In [None]:
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(us_x, us_y, test_size = 0.2, stratify = us_y)

In [None]:
get_score(new_x_train, new_y_train, new_x_test, new_y_test, rfc(max_depth = 16))

In [None]:
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(us_x.drop(['House_Ownership_broke'], axis = 1), us_y, test_size = 0.2, stratify = us_y)

In [None]:
get_score(new_x_train, new_y_train, new_x_test, new_y_test, rfc(max_depth = 16))

### Original dataset (without undersampling)

In [None]:
# xt, val_x, yt, val_y = train_val(train_x, train_y)
xt, val_x, yt, val_y = train_test_split(train_x, train_y, test_size = 0.2, stratify = train_y)

In [None]:
get_score(xt, yt, val_x, val_y, rfc(max_depth = 15, class_weight = 'balanced'))

In [None]:
from sklearn.utils import class_weight

# Weights that the 'balanced' heuristic assigns to the classes of the training target.
# `classes` and `y` are passed by keyword because current scikit-learn releases declare
# them keyword-only (the positional form fails there). np.unique returns the classes
# sorted, so the weights come back in ascending class order.
class_weight.compute_class_weight('balanced', classes = np.unique(yt), y = yt)

In [None]:
weights = np.linspace(0.0,1.0, 20)
weights

In [None]:
# Candidate class weights: complementary pairs {0: w, 1: 1 - w} for every w in `weights`,
# plus the 'balanced' preset and no weighting at all.
# The earlier entries 'auto' and the string 'None' are not values the random forest accepts
# for class_weight, so those candidates could never be fitted; the real None replaces them.
# NOTE(review): the end points of `weights` (0.0 and 1.0) give one class a weight of zero.
params = {
    'class_weight' : [{0: x, 1: abs(1 - x)} for x in weights] + ['balanced', None]
}
# Search on a 4% stratified sample of the (imbalanced) training split.
clf, best_weight, results = gridcv(xt, yt, rfc(max_depth = 15), params, None, 0.04)

In [None]:
# Mean train and test score of every class-weight candidate; `results` is ordered by
# mean test score (best first), so the x axis is the candidate's rank.
rank = np.arange(len(results))
for column in ('mean_train_score', 'mean_test_score'):
    sns.lineplot(x = rank, y = results[column])
plt.title('f1 score comparision for train and test')
plt.legend(['training score', 'testing score'])

In [None]:
results.head()

In [None]:
best_weight

In [None]:
get_score(xt, yt, val_x, val_y, clf)