### Objective: choose the best algorithm (decision tree, random forest, logistic regression or SVM) together with the right hyperparameters for classifying credit-risk customers as loan accepted or loan rejected

In [None]:
#for loading & visualization
import os

import pandas as pd
import plotly.express as pe

#for preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

#for dimensionality reduction
from sklearn.decomposition import PCA

#for model algorithms
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#for hyperopt functions
from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval

#for stopping hyperopt early
from hyperopt.early_stop import no_progress_loss

#for metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score


: 

In [None]:
# Location of the dataset. The default is the original author's local file;
# set the CREDIT_RISK_DATA environment variable to point at the file on any
# other machine instead of editing this cell.
path = os.environ.get(
    "CREDIT_RISK_DATA",
    r"C:\Users\harsh\Desktop\NPCI-Python-ML\datasets\Balanced_credit_Risk.txt",
)

# Read the comma-separated file into the frame used by every later cell.
df = pd.read_csv(path)
df

: 

## step 2: Data exploration & preprocessing

In [None]:
# Correlation of each numeric column with the target, shown as a single row.
# The categorical columns are label-encoded only later in the notebook, so they
# are excluded here; calling .corr() on non-numeric columns raises in
# pandas >= 2.0.
(
    df.drop(columns=['Unnamed: 0', 'index'])
    .select_dtypes(include='number')
    .corr()
    .loc[['loan_status']]
)

: 

In [None]:
# One strip plot per numeric column, grouped by loan outcome on the x axis.
numeric_columns = [
    "person_age",
    "person_income",
    "person_emp_length",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
]
for col in numeric_columns:
    strip_fig = pe.strip(df, x='loan_status', y=col)
    display(strip_fig)

: 

In [None]:
# Contingency table of every categorical column against the loan outcome.
categorical_columns = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
]
for col in categorical_columns:
    counts_by_status = pd.crosstab(df[col], df['loan_status'])
    display(counts_by_status)

: 

In [None]:
# Pairwise scatter plots of all columns except the two leftover index columns,
# coloured by loan outcome.
plot_frame = df.drop(columns=['Unnamed: 0', 'index'])
pe.scatter_matrix(plot_frame, color='loan_status', width=1800, height=1900)

: 

In [None]:
# Print the shape, the column labels and the row index, one per line.
for overview_item in (df.shape, df.columns, df.index):
    print(overview_item)

: 

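Conclusion: the frame has at least 14 columns (the 11 features, the `loan_status` target and the leftover `Unnamed: 0` / `index` columns); read the exact row and column counts from the shape printed above. Row numbers are set as the index.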

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df.info()

# Number of missing values in each column.
display(df.isna().sum())

: 

Conclusion: No missing data in the dataset

In [None]:
# Number of distinct values per column, used to tell categorical columns from
# real-valued ones.
unique_counts = df.nunique()
display(unique_counts)

: 

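person_age, person_income, person_emp_length, loan_amnt, loan_int_rate, loan_percent_income & cb_person_cred_hist_length are real-value columns

loan_status is the target (categorical)

person_home_ownership, loan_intent, loan_grade & cb_person_default_on_file are categorical feature columns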

### Visualizing relation between feature & target

#  step 2b) Preprocess the data

In [None]:
# Columns holding category labels; these are label-encoded before modelling.
categorical_features = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
]

# Columns holding real-valued measurements; these are standardised before modelling.
real_value_features = [
    "person_age",
    "person_income",
    "person_emp_length",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
]

: 

### Scaling of features

In [None]:
# Show the column labels of the loaded frame.
df.columns

: 

In [None]:
sc = StandardScaler()

# Standardise all real-valued columns in one call. StandardScaler works column
# by column, so the values written to df are the same as scaling each column
# separately, but the scaler now keeps the mean/scale of every column instead
# of only the last one, so it can be reused (e.g. inverse_transform).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed inside `objective`, so test rows influence the scaling.
df[real_value_features] = sc.fit_transform(df[real_value_features])

# After scaling, each column should have mean ~0 and standard deviation ~1.
display(df[real_value_features].describe())

# Box plots of the scaled columns on a shared axis.
fig = pe.box(y=real_value_features, data_frame=df)
display(fig)

: 

### categorical columns encoding

In [None]:
# Fitted encoder for every categorical column, keyed by column name, so the
# integer codes can be mapped back to the original labels later
# (label_encoders[col].inverse_transform / .classes_).
label_encoders = {}

for col in categorical_features:
    # A fresh encoder per column: refitting one shared encoder would discard
    # the label mapping of every column except the last.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Show the encoded categorical columns.
df[categorical_features]

: 

In [None]:
# Project the real-valued features onto their first principal components.
n_pca_components = 5
model = PCA(n_components=n_pca_components)

ans = model.fit_transform(df[real_value_features])

# Name the projected columns PCA1 ... PCA5.
pca_column_names = [f"PCA{i}" for i in range(1, n_pca_components + 1)]
result = pd.DataFrame(ans, columns=pca_column_names)

# Share of the total variance captured by each component.
print(model.explained_variance_ratio_)

: 

In [None]:
# Relative frequency of each loan_status class.
df['loan_status'].value_counts(normalize=True)

: 

### creation of search space

## objective: to create one set of parameters per algorithm (four here: SVM, logistic regression, decision tree and random forest).
            Hyperopt can only select one set at a time.
            Depending on which algorithm's set is selected (e.g. the SVM set or the logit set), the operations also differ

steps: 
    a) Create separate dictionaries with parameters and their available options
    b) add a model_type key inside the dictionary to label the dictionary
    c) put all the dictionaries into a list/tuple to specify that this is a collection of parameter sets!
    d) since we need to choose one entry from the list/tuple created in step c, use hp.choice on this list

Search Space is made up of many smaller search space.

Each smaller space is corresponding to one type of algorithm

space for dtree
{}
space for logistic regression
{}
space for random forest
{}
space svm
{}

choosing the right algorithm is also a search space decision

choose an algorithm using hp.choice from a list of [{}, {},{},{},{}]

In [None]:
# Candidate values shared by the tree-based sub-spaces.
depth_options = list(range(1, 12))            # max_depth candidates 1 .. 11
estimator_options = list(range(5, 125, 20))   # n_estimators candidates 5, 25, ..., 105

# Search space: hp.choice first picks ONE of the four dictionaries below, then
# samples that dictionary's own hyperparameters. Apart from 'model_type', the
# keys of each dictionary are the constructor parameter names of the matching
# estimator. The first argument of every hp.* call is only hyperopt's label
# for that dimension; the labels are kept exactly as they were.
space = hp.choice(
    'algorithm',
    [
        # Support vector classifier
        {
            'model_type': "SVM",
            # C sampled uniformly between 0 and 10
            'C': hp.uniform('alpha parameter_SVM', 0, 10),
            # kernel options to choose from
            'kernel': hp.choice('penalty type name_SVM', ['rbf', 'poly', "sigmoid"]),
            'degree': hp.choice('degree', [1, 2, 3, 4]),
            "gamma": hp.choice("gamma value", ["scale", "auto"]),
        },

        # Logistic regression
        {
            'model_type': "logit",
            # C sampled uniformly between 0 and 3
            'C': hp.uniform('alpha parameter_logit', 0, 3),
            # two penalty options to choose from
            'penalty': hp.choice('penalty type name_logit', ['l1', 'l2']),
            # single solver option
            'solver': hp.choice('solver', ['liblinear']),
        },

        # Decision tree
        {
            'model_type': "dtree",
            'criterion': hp.choice('criteria_tree', ['gini', 'entropy', 'log_loss']),
            'splitter': hp.choice('splitter_tree', ['best', 'random']),
            'max_depth': hp.choice('depth_tree', depth_options),
        },

        # Random forest
        {
            'model_type': "random_forest",
            'criterion': hp.choice('criteria_forest', ['gini', 'entropy', 'log_loss']),
            'max_depth': hp.choice('depth_forest', depth_options),
            'n_estimators': hp.choice('estimator_count_forest', estimator_options),
            'max_features': hp.choice('feature_count_forest', ['sqrt', 'log2']),
        },
    ],
)


: 

### steps

a) Accept the entire search space as a parameter. One algorithm will be passed to the model at a time from this space

b) find out which algorithm is being picked in the current iteration by reading the model_type entry of the parameter set.

c) Since model_type is not a parameter for any ML algorithm class in sklearn, delete it before passing the parameter dictionary to your model

d) use an if condition check to run code according to the selected algorithm

{

    "max_depth" : hp.choice( [3,4,5])
}

In [None]:
# Weighted F1 score of every evaluated trial, in evaluation order.
scores = []


def _resolve_model(algo):
    """Return ``(estimator class, feature columns)`` for a ``model_type`` label.

    The SVM is trained on the real-valued features only; the other three
    models use the real-valued plus the categorical features.

    Raises:
        ValueError: if ``algo`` is not one of the known labels.
    """
    all_features = real_value_features + categorical_features
    registry = {
        "logit": (LogisticRegression, all_features),
        "SVM": (SVC, real_value_features),
        "dtree": (DecisionTreeClassifier, all_features),
        "random_forest": (RandomForestClassifier, all_features),
    }
    if algo not in registry:
        # Fail with a clear message instead of silently returning None.
        raise ValueError(
            f"Unknown model_type {algo!r}; expected one of {sorted(registry)}"
        )
    return registry[algo]


def objective(space):
    """Train and score one candidate model for hyperopt.

    Args:
        space: one parameter set sampled from the search space. It holds a
            'model_type' entry naming the algorithm; every other entry is
            passed to the estimator's constructor.

    Returns:
        dict with 'loss' (negative weighted F1 on the held-out split, since
        fmin minimises), 'status' (STATUS_OK) and 'algo' (the model_type).

    Raises:
        KeyError: if ``space`` has no 'model_type' entry.
        ValueError: if 'model_type' names an unknown algorithm.
    """
    # Work on a copy so the dictionary handed in by the caller is not mutated.
    params = dict(space)

    # 'model_type' only labels the parameter set; it is not an estimator
    # parameter, so it is removed before the estimator is built.
    algo = params.pop('model_type')
    estimator_cls, features = _resolve_model(algo)
    target = 'loan_status'

    # Same fixed, stratified 60/40 split for every trial so scores are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target],
        test_size=0.4,
        random_state=10,
        stratify=df[target],
    )

    # Seed the estimator (unless the space already sets it) so a given
    # parameter set always yields the same score; all four estimators accept
    # random_state.
    params.setdefault('random_state', 10)
    model = estimator_cls(**params)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    weighted_f1 = f1_score(y_test, pred, average='weighted')
    scores.append(weighted_f1)
    return {'loss': -weighted_f1, 'status': STATUS_OK, "algo": algo}
    

: 

In [None]:
# In-memory store of every trial hyperopt evaluates.
trials = Trials()

# Early-stopping rule built with iteration_stop_count=100 and percent_increase=0.1.
early_stop = no_progress_loss(iteration_stop_count=100, percent_increase=0.1)

# fmin minimises the loss returned by `objective` over the search space,
# using TPE to suggest candidates, for at most 1000 evaluations.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
    early_stop_fn=early_stop,
)

# `best` is keyed by hyperopt labels; space_eval maps it back to the actual
# parameter values of the search space.
best_params = space_eval(space, best)
print(best_params)

: 