### note: To run the code, a library called hyperopt must be installed

###  pip install hyperopt

### Use cases for HyperOpt

### 1) Hyper parameters optimization!
### 2) Creating a pipeline of execution of ML models

In [37]:
import pandas as pd
import plotly.express as pe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,accuracy_score
from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval
from hyperopt.early_stop import no_progress_loss
import numpy as np


## Step 1 : Gather the data

In [38]:
path = r"C:\Users\harsh\Desktop\OptimizingMLModelsOFSS\datasets\Loan_Status_Classification.csv"
loan_df = pd.read_csv(path)
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,0,0,6608,0,137,180,1,1,1
1,0,1,2,0,0,4226,1040,110,360,1,1,1
2,1,1,0,1,0,3167,2283,154,360,1,2,1
3,0,0,0,1,1,6950,0,175,180,1,2,1
4,0,1,0,1,0,3993,3274,207,360,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
293,1,0,0,1,0,3846,0,111,360,1,2,1
294,0,0,0,1,0,2435,0,75,360,1,1,0
295,0,0,2,1,0,4923,0,166,360,0,2,1
296,0,1,3,0,0,2071,754,94,480,1,2,1


## step 2 : Exploratory Data Analysis (EDA)

In [39]:
loan_df.shape

(298, 12)

In [40]:
loan_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [41]:
loan_df.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      257
CoapplicantIncome    150
LoanAmount           145
Loan_Amount_Term       9
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [42]:
real_value_columns_loan_df = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

categorical_columns_loan_df = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed','Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

### mark the categorical columns by setting their data type

In [43]:
loan_df[   categorical_columns_loan_df ] =  loan_df[   categorical_columns_loan_df  ].astype('str')

In [44]:
loan_df[real_value_columns_loan_df].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,298.0,298.0,298.0
mean,5351.265101,1673.026846,143.560403
std,6306.080712,2892.404818,80.395182
min,150.0,0.0,9.0
25%,2883.75,0.0,99.25
50%,3854.0,1106.0,125.5
75%,5721.5,2281.0,171.5
max,81000.0,33837.0,600.0


In [45]:
loan_df[real_value_columns_loan_df]

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,6608,0,137
1,4226,1040,110
2,3167,2283,154
3,6950,0,175
4,3993,3274,207
...,...,...,...
293,3846,0,111
294,2435,0,75
295,4923,0,166
296,2071,754,94


In [46]:
loan_df[categorical_columns_loan_df].describe()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,298,298,298,298,298,298,298,298,298
unique,2,2,4,2,2,9,2,3,2
top,0,1,0,1,0,360,1,2,1
freq,245,187,176,235,258,248,231,115,150


In [47]:
sc = StandardScaler()
loan_df[real_value_columns_loan_df] = sc.fit_transform(loan_df[real_value_columns_loan_df])

### step 3: Features & target column

### features: columns for judgement
#### target :  result column to be predicted by the model

In [48]:
# remove Loan_Status from list of columns
categorical_columns_loan_df.remove(  "Loan_Status"  )

#put it into a separet list
target = ["Loan_Status"]

#remaining real & categorial columns can now be combined into a list called features
features = categorical_columns_loan_df

#### step 4 : Separate dataset into training and testing test

In [49]:
X_train, X_test, y_train, y_test = train_test_split(  
    loan_df[features],loan_df[target],
    test_size=0.2,
    random_state=10,
    stratify = loan_df[target]
)

#### HYPEROPT!!

Age     Income   Loan_Amount     Status(target column)

0.1   0.6  0.3 (weights)


L1 regularization : It helps the model to elimiate non-important features completely from the process of generating result.
    Advantages:
        1) Reduce features to be used for final answers.
        2) Identify how important a feature is to the end result

L2 regularization: Solves the problem of overfitting (model should not show great performance in training and significantly become inaccurate during testing)

C (learning rate???)



Age: less important
Income & Loan_Amount is more important.

If model is able to do this, accuracy of the model

### HYPEROPT: 

    #1) Create a search space: A set of values to be TRIED for various parameters
    #2) Create an objective function that will work on minimization principle to find the best model
    #3) Apply the objective function on the search space

Note:

    a)Keys need to match with parameters to be adjusted
    b) if there are options to choose from (a set of values), use hp.choice function
        but
        if there is a range of values in mind for a certain parameter choose hp.uniform

## how to set max_evals

#step 1: identify hp.choice parameters OPTIONS
            max_features(2), criterion(3) & splitter(2)
                    Total = 7

# step 2: 
        identify 
                hp.uniform, hp.quniform, hp.loguniform, hp.qloguniform
            JUST COUNT OCCURRENCES
            e.g: 1 occurence of such function hp.uniform in min_impurity_decrease
                Total = 1
            
# step 3: 
            step 2 number * 20 and step 1 number * 15
                    20 *1   and  15 * 7

                    =125


1)Non-categorical choices lines (0)

2) How many possible entries for categorical hyperparameters are present in the search space (15)


Non-categorical count from step 1 *   20     +      categorical items count from step 2 * 15

=  0 * 20 +   15 * 15 = 225

In [50]:
#keys of this dictionary are names of parameters
space ={
    "max_features" : hp.choice("feature choice", ["sqrt", "log2"]),
    'max_depth' : hp.choice(  'depth parameter',[1,2,3,4,5,6,7,8]  ), #choose some number between 1 to 8
    'criterion' : hp.choice(  'criteria parameter' ,   ['gini',"entropy", 'log_loss'] ),
    'splitter' : hp.choice("splitter choice", ["best", "random"]),
    'min_impurity_decrease' : hp.uniform("impurity factor", 0, 0.02)
}

step 1: count lines where non-categorical choices have been made? (1)
step 2: count entries in lines where hp.choice has been used = 15


ans = non-categorical count multiplied by 20    plus  categorical items count multiplied by 15


    =    1    *   20       +      15    *  15
    = 245

In [51]:
# trial 1: max_depth: 4, criterion: gini, splitter : "best" ------------> -57
# trial 2: max_depth : 7, criterion : gini, splitter : "random"-----------> -70

In [52]:
scores=[]
def objective(space):
    model = DecisionTreeClassifier(**space)

    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    accuracy = accuracy_score(y_test,pred) #-57 -70: hyperopt selects -70


    scores.append(accuracy)
    return {'loss': -accuracy,'status':STATUS_OK}

best feature: 
            i) ENTROPY
            ii) GINI

In [53]:
trials = Trials() #create a database (in-memory)

      #fmin is the function to be used for minimum optimization
best = fmin(fn=objective, #function
            space=space, #search space parameters
            algo=tpe.suggest, #next combination has to be picked intelligently(bayesian optimization)
            max_evals=125,
            early_stop_fn=no_progress_loss(  iteration_stop_count=25, percent_increase=0.001 ), 
            trials=trials)

print (space_eval(space, best))

 24%|██▍       | 30/125 [00:00<00:01, 68.21trial/s, best loss: -0.7666666666666667]
{'criterion': 'gini', 'max_features': 'sqrt', 'min_impurity_decrease': 0.01081665003522246, 'splitter': 'best'}


In [54]:
for entry in trials:
    print(entry)
    break

{'state': 2, 'tid': 0, 'spec': None, 'result': {'loss': -0.5, 'status': 'ok'}, 'misc': {'tid': 0, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'criteria parameter': [0], 'feature choice': [0], 'impurity factor': [0], 'splitter choice': [0]}, 'vals': {'criteria parameter': [0], 'feature choice': [0], 'impurity factor': [0.0064697032313749195], 'splitter choice': [0]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2023, 2, 28, 9, 29, 54, 536000), 'refresh_time': datetime.datetime(2023, 2, 28, 9, 29, 54, 547000)}
