# Modeling

## A. Import Library

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [54]:
import numpy as np     # for calculation
import pandas as pd     # for manipulating DataFrame
import seaborn as sns     # for plotting data
import matplotlib.pyplot as plt     # for plotting data
import yaml     # for interacting with config.yaml  
import src.util as util     # import common function
import os     # library for interacting with directory

In [3]:
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [4]:
from sklearn.metrics import confusion_matrix

## B. Import Data

In [49]:
# Load the project configuration (file paths for data and model artifacts) via the shared util helper.
config = util.load_config()

In [50]:
# Show the loaded configuration so the paths used below are visible in the notebook.
config

{'raw_dataset_dir': 'data/raw/',
 'raw_data_path': 'data/processed/raw_data.pkl',
 'data_train_path': ['data/processed/X_train_clean.pkl',
  'data/processed/y_train_clean.pkl'],
 'data_test_path': ['data/processed/X_test_clean.pkl',
  'data/processed/y_test_clean.pkl'],
 'data_modeling_path': ['data/processed/train_data_modeling.pkl',
  'data/processed/test_data_modeling.pkl'],
 'ohe_train_path': 'models/ohe_train.pkl',
 'standard_scaler_path': 'models/standard_scaler_train.pkl',
 'modeling_summary': 'models/modeling_summary.pkl',
 'production_model': 'models/production_model.pkl'}

In [7]:
# 'data_modeling_path' holds two pickle paths: index 0 is the training bundle, index 1 the test bundle.
train_data = util.load_pickle(file_path=config['data_modeling_path'][0])
test_data = util.load_pickle(file_path=config['data_modeling_path'][1])

In [8]:
# Inspect the training bundle: a dict whose 'X_train' entry is keyed by data variant (e.g. 'unbalance').
train_data

{'X_train': {'unbalance':             GR  ILD_log10  DeltaPHI     PHIND        PE    marine  non_marine
  2720 -0.547132   1.102230 -0.059163 -0.881636  0.431062  0.964766   -0.964766
  3372 -0.484870   0.567029  0.328611 -0.348468 -0.709061  0.964766   -0.964766
  3327  1.173602  -0.352723 -0.103083 -0.001143 -0.600875  0.964766   -0.964766
  2756 -0.849761   0.281589 -0.659780 -0.437561  2.095475  0.964766   -0.964766
  3299 -0.832740   0.836612 -1.020339 -1.094904  0.930386  0.964766   -0.964766
  ...        ...        ...       ...       ...       ...       ...         ...
  3993  0.818616  -0.590590  1.367305  0.818795 -0.721831 -1.036521    1.036521
  2579 -1.297748   0.023899  0.034684 -0.930359  0.549948  0.964766   -0.964766
  485  -0.330627   0.959510 -0.359472 -1.146132  0.431062  0.964766   -0.964766
  1214  0.176967  -1.944712  0.147300  0.433884 -0.986067 -1.036521    1.036521
  1966  0.096594   1.205306 -0.040393 -1.121075  1.619928 -1.036521    1.036521
  
  [3319 rows 

In [10]:
# Inspect the test bundle: a dict with 'X_test' features and 'y_test' labels.
test_data

{'X_test':             GR  ILD_log10  DeltaPHI     PHIND        PE    marine  non_marine
 1226  0.050196  -0.600855 -3.231176  2.062623 -1.128731  0.964766   -0.964766
 2561  0.057447  -0.221897  0.015915 -0.352644 -0.757805 -1.036521    1.036521
 1643 -0.092540   0.535313  0.482895 -0.561874 -0.634052 -1.036521    1.036521
 1122  1.112974  -0.209340 -2.105017  1.603235 -0.807737 -1.036521    1.036521
 4077 -0.411646   0.995190  0.053453 -0.651942  0.590860  0.964766   -0.964766
 ...        ...        ...       ...       ...       ...       ...         ...
 4099  0.228676   0.412416  0.147300 -0.076315  0.081348  0.964766   -0.964766
 3422 -1.076342   1.431279 -0.302413 -0.689389  0.743734  0.964766   -0.964766
 3204  0.218463  -0.154501 -2.743174  1.564953 -0.995578 -1.036521    1.036521
 382  -1.031679   1.098265 -0.939631 -1.113975  0.437006  0.964766   -0.964766
 1401  0.008495  -0.285328  0.359018 -0.263133 -0.721831 -1.036521    1.036521
 
 [830 rows x 7 columns],
 'y_test': 1226

## B. Create Baseline Model

In [11]:
# Baseline: a classifier that always predicts the most frequent label of the unbalanced training set.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X=train_data["X_train"]["unbalance"],
              y=train_data["y_train"]["unbalance"])
# The baseline is scored on the same training rows it was fitted on.
y_pred_dummy = dummy_clf.predict(train_data["X_train"]["unbalance"])

In [12]:
# Confusion matrix of the baseline predictions against the true training labels.
dummy_conf_matrix = confusion_matrix(y_true=train_data["y_train"]["unbalance"], 
                                     y_pred=y_pred_dummy)
# Print the matrix with per-class metrics, then the overall accuracy (helpers from src.util).
util.display_cm(dummy_conf_matrix, hide_zeros=False, display_metrics=True)
print('Facies classification accuracy = %.2f' % util.accuracy(dummy_conf_matrix))
print("---------------------------------------------------------------------")
# Same report for the "adjacent facies" view.
# NOTE(review): presumably a prediction in a neighbouring facies counts as correct here; confirm in src.util.
util.display_adj_cm(cm=dummy_conf_matrix)
print('Adjacent facies classification accuracy = %.2f' % util.accuracy_adjacent(dummy_conf_matrix))

     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS     0   214     0     0     0     0     0     0     0   214
     CSiS     0   752     0     0     0     0     0     0     0   752
     FSiS     0   624     0     0     0     0     0     0     0   624
     SiSh     0   217     0     0     0     0     0     0     0   217
       MS     0   237     0     0     0     0     0     0     0   237
       WS     0   465     0     0     0     0     0     0     0   465
        D     0   113     0     0     0     0     0     0     0   113
       PS     0   549     0     0     0     0     0     0     0   549
       BS     0   148     0     0     0     0     0     0     0   148

Precision  0.00  0.23  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.05
   Recall  0.00  1.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.23
       F1  0.00  0.37  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.08
Facies classification accuracy = 0.23
-----------------------------------------

From the baseline model's scores, we need to build a model whose accuracy is higher than 0.23 and whose adjacent accuracy is higher than 0.48.

## C. Create Model Object

In [24]:
def create_model_object() -> list:
    """Instantiate every candidate classifier with its default settings.

    Returns
    -------
    list
        One dict per classifier, in a fixed order, with the keys
        ``"model_name"`` (the estimator's class name) and
        ``"model_object"`` (the unfitted estimator instance).
    """
    candidates = (
        KNeighborsClassifier(),      # k-nearest neighbors
        DecisionTreeClassifier(),    # decision tree
        LogisticRegression(),        # logistic regression
        SVC(),                       # support vector machine
        RandomForestClassifier(),    # random forest
        XGBClassifier(),             # extreme gradient boosting
    )
    return [
        {"model_name": type(estimator).__name__, "model_object": estimator}
        for estimator in candidates
    ]

In [25]:
# Build the list of candidate estimators and display it.
model = create_model_object()
model

[{'model_name': 'KNeighborsClassifier',
  'model_object': KNeighborsClassifier()},
 {'model_name': 'DecisionTreeClassifier',
  'model_object': DecisionTreeClassifier()},
 {'model_name': 'LogisticRegression', 'model_object': LogisticRegression()},
 {'model_name': 'SVC', 'model_object': SVC()},
 {'model_name': 'RandomForestClassifier',
  'model_object': RandomForestClassifier()},
 {'model_name': 'XGBClassifier',
  'model_object': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
  

In [15]:
def model_hyperparameter(model_name:str) -> dict:
    """Return the grid-search parameter grid for one supported classifier.

    Parameters
    ----------
    model_name : str
        Class name of the estimator. One of "KNeighborsClassifier",
        "DecisionTreeClassifier", "LogisticRegression", "SVC",
        "RandomForestClassifier" or "XGBClassifier".

    Returns
    -------
    dict
        Maps each constructor-argument name to the candidate values to try.

    Raises
    ------
    KeyError
        If ``model_name` is not one of the supported class names.
    """
    knn_hyper_parameter = {
        "algorithm" : ["ball_tree", "kd_tree", "brute"],
        "n_neighbors" : [2, 5, 10, 25],
        "leaf_size" : [2, 5, 10, 25],
    }
    dct_hyper_parameter = {
        "criterion" : ["gini", "entropy", "log_loss"],
        "max_depth" : [1,5,10],
        "min_samples_split" : [2, 5, 10],
        "min_samples_leaf" : [1, 2, 4]
    }
    logreg_hyper_parameter = {
        # Only "l2" is searched: LogisticRegression's default solver ("lbfgs")
        # rejects "l1" and "elasticnet", so those candidates fail on every fold
        # and can never be selected. They can be reinstated together with a
        # compatible `solver` entry (e.g. "saga"), provided the code consuming
        # the best parameters forwards the solver as well.
        "penalty" : ["l2"],
        "C" : [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
        "max_iter" : np.arange(100,210,10)    # 100, 110, ..., 200
    }
    svm_hyper_parameter = {
        "C" : [0.001, 0.05, 0.1, 1, 5, 10, 25, 50],
        "kernel" : ["linear","rbf"]
    }
    rfc_hyper_parameter = {
        "criterion" : ["gini", "entropy", "log_loss"],
        "n_estimators" : [1,5,10],
        "min_samples_split" : [2, 5, 10],
        "min_samples_leaf" : [1, 2, 4]
    }
    xgb_hyper_parameter = {
        "n_estimators" : [1,5,10,25,50,100]
    }

    list_of_hyper_parameter = {
        "KNeighborsClassifier" : knn_hyper_parameter,
        "DecisionTreeClassifier" : dct_hyper_parameter,
        "LogisticRegression" : logreg_hyper_parameter,
        "SVC" : svm_hyper_parameter,
        "RandomForestClassifier" : rfc_hyper_parameter,
        "XGBClassifier" : xgb_hyper_parameter
    }

    if model_name not in list_of_hyper_parameter:
        # Same exception type as a plain dict lookup, but with an actionable message.
        raise KeyError(
            f"No hyperparameter grid defined for {model_name!r}; "
            f"expected one of {sorted(list_of_hyper_parameter)}"
        )
    return list_of_hyper_parameter[model_name]

In [65]:
def _build_tuned_estimator(model_name: str, prototype, best_params: dict):
    """Create an unfitted estimator of the prototype's class from the winning grid point.

    Every key in ``best_params`` is forwarded to the constructor, so the search
    grid and the refit cannot drift apart. Constructor arguments that were set
    on ``prototype`` itself are not carried over.
    """
    extra_params = {}
    if model_name == "XGBClassifier":
        # Kept from the original refit step.
        # NOTE(review): presumably the number of facies classes - confirm.
        extra_params["num_class"] = 9
    return type(prototype)(**best_params, **extra_params)


def brute_force_modeling(train_data:dict = train_data, test_data:dict = test_data) -> dict:
    """Grid-search every candidate model on every training-data variant and summarise the results.

    For each estimator from ``create_model_object()`` and each data variant
    ("unbalance", "rus", "ros", "smote"), a 10-fold ``GridSearchCV`` is run.
    A fresh estimator is then built with the best parameters, fitted on that
    variant, and scored on both the training variant and the test set.

    Parameters
    ----------
    train_data : dict
        Must provide ``train_data["X_train"][variant]`` (an object exposing
        ``.values``) and ``train_data["y_train"][variant]`` for each variant.
    test_data : dict
        Must provide ``test_data["X_test"]`` (exposing ``.values``) and
        ``test_data["y_test"]``.

    Returns
    -------
    dict
        Keyed by model class name. Each value contains one entry per data
        variant (``model``, ``cv_score``, ``acc_train``, ``adj_acc_train``,
        ``acc_test``, ``adj_acc_test``) plus ``model_highest_accuracy``. When
        at least one variant has test accuracy above 0 it also contains
        ``best_model``, ``best_data`` and ``model_best_score``.
    """
    print("========== Starting Train Model ==========")

    list_of_model = create_model_object()
    modeling_summary = {}

    for model in list_of_model:
        print(f"-----Start Training model for {model['model_name']}-----")
        # create variable for storing information about this model
        model_summary = {}
        model_summary['model_highest_accuracy']=0 

        for data_type in ["unbalance","rus","ros","smote"]:
            # Experiment by GridSearch CV
            print(f"Training Model {model['model_name']} for {data_type} data...")

            # Look the training variant up once; it is reused for search, refit and scoring.
            X_train = train_data["X_train"][data_type].values
            y_train = train_data["y_train"][data_type]

            model_cv = GridSearchCV(estimator = model['model_object'], param_grid=model_hyperparameter(model['model_name']), cv=10)
            model_cv.fit(X = X_train, y=y_train)

            # Create the model with best params and fit it to the train data
            model_train = _build_tuned_estimator(model_name=model['model_name'],
                                                 prototype=model['model_object'],
                                                 best_params=model_cv.best_params_)
            model_train.fit(X=X_train, y=y_train)

            # Predict output using train data
            y_train_pred  = model_train.predict(X_train)

            # create confusion matrix for training data
            conf_train = confusion_matrix(y_true=y_train,
                                        y_pred=y_train_pred)

            # Predict output using test data
            y_test_pred = model_train.predict(test_data['X_test'].values)

            # create confusion matrix for testing data
            conf_test = confusion_matrix(y_true=test_data['y_test'], 
                                        y_pred=y_test_pred)

            # summarize the modeling
            model_summary[data_type] = {"model":model_train, 
                                    "cv_score":model_cv.best_score_,
                                    "acc_train":util.accuracy(conf_train), 
                                    "adj_acc_train":util.accuracy_adjacent(conf_train),
                                    "acc_test": util.accuracy(conf_test),
                                    "adj_acc_test":util.accuracy_adjacent(conf_test)}

            # get the best model from accuracy value
            # NOTE(review): the winner is chosen by test-set accuracy, so the
            # reported acc_test of the selected model is optimistically biased.
            if model_summary[data_type]["acc_test"] > model_summary['model_highest_accuracy']:
                model_summary['model_highest_accuracy'] = model_summary[data_type]["acc_test"] 
                model_summary['best_model'] = model_summary[data_type]['model']
                model_summary['best_data'] = data_type
                model_summary['model_best_score'] = {"cv_score" : model_summary[data_type]["cv_score"],
                                                     "acc_test" : model_summary[data_type]["acc_test"],
                                                     "adj_acc_test" : model_summary[data_type]["adj_acc_test"]}

        # put modeling information into a dictionary
        print(f"-----End Training model for {model['model_name']}-----")
        modeling_summary[model['model_name']] = model_summary
    print("========== Ending Train Model ==========")     
    return modeling_summary

In [64]:
def train_model(train_data:dict = train_data, test_data:dict = test_data, retrain_model:bool=False) -> dict:
    """Return the modeling summary, loading it from disk when a saved copy can be reused.

    Parameters
    ----------
    train_data : dict
        Training bundle passed through to ``brute_force_modeling``.
    test_data : dict
        Test bundle passed through to ``brute_force_modeling``.
    retrain_model : bool, default False
        When True, always rerun the full search even if a saved summary exists.

    Returns
    -------
    dict
        The modeling summary, either loaded from ``config['modeling_summary']``
        or freshly produced by ``brute_force_modeling``.
    """
    summary_path = config['modeling_summary']

    # Reuse the saved summary only when retraining was not requested and the
    # file that is about to be read actually exists. The guard checks the
    # summary file itself rather than the production-model pickle.
    if not retrain_model and os.path.exists(summary_path):
        return util.load_pickle(summary_path)

    return brute_force_modeling(train_data=train_data, test_data=test_data)

In [63]:
# Obtain the modeling summary; retrain_model=False allows a previously saved summary to be reused.
modeling_summary = train_model(train_data = train_data, test_data = test_data, retrain_model = False)

In [62]:
# Inspect the per-model, per-data-variant scores.
modeling_summary

{'KNeighborsClassifier': {'model_highest_accuracy': 0.6879518072289157,
  'unbalance': {'model': KNeighborsClassifier(algorithm='ball_tree', leaf_size=2),
   'cv_score': 0.646873293779347,
   'acc_train': 0.7824645977704128,
   'adj_acc_train': 0.9427538415185297,
   'acc_test': 0.6879518072289157,
   'adj_acc_test': 0.9180722891566265},
  'best_model': KNeighborsClassifier(algorithm='ball_tree', leaf_size=2),
  'best_data': 'unbalance',
  'model_best_score': {'cv_score': 0.646873293779347,
   'acc_test': 0.6879518072289157,
   'adj_acc_test': 0.9180722891566265},
  'rus': {'model': KNeighborsClassifier(algorithm='ball_tree', leaf_size=2),
   'cv_score': 0.6204135119394291,
   'acc_train': 0.7453294001966568,
   'adj_acc_train': 0.904621435594887,
   'acc_test': 0.4903614457831325,
   'adj_acc_test': 0.8734939759036144},
  'ros': {'model': KNeighborsClassifier(algorithm='ball_tree', leaf_size=2, n_neighbors=2),
   'cv_score': 0.8577259577145954,
   'acc_train': 0.9491725768321513,
   '

In [41]:
# Save the summary to the configured path so later runs can load it.
util.dump_pickle(data=modeling_summary, file_path=config['modeling_summary'])

In [46]:
def pick_best_model(modeling_summary:dict):
    """Select the estimator with the highest recorded accuracy across all models.

    Parameters
    ----------
    modeling_summary : dict
        Maps a model name to a dict holding ``'model_highest_accuracy'`` and,
        for entries that can win, ``'best_model'``.

    Returns
    -------
    object
        The ``'best_model'`` of the first entry with the strictly highest
        ``'model_highest_accuracy'``. Returns an empty list when the summary
        is empty or no entry scores above 0.
    """
    top_accuracy, winner = 0, []

    for summary in modeling_summary.values():
        candidate_accuracy = summary['model_highest_accuracy']
        if candidate_accuracy <= top_accuracy:
            # Not strictly better than the current leader, so earlier entries win ties.
            continue
        top_accuracy = candidate_accuracy
        winner = summary['best_model']

    return winner

In [51]:
# Choose the estimator with the highest recorded accuracy among all candidate models.
production_model = pick_best_model(modeling_summary=modeling_summary)

In [52]:
# Show which estimator was selected.
production_model

In [53]:
# Save the selected estimator to the configured production-model path.
util.dump_pickle(data=production_model, file_path=config['production_model'])

In [56]:
# Confirm the production model was written. The path comes from the config
# rather than a hard-coded literal, so it matches where the model was dumped.
path = config['production_model']
print(os.path.exists(path))

True