In [1]:
import pandas as pd
import numpy as np
import copy

import src.utils as utils

## Load config

In [2]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'category_encoded',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl'],
 'list_of_model_path': 'log/list_of_model.pkl',
 'list_of_param_path': 'log/list_of_param.pkl',
 'list_of_tun

## Buat Model

List of model
- Random Forest
- XGBoost

In [3]:
test = utils.pickle_load(CONFIG_DATA['test_clean_path'][1])
test.head()

231    0
374    3
55     1
381    3
70     3
dtype: int64

In [4]:
# define param
def create_model_param():
    """Create the model objects"""
    # knn_params = {
    #     'n_neighbors': [50, 100, 200],
    # }
    
    rf_params = {
        "n_estimators" : [i for i in range(50, 151, 30)],
        "min_samples_split" : [2, 4, 6, 8],
        "criterion" : ["gini", "entropy", "log_loss"]
    }

    # lgr_params = {
    #     # 'penalty': ['l1', 'l2'],
    #     'C': [0.01, 0.1],
    #     'max_iter': [100, 300, 500]
    # }

    xgb_params = {
        'n_estimators': [5, 10, 25, 50]
    }

    # gbc_params = {
    #     'n_estimators': [50, 100, 150],
    #     'learning_rate': [0.01, 0.1, 0.2],
    #     'max_depth': [3, 5, 7]
    # }
    
    # lgbm_params = {
    #     'n_estimators': [50, 100, 150],
    #     'learning_rate': [0.01, 0.1, 0.2],
    #     'num_leaves': [31, 63, 127],
    #     'min_child_samples': [20, 40, 60]
    # }


    # Create model params
    list_of_param = {
        # 'KNeighborsClassifier': knn_params,
        'RandomForestClassifier': rf_params,
        # 'LogisticRegression': lgr_params,
        'XGBClassifier': xgb_params,
        # 'GradientBoostingClassifier': gbc_params,
        # 'LGBMClassifier': lgbm_params
    }

    return list_of_param


In [5]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [6]:
def create_model_object():
    """Create the model objects"""
    print("Creating model objects")

    # Create model objects
    rf = RandomForestClassifier()
    xgb = XGBClassifier()

    # Create list of model
    list_of_model = [
        {'model_name': rf.__class__.__name__, 'model_object': rf},
        {'model_name': xgb.__class__.__name__, 'model_object': xgb}
    ]

    return list_of_model


Cross validation

In [7]:
from sklearn.model_selection import RandomizedSearchCV

In [8]:
list_of_param = create_model_param()
list_of_model = create_model_object()

Creating model objects


In [9]:
list_of_param

{'RandomForestClassifier': {'n_estimators': [50, 80, 110, 140],
  'min_samples_split': [2, 4, 6, 8],
  'criterion': ['gini', 'entropy', 'log_loss']},
 'XGBClassifier': {'n_estimators': [5, 10, 25, 50]}}

In [10]:
list_of_model

[{'model_name': 'RandomForestClassifier',
  'model_object': RandomForestClassifier()},
 {'model_name': 'XGBClassifier',
  'model_object': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=None, n_jobs=None,
                num_parallel_tree=None, random_state=None, ...)}]

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score

def train_model(return_file=True):
    """Function to get the best model"""
    # Load dataset
    X_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][0])
    y_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][1])
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][0])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][1])
    
    # Create list of params & models
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # List of trained model
    list_of_tuned_model = {}
    
    # Define StratifiedKFold for cross-validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123) 

    # Train model
    for base_model in list_of_model:
        # Current condition
        model_name = base_model['model_name']
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        # Debug message
        print('Training model :', model_name)

        # Create model object
        model = RandomizedSearchCV(estimator = model_obj,
                                   param_distributions = model_param,
                                   n_iter=5,
                                   cv=skf,  # Use StratifiedKFold
                                   random_state = 123,
                                   n_jobs=1,
                                   verbose=10,
                                   scoring = 'accuracy')
        
        # Train model
        model.fit(X_train, y_train)

        # Predict
        # y_pred_proba_train = model.predict_proba(X_train)[:, 1]
        # y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
        
        # Prediksi probabilitas & calculate AUC score
        y_pred_train = model.predict(X_train)
        y_pred_valid = model.predict(X_valid)

        # Menghitung AUC dengan asumsi klasifikasi multi-kelas
        
        ''' 
        One vs One: OvO lebih cocok untuk dataset yang lebih kecil karena lebih sedikit data yang digunakan pada masing-masing klasifikator, 
        sementara One-vs-Rest (OvR) / One-vs-All (OvA) lebih efisien dalam hal waktu komputasi untuk dataset yang besar atau dengan banyak kelas.
        '''
        cm_train = confusion_matrix(y_train, y_pred_train)
        cm_valid = confusion_matrix(y_valid, y_pred_valid)
        acc_train = accuracy_score(y_train, y_pred_train)
        acc_valid = accuracy_score(y_valid, y_pred_valid)

        # Append
        list_of_tuned_model[model_name] = {
            'model': model,
            'train_cm': cm_train,
            'valid_cm': cm_valid,
            'train_accuracy': acc_train,
            'valid_accuracy': acc_valid,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Dump data
    utils.pickle_dump(list_of_param, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(list_of_model, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(list_of_tuned_model, CONFIG_DATA['list_of_tuned_model_path'])

    if return_file:
        return list_of_param, list_of_model, list_of_tuned_model    

In [14]:
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : RandomForestClassifier
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 1/3; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.908 total time=   0.0s
[CV 2/3; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 2/3; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.974 total time=   0.0s
[CV 3/3; 1/5] START criterion=entropy, min_samples_split=2, n_estimators=110....
[CV 3/3; 1/5] END criterion=entropy, min_samples_split=2, n_estimators=110;, score=0.842 total time=   0.0s
[CV 1/3; 2/5] START criterion=gini, min_samples_split=8, n_estimators=80........
[CV 1/3; 2/5] END criterion=gini, min_samples_split=8, n_estimators=80;, score=0.921 total time=   0.0s
[CV 2/3; 2/5] START criterion=gini, min_samples_split=8, n_estimators=80........
[CV 2/3; 2/5] END criterion=gini, min_sample



[CV 2/3; 2/4] END ..............n_estimators=10;, score=0.934 total time=   0.0s
[CV 3/3; 2/4] START n_estimators=10.............................................
[CV 3/3; 2/4] END ..............n_estimators=10;, score=0.908 total time=   0.0s
[CV 1/3; 3/4] START n_estimators=25.............................................
[CV 1/3; 3/4] END ..............n_estimators=25;, score=0.934 total time=   0.0s
[CV 2/3; 3/4] START n_estimators=25.............................................
[CV 2/3; 3/4] END ..............n_estimators=25;, score=0.974 total time=   0.0s
[CV 3/3; 3/4] START n_estimators=25.............................................
[CV 3/3; 3/4] END ..............n_estimators=25;, score=0.895 total time=   0.0s
[CV 1/3; 4/4] START n_estimators=50.............................................
[CV 1/3; 4/4] END ..............n_estimators=50;, score=0.921 total time=   0.0s
[CV 2/3; 4/4] START n_estimators=50.............................................
[CV 2/3; 4/4] END ..........

In [15]:
list_of_tuned_model

{'RandomForestClassifier': {'model': RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=123, shuffle=True),
                     estimator=RandomForestClassifier(), n_iter=5, n_jobs=1,
                     param_distributions={'criterion': ['gini', 'entropy',
                                                        'log_loss'],
                                          'min_samples_split': [2, 4, 6, 8],
                                          'n_estimators': [50, 80, 110, 140]},
                     random_state=123, scoring='accuracy', verbose=10),
  'train_cm': array([[57,  0,  0,  0],
         [ 0, 57,  0,  0],
         [ 0,  1, 55,  1],
         [ 0,  0,  0, 57]], dtype=int64),
  'valid_cm': array([[ 8,  0,  0,  0],
         [ 0, 20,  3,  0],
         [ 0,  0, 30,  1],
         [ 0,  2,  1, 24]], dtype=int64),
  'train_accuracy': 0.9912280701754386,
  'valid_accuracy': 0.9213483146067416,
  'best_params': {'n_estimators': 110,
   'min_samples_split': 6,
   'criterion':

Get best model

In [16]:
def get_best_model(return_file=True):
    """Function to get the best model based on validation accuracy"""
    # Load tuned model
    list_of_tuned_model = utils.pickle_load(CONFIG_DATA['list_of_tuned_model_path'])

    # Get the best model based on validation accuracy
    best_model_name = None
    best_model = None
    best_performance = -1  # Initialize to -1 since accuracy cannot be negative
    best_model_param = None

    for model_name, model_info in list_of_tuned_model.items():
        # Extract validation accuracy
        valid_accuracy = model_info['valid_accuracy']
        
        if valid_accuracy > best_performance:
            best_model_name = model_name
            best_model = model_info['model']
            best_performance = valid_accuracy
            best_model_param = model_info['best_params']

    # Dump the best model
    utils.pickle_dump(best_model, CONFIG_DATA['best_model_path'])

    # Print the summary of the best model
    print('=============================================')
    print('Best model        :', best_model_name)
    print('Validation accuracy:', best_performance)
    print('Best model params :', best_model_param)
    print('=============================================')

    if return_file:
        return best_model

In [17]:
best_model = get_best_model()

Best model        : XGBClassifier
Validation accuracy: 0.9550561797752809
Best model params : {'n_estimators': 25}


## Prediction on test data

In [18]:
X_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][0])
y_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][1])

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict
y_pred = best_model.predict(X_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Optional: Calculate accuracy from the confusion matrix
accuracy = accuracy_score(y_test, y_pred)

# Print the confusion matrix and accuracy
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)

Confusion Matrix:
[[26  0  0  0]
 [ 0 21  0  0]
 [ 1  0 25  0]
 [ 1  0  1 36]]
Accuracy: 0.972972972972973


In [20]:
accuracy

0.972972972972973

## Model inference

In [21]:
y_pred

array([0, 3, 1, 2, 3, 3, 3, 0, 3, 2, 3, 3, 2, 3, 2, 0, 2, 0, 0, 3, 3, 3,
       0, 1, 1, 2, 1, 2, 0, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 0, 3, 0, 1, 0,
       0, 2, 2, 1, 1, 1, 2, 3, 3, 1, 1, 0, 3, 2, 2, 1, 3, 2, 0, 0, 2, 1,
       0, 1, 3, 0, 2, 0, 0, 3, 1, 3, 0, 3, 2, 0, 1, 2, 2, 3, 3, 1, 0, 3,
       0, 3, 3, 1, 3, 3, 3, 3, 0, 0, 1, 0, 0, 1, 3, 3, 2, 0, 0, 3, 2, 3,
       2], dtype=int64)

In [22]:
y_test

231    0
374    3
55     1
381    3
70     3
      ..
11     3
281    3
22     2
375    3
477    2
Name: category_encoded, Length: 111, dtype: int64

In [28]:
comparison_df = pd.DataFrame({
    'user_id': X_test['customer_id'],
    'y_pred': y_pred,
    'y_test': y_test
})

In [29]:
comparison_df

Unnamed: 0,user_id,y_pred,y_test
231,11,0,0
374,6,3,3
55,16,1,1
381,15,2,3
70,13,3,3
...,...,...,...
11,19,0,3
281,9,3,3
22,7,2,2
375,4,3,3


In [None]:
comparison_df

Unnamed: 0,user_id,y_pred,y_test
231,11,0,0
374,6,3,3
55,16,1,1
381,15,2,3
70,13,3,3
...,...,...,...
11,19,0,3
281,9,3,3
22,7,2,2
375,4,3,3


## Note
dari data dummy disimpulkan sebetulnya bisa dijadika 2 use case:
1. Binary classification: Apakah pelanggan tertentu akan melakukan pembelian dalam periode tertentu atau tidak.
2. Multiclass classification: Produk apa yang akan dibeli oleh pelanggan.