In [1]:
import pandas as pd
import numpy as np
import copy

import src.utils as utils

## Load config

In [2]:
# Load the project configuration (file paths, seed, test size, ...) used by the cells below
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_column': 'product_id',
 'seed': 42,
 'test_size': 0.2,
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'train_clean_path': ['data/output/X_train_clean.pkl',
  'data/output/y_train_clean.pkl'],
 'valid_clean_path': ['data/output/X_valid_clean.pkl',
  'data/output/y_valid_clean.pkl'],
 'test_clean_path': ['data/output/X_test_clean.pkl',
  'data/output/y_test_clean.pkl'],
 'list_of_model_path': 'log/list_of_model.pkl',
 'list_of_param_path': 'log/list_of_param.pkl',
 'list_of_tuned_mod

## Build Models

List of models
- KNN
- Logistic Regression
- Random Forest
- XGBoost

In [3]:
# Peek at the second 'test_clean_path' entry (the y_test_clean pickle, per the config shown above)
test = utils.pickle_load(CONFIG_DATA['test_clean_path'][1])
test.head()

3204    102
2094    102
2698    103
6071    105
346     102
Name: product_id, dtype: int64

In [3]:
# define param
def create_model_param():
    """Build the hyperparameter search space for each candidate model.

    Returns
    -------
    dict
        Maps an estimator class name (e.g. ``'RandomForestClassifier'``) to a
        dict of ``{hyperparameter name: list of candidate values}``. The keys
        are class names so a search space can be looked up from an estimator's
        ``__class__.__name__``.
    """
    knn_params = {
        'n_neighbors': [50, 100, 200],
    }

    rf_params = {
        'n_estimators': list(range(50, 151, 30)),  # 50, 80, 110, 140
        'min_samples_split': [2, 4, 6, 8],
        'criterion': ['gini', 'entropy', 'log_loss'],
    }

    lgr_params = {
        # 'penalty' is intentionally not searched; the estimator's own setting is used
        # 'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1],
        'max_iter': [100, 300, 500],
    }

    xgb_params = {
        'n_estimators': [5, 10, 25, 50],
    }

    gbc_params = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
    }

    lgbm_params = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 63, 127],
        'min_child_samples': [20, 40, 60],
    }

    # Key each search space by the estimator's class name
    list_of_param = {
        'KNeighborsClassifier': knn_params,
        'RandomForestClassifier': rf_params,
        'LogisticRegression': lgr_params,
        'XGBClassifier': xgb_params,
        'GradientBoostingClassifier': gbc_params,
        'LGBMClassifier': lgbm_params,
    }

    return list_of_param


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
def create_model_object():
    """Instantiate the candidate estimators with their default settings.

    Returns
    -------
    list of dict
        One entry per estimator, holding the estimator's class name under
        ``'model_name'`` and the newly constructed instance under
        ``'model_object'``.
    """
    print("Creating model objects")

    # Default-configured estimators; the tuple order is the order of the result
    estimators = (RandomForestClassifier(), XGBClassifier())

    return [
        {'model_name': type(estimator).__name__, 'model_object': estimator}
        for estimator in estimators
    ]


Cross validation

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [7]:
# Build the search spaces and base estimators at top level so they can be inspected below;
# train_model() calls these two factories again for its own use.
list_of_param = create_model_param()
list_of_model = create_model_object()

Creating model objects


In [8]:
# Display the hyperparameter search spaces, keyed by estimator class name
list_of_param

{'KNeighborsClassifier': {'n_neighbors': [50, 100, 200]},
 'RandomForestClassifier': {'n_estimators': [50, 80, 110, 140],
  'min_samples_split': [2, 4, 6, 8],
  'criterion': ['gini', 'entropy', 'log_loss']},
 'LogisticRegression': {'C': [0.01, 0.1], 'max_iter': [100, 300, 500]},
 'XGBClassifier': {'n_estimators': [5, 10, 25, 50]},
 'GradientBoostingClassifier': {'n_estimators': [50, 100, 150],
  'learning_rate': [0.01, 0.1, 0.2],
  'max_depth': [3, 5, 7]},
 'LGBMClassifier': {'n_estimators': [50, 100, 150],
  'learning_rate': [0.01, 0.1, 0.2],
  'num_leaves': [31, 63, 127],
  'min_child_samples': [20, 40, 60]}}

In [9]:
# Display the base estimators.
# NOTE(review): the saved output of this cell also lists KNeighborsClassifier and
# LogisticRegression, but create_model_object() as written builds only
# RandomForestClassifier and XGBClassifier — re-run the notebook to refresh the output.
list_of_model

[{'model_name': 'KNeighborsClassifier',
  'model_object': KNeighborsClassifier()},
 {'model_name': 'RandomForestClassifier',
  'model_object': RandomForestClassifier()},
 {'model_name': 'LogisticRegression',
  'model_object': LogisticRegression(solver='sag')},
 {'model_name': 'XGBClassifier',
  'model_object': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_esti

In [14]:
def train_model(return_file=True):
    """Tune every candidate model with a randomized search and record its AUC.

    For each entry returned by ``create_model_object()``, a
    ``RandomizedSearchCV`` (5 sampled candidates, 5-fold CV) is fitted on the
    training split using the search space that ``create_model_param()``
    provides for that estimator's class name. The fitted search object is then
    evaluated on the train and validation splits with macro-averaged
    one-vs-rest ROC AUC.

    Side effects: pickles the search spaces, the base models and the tuning
    results to the ``list_of_param_path``, ``list_of_model_path`` and
    ``list_of_tuned_model_path`` entries of ``CONFIG_DATA``.

    Parameters
    ----------
    return_file : bool, default True
        When True, return the three pickled objects; otherwise return None.

    Returns
    -------
    tuple or None
        ``(list_of_param, list_of_model, list_of_tuned_model)`` when
        ``return_file`` is True. ``list_of_tuned_model`` maps each model name
        to a dict with keys ``'model'`` (the fitted search object),
        ``'train_auc'``, ``'valid_auc'`` and ``'best_params'``.

    Raises
    ------
    KeyError
        If a model name has no entry in the dict from ``create_model_param()``.
    """
    # Load the cleaned train / validation splits
    X_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][0])
    y_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][1])
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][0])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][1])

    # Search spaces and base estimators
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # Tuning results, keyed by model name
    list_of_tuned_model = {}

    for base_model in list_of_model:
        model_name = base_model['model_name']
        # The search works on its own copy; the object kept in list_of_model
        # (pickled below) stays exactly as it was created.
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        print('Training model :', model_name)

        # NOTE(review): recent xgboost releases expect class labels encoded as
        # 0..n_classes-1; if y_train holds raw product_id values, XGBClassifier
        # may reject them — confirm, and encode the labels upstream if needed.
        model = RandomizedSearchCV(estimator=model_obj,
                                   param_distributions=model_param,
                                   n_iter=5,
                                   cv=5,
                                   random_state=123,
                                   n_jobs=1,
                                   verbose=10,
                                   # The plain 'roc_auc' scorer only handles binary
                                   # targets. 'roc_auc_ovr' is the macro-averaged
                                   # one-vs-rest variant, i.e. the same metric that
                                   # is reported below, so candidates are ranked by
                                   # the score we actually care about.
                                   scoring='roc_auc_ovr')

        model.fit(X_train, y_train)

        # Predict per-class probabilities (one column per class)
        y_pred_proba_train = model.predict_proba(X_train)
        y_pred_proba_valid = model.predict_proba(X_valid)

        # Compute AUC assuming a multi-class target.
        # One-vs-One suits smaller datasets because each pairwise classifier
        # uses less data, whereas One-vs-Rest (a.k.a. One-vs-All) is cheaper to
        # compute for large datasets or many classes.
        train_score = roc_auc_score(y_train, y_pred_proba_train, multi_class="ovr", average='macro')
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid, multi_class="ovr", average='macro')

        list_of_tuned_model[model_name] = {
            'model': model,
            'train_auc': train_score,
            'valid_auc': valid_score,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Persist the search spaces, base models and tuning results
    utils.pickle_dump(list_of_param, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(list_of_model, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(list_of_tuned_model, CONFIG_DATA['list_of_tuned_model_path'])

    if return_file:
        return list_of_param, list_of_model, list_of_tuned_model

In [None]:
# Run the full tuning loop; besides returning them, train_model() pickles these
# three objects to the paths configured in CONFIG_DATA.
list_of_param, list_of_model, list_of_tuned_model = train_model()

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.preprocessing import LabelEncoder

# # Preparing data for modeling
# # Encoding categorical variables
# encoder = LabelEncoder()
# data['category_encoded'] = encoder.fit_transform(data['category'])

# # Selecting features and target
# features = data[['page_views', 'time_spent', 'price', 'ratings', 'category_encoded']]
# target = data['product_id']

# # Splitting the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Initialize and train the RandomForest Classifier
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# # Predicting on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)