## Model Development

In [45]:
# %%capture
# %pip install -r ../../requirements.txt

#### Import Packages

In [46]:
import os
import sys
from datetime import datetime
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import cross_val_score, StratifiedKFold
# from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import metrics

# Import local libraries
cwd = Path.cwd()  # Current directory
sys.path.append(str(cwd.parents[0]))  # This allows you to import from src
from feature_selection_function import hyperparameter_tuning
from definitions import LocationConfig, PROD_MODEL
lc = LocationConfig()

from model_assessment_functions import (
    corresponding_data,
    preprocess_data,
    conf_matrix,
    prt_values,
    model_tune_category
)

# Filter out the specific warning messages you want to hide
warnings.filterwarnings("ignore", message="lbfgs failed to converge", category=UserWarning)
warnings.filterwarnings("ignore", message="Increase the number of iterations", category=UserWarning)

### Read data for model development from source

In [47]:
%%time
# Load the feature-transformation output; it is split into train/test further down
df_transformed = pd.read_csv(filepath_or_buffer=lc.feature_transformation_output_table)

CPU times: total: 0 ns
Wall time: 14 ms


In [48]:
 # Read the feature list from the table
df_features = pd.read_csv(filepath_or_buffer=lc.selected_features_table_name)
# Keep only the 'Feature' column, as a plain Python list
features_list = [feature_name for feature_name in df_features['Feature']]

### Preprocess the data

In [49]:
# Name of the label column to predict
target_column = 'price_range'

# Get the train and test feature frames / label vectors plus the feature list
# returned by preprocess_data (note: `feature_list` here, `features_list` is the input)
(X_train, X_test, y_train, y_test, feature_list) = preprocess_data(
    df_transformed,
    features_list,
    target_column,
)

Number of features :  15


In [50]:
# Export the test split (features + label) to CSV.
# Work on a copy: `test = X_test` would only alias the frame, so adding the
# label column would also add it to X_test, which is later passed to
# pipeline.predict / predict_proba as the feature matrix.
test = X_test.copy()
test[target_column] = y_test
test.to_csv("test.csv", index=False)

In [51]:
# Export the train split (features + label) to CSV.
# Work on a copy: `train = X_train` would only alias the frame, so adding the
# label column would also add it to X_train, and every model fitted afterwards
# would see the target as an input feature (label leakage).
train = X_train.copy()
train[target_column] = y_train
train.to_csv("train.csv", index=False)

In [52]:
# %%time
# selected_model = "XGBClassifier"

# # Using sample data to find the best hyperparameters for model training
# n_samples = min(1000, len(X_train))
# X_train_sampled = X_train.sample(n=n_samples, random_state=1)
# y_train_sampled = y_train[X_train_sampled.index]
# best_params = hyperparameter_tuning(selected_model, X_train_sampled, y_train_sampled)

### Model Build

In [36]:
# Best parameters from the hyperparameter tuning
# best_params = 10

In [37]:
%%time
# Train a LightGBM classifier (fixed seed, otherwise default hyperparameters).
# fit() hands back the fitted estimator, so construction and training are chained.
# `pipeline` is rebound by the later model cells; whichever ran last is evaluated.
pipeline = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)

# Persist the fitted model with joblib.
# NOTE(review): this is an absolute path on one machine and it rebinds the
# PROD_MODEL name imported from `definitions` - confirm this is intended.
PROD_MODEL = "C://Users//LivinAlbert//Documents//Explainboard//models//light_gbm.joblib"
joblib.dump(value=pipeline, filename=PROD_MODEL)


[LightGBM] [Info] Number of positive: 697, number of negative: 703
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497857 -> initscore=-0.008571
[LightGBM] [Info] Start training from score -0.008571
CPU times: total: 156 ms
Wall time: 57.5 ms


['C://Users//LivinAlbert//Documents//Explainboard//models//light_gbm.joblib']

In [38]:
%%time
# Train a logistic-regression classifier (fixed seed, otherwise default hyperparameters).
# fit() hands back the fitted estimator, so construction and training are chained.
# `pipeline` is rebound by every model cell; whichever ran last is evaluated.
pipeline = LogisticRegression(random_state=42).fit(X_train, y_train)

# Persist the fitted model with joblib.
# NOTE(review): this is an absolute path on one machine and it rebinds the
# PROD_MODEL name imported from `definitions` - confirm this is intended.
PROD_MODEL = "C://Users//LivinAlbert//Documents//Explainboard//models//logistic.joblib"
joblib.dump(value=pipeline, filename=PROD_MODEL)

CPU times: total: 93.8 ms
Wall time: 27 ms


['C://Users//LivinAlbert//Documents//Explainboard//models//logistic.joblib']

In [39]:
%%time
# Train an XGBoost classifier (fixed seed, otherwise default hyperparameters).
# fit() hands back the fitted estimator, so construction and training are chained.
# This is the last model cell, so on a top-to-bottom run `pipeline` is this model.
pipeline = XGBClassifier(random_state=42).fit(X_train, y_train)

# Persist the fitted model with joblib.
# NOTE(review): this is an absolute path on one machine and it rebinds the
# PROD_MODEL name imported from `definitions` - confirm this is intended.
PROD_MODEL = "C://Users//LivinAlbert//Documents//Explainboard//models//xgboost.joblib"
joblib.dump(value=pipeline, filename=PROD_MODEL)

CPU times: total: 422 ms
Wall time: 81 ms


['C://Users//LivinAlbert//Documents//Explainboard//models//xgboost.joblib']

In [40]:
# Load the Saved model for prediction
# pipeline = joblib.load(PROD_MODEL)

In [43]:
# Hard class predictions for the test and train splits
y_test_pred = pipeline.predict(X_test)
y_train_pred = pipeline.predict(X_train)

# Class-probability scores for the same splits, rounded to 3 decimals
y_scores_test = np.round(pipeline.predict_proba(X_test), 3)
y_scores_train = np.round(pipeline.predict_proba(X_train), 3)


In [44]:
# Compare accuracy on both splits; a large train/test gap points to overfitting.
# Accuracy on the training split
accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Train accuracy:", accuracy)

# Accuracy on the test split (`accuracy` is reused, so it ends up holding this value)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test accuracy:", accuracy)


Train accuracy: 1.0
Test accuracy: 0.505


#### Train Data Model Tuning

In [12]:
%%time
# Get train data prt values from the true labels and the rounded train scores
# (presumably precision / recall / threshold - confirm against prt_values)
df_prt_values_train = prt_values(y_train, y_scores_train)

CPU times: total: 31.2 ms
Wall time: 15 ms


In [18]:
# Candidate score cut-off evaluated on the train split; edit this single value
# to try other thresholds.
TRAIN_SCORE_THRESHOLD = 0.72
train_data_accuracy = model_tune_category(
    df_prt_values_train, threshold=TRAIN_SCORE_THRESHOLD, data_set_type='train'
)
# Take the threshold from the first row by position: `[0]` is a label lookup and
# only works while the returned frame happens to carry an index label 0.
train_threshold = train_data_accuracy['thresholds'].iloc[0].round(4)

train_data_accuracy

Unnamed: 0,precision,recall,thresholds,roc_auc
0,1.0,0.720339,0.73,1.0


#### Test Data Model Tuning

In [19]:
%%time
# Get test data prt values from the true labels and the rounded test scores
# (presumably precision / recall / threshold - confirm against prt_values)
df_prt_values_test = prt_values(y_test, y_scores_test)

CPU times: total: 0 ns
Wall time: 7.56 ms


In [21]:
# Evaluate the test split at the threshold that was derived from the train split
test_data_accuracy = model_tune_category(
    df_prt_values_test, threshold=train_threshold, data_set_type='test'
)
test_data_accuracy

Unnamed: 0,precision,recall,thresholds,roc_auc
0,0.428571,0.041096,0.704,0.517885


#### Model Accuracy Tracking

In [None]:
# Put the train and test metric frames side by side (train columns first).
# NOTE(review): both frames display the same column names, so the combined frame
# has duplicate labels - consider labelling the two halves.
acc_track = pd.concat(objs=[train_data_accuracy, test_data_accuracy], axis="columns")
acc_track

### Feature Importance

In [None]:
# NOTE(review): `feature_importance` is not imported in the visible notebook - add it
# to the imports cell from whichever project module defines it.
# `selected_model` is otherwise only assigned in a commented-out cell, so derive it from
# the fitted estimator; for the XGBoost model this gives "XGBClassifier", the value that
# commented-out cell used. Presumably the helper expects this class-name form - confirm.
selected_model = type(pipeline).__name__
# Store the result under a different name: assigning it to `feature_importance` would
# shadow the helper and make this cell fail when it is run a second time.
feature_importances = feature_importance(pipeline, selected_model)
feature_importances