In [2]:
# ! pip install snowflake-connector-python
# ! pip install snowflake-sqlalchemy
# ! pip install xgboost

In [1]:
# importing the required libraries
import pandas as pd
import numpy as np

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

import sqlalchemy
import snowflake.connector
from sqlalchemy import create_engine, text
from snowflake.sqlalchemy import *

from datetime import datetime, timedelta
import time
import pytz
tz_ny = pytz.timezone("Asia/Kolkata")

from utils import preprocess_data

import warnings
warnings.filterwarnings('ignore')

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
# setting up the snowflake credits
ACCOUNT = "qq49985.ap-southeast-1"
USERNAME = "prxdyu"
PASSWORD = "Gala2471xy"

### Retraining 

#### Steps
1. Get the complete data from the database except last 2 weeks data
2. Process and select the best features using the select_features function
3. Build the new model using data from step 1
4. Compare the performance of the existing model and new model on the last 2 weeks data
5. Pick the best one if new model performs best then replace the existing deployed model file with the new model file 

In [11]:
# %%writefile utils.py -a




"""MODEL RETRAINING UTILITY FUNCTIONS"""

# Orders the features in the dataset, if unknown new features are added then it assings 0 for all values in that feature"""
def validate_features(df,features_list):
    
    test = pd.DataFrame()
    for col in features_list:
        if col in df.columns.tolist():
            test[col]=df[col]
        else:
            test[col]=0
    return test

Appending to utils.py


In [12]:
# %%writefile utils.py -a


# Selects the best features for model building
def select_features(df):
    
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    import xgboost as xgb

    
    # splitting the target and input features
    x = df.drop(columns=['LOS'])
    y = df[['LOS']]

    # train test splitting
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,stratify=y)
    

    # 1. DECISION TREE
    
    dtree = DecisionTreeRegressor()
    # fitting the tree
    dtree.fit(x_train,y_train)
    # Checking the feature importance
    zipper = dict(zip(x_train.columns,dtree.feature_importances_))
    # making it as a dataframe
    feature_importances= pd.DataFrame.from_dict(zipper,orient='index').reset_index().rename(columns={"index":"feature",0:"importance"}).sort_values(by='importance',ascending=False)
    # taking the features which have importance greater than threshold 0.01
    dtree_features = feature_importances[feature_importances['importance']>0.01]['feature'].values.tolist()
    
    # 2. XGBOOST
    
    xgb_ = xgb.XGBRegressor()
    xgb_.fit(x_train,y_train)
    xgb_.score(x_train,y_train)
    # Checking the feature importance
    zipper = dict(zip(x_train.columns,xgb_.feature_importances_))
    # making it as a dataframe
    xgb_feature_importances= pd.DataFrame.from_dict(zipper,orient='index').reset_index().rename(columns={"index":"feature",0:"importance"}).sort_values(by='importance',ascending=False)
    # taking the features which have importance greater than threshold 0.01
    xgb_features = feature_importances[xgb_feature_importances['importance']>0.01]['feature'].values.tolist()
    
    
    
    # Joining the features from both dtree and xgboost
    final_features = list(set(dtree_features).union(set(xgb_features)))
    

    # exporting the list of final features for future predictions
    with open('retraining_artifacts/retrained_final_features.pkl', 'wb') as f:
        pickle.dump(final_features, f)
        
    return final_features

Appending to utils.py


In [13]:
# %%writefile utils.py -a


# defining a function to create query to fetch the data for retraining
def get_retraining_query(max_date):
    
    # defining the query (taking old training data from HEALTH_DATA table and new data from PREDICTION_LOGGING table )
    query =f"""

            WITH TRAIN_BASE AS (

                SELECT CASE_ID,
                       COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
                       COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
                       COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
                       COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
                       COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
                       COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
                       COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
                       COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
                       COALESCE(BED_GRADE,0) AS BED_GRADE,
                       PATIENTID,
                       COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
                       COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
                       COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
                       COALESCE(VISITORS_WITH_PATIENT,0) AS VISITORS_WITH_PATIENT,
                       COALESCE(AGE,'None') AS AGE,
                       COALESCE(ADMISSION_DEPOSIT,0) AS ADMISSION_DEPOSIT,
                       ADMISSION_DATE,
                       DISCHARGE_DATE

                FROM HEALTH_DB.PUBLIC.HEALTH_DATA

            ),

            TRAIN_BASE_WITH_FEATURES AS (

                SELECT *,
                        MONTHNAME(ADMISSION_DATE) AS ADMISSION_MONTH,
                        DAYNAME(ADMISSION_DATE) AS ADMISSION_DAY,    
                        CONCAT(TYPE_OF_ADMISSION,'-',SEVERITY_OF_ILLNESS) AS ADMISSION_ILLNESS,
                        CONCAT(SEVERITY_OF_ILLNESS,'-',BED_GRADE) AS ILLNESS_BEDGRADE,
                        CONCAT(DEPARTMENT,'-',SEVERITY_OF_ILLNESS) AS DEPARTMENT_ILLNESS,
                        DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE) AS LOS
                FROM TRAIN_BASE 

            ),
            
            NEW_DATA_WITH_FEATURES AS (
            
                SELECT CASE_ID,
                       COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
                       COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
                       COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
                       COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
                       COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
                       COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
                       COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
                       COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
                       COALESCE(BED_GRADE,0) AS BED_GRADE,
                       PATIENTID,
                       COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
                       COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
                       COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
                       COALESCE(VISITORS_WITH_PATIENT,0) AS VISITORS_WITH_PATIENT,
                       COALESCE(AGE,'None') AS AGE,
                       COALESCE(ADMISSION_DEPOSIT,0) AS ADMISSION_DEPOSIT,
                       ADMISSION_DATE,
                       DISCHARGE_DATE,
                       ADMISSION_MONTH,
                       ADMISSION_DAY,
                       ADMISSION_ILLNESS,
                       ILLNESS_BEDGRADE,
                       DEPARTMENT_ILLNESS,
                       LOS
                FROM HEALTH_DB.PUBLIC.PREDICTION_LOGGING
                WHERE ADMISSION_DATE<='{max_date}'
                
            
            )
            
            SELECT * FROM TRAIN_BASE_WITH_FEATURES
            UNION ALL
            SELECT * FROM NEW_DATA_WITH_FEATURES
           
            """
    return query

Appending to utils.py


In [14]:
# %%writefile utils.py -a


# defining the function for retraining
def retrain_model(cut_off_date):
    
    # Creating the connection engine (way 1)
    engine = create_engine(URL(
            account=ACCOUNT,
            user= USERNAME,
            password= PASSWORD,
            role="ACCOUNTADMIN",
            warehouse="COMPUTE_WH",
            database="HEALTH_DB",
            schema="PUBLIC"
        ))
    
    # getting the query 
    query = get_retraining_query(cut_off_date)
    
    # connecting to the engine
    with engine.connect() as conn:
        result = conn.execute(text(query))
        data = pd.DataFrame(result.fetchall())
        data.columns = result.keys()
        data.columns = [col.upper() for col in data.columns.tolist()]
    
    print("Successfully fetched data from snowflake\n")
    print("Shape of fetched data is ",data.shape)
    
    # defining the max and min dates for data splitting 
    max_date = data['ADMISSION_DATE'].max()
    min_date = max_date-timedelta(days=7)
    
    # splitting the data into train and test
    d_train = data[data['ADMISSION_DATE']<=min_date]
    d_test = data[(data['ADMISSION_DATE']>min_date) & (data['ADMISSION_DATE']<=max_date)]
    
    print("Train, Test split done\n")
        
    # applying the preprocess steps to both train and test
    df_train = preprocess_data(d_train)
    df_test = preprocess_data(d_test)
    
    print("Preprocessing of Train and test data is done\n")
    
    
    # selecting features
    final_features = select_features(df_train)
    df_test_processed = validate_features(df_test,final_features)
    
    print("Feature selection executed\n")

    
    # Model building
    import xgboost as xgb
    from sklearn.metrics import root_mean_squared_error,mean_absolute_error
    
    xgb_ = xgb.XGBRegressor()
    xgb_.fit(df_train[final_features],df_train['LOS'])
    
    # getting the predictions for the test data (last 1 week's data)
    y_test_pred = np.ceil(xgb_.predict(df_test_processed))
    
    # computing the performance metrics
    rmse = root_mean_squared_error(y_test_pred,df_test['LOS'])
    mae = mean_absolute_error(y_test_pred,df_test['LOS'])
    print("Test performance of new retrained model \n")
    print(f"RMSE is {rmse}")
    print(f"MAE is {mae}\n\n")
    
    # storinig the performance metrics for future use
    retrained_model_metrics = dict()
    retrained_model_metrics['RMSE']=rmse
    retrained_model_metrics['MAE']=mae
    import pickle
    with open('retraining_artifacts/retrained_model_metrics.pkl', 'wb') as f:
        pickle.dump(retrained_model_metrics, f)
    
    # saving the model
    booster = xgb_.get_booster()
    booster.save_model('retraining_artifacts/xgb_retrained.model')
    print("Successfully saved the retrained model\n")
    
    # loading the old model
    old_model = xgb.XGBRegressor()
    old_model.load_model('artifacts/xgb.model')
    print("Sucessfully loaded old model")
    
    # loading the selected features for our old model
    with open('artifacts/final_features.pkl','rb') as f:
        final_features_old = pickle.load(f)
    
    # getting predictions the test data (last 1 week's data) from our old model
    df_test_processed_old = validate_features(df_test,final_features_old)
    y_test_pred_old = np.ceil(old_model.predict(df_test_processed_old))
    
    # computing the performance metrics
    rmse_old = root_mean_squared_error(y_test_pred_old,df_test['LOS'])
    mae_old = mean_absolute_error(y_test_pred_old,df_test['LOS'])
    print("Test performance of old existing model")
    print(f"RMSE is {rmse_old}")
    print(f"MAE is {mae_old}")
    
    # storinig the performance metrics for future use
    old_model_metrics = dict()
    old_model_metrics['RMSE']=rmse_old
    old_model_metrics['MAE']=mae_old
    
    return retrained_model_metrics,old_model_metrics


Appending to utils.py


In [15]:
# %%writefile utils.py -a

# defining a function which chooses the model to deploy based on the performance metric
def finalize_model(old_model_metrics,new_model_metrics):
    count=0
    
    # checking if the RMSE and MAE of new model metric is lesser than t
    for metric in new_model_metrics.keys():
        if new_model_metrics[metric]<old_model_metrics[metric]:
            count+=1
    
    if count>0:
        return 'New Model '
    else:
        return 'Old Model'

Appending to utils.py


#### Retraining the model

In [7]:
from utils import retrain_model

# retraining a new model and getting the performance metrics 
old_model_metrics,new_model_metrics = retrain_model('2022-12-02')

Successfully fetched data from snowflake

Shape of fetched data is  (243790, 25)
Train, Test split done

Preprocessing of Train and test data is done

Feature selection executed

Test performance of new retrained model 

RMSE is 11.973041923299403
MAE is 8.614589530041641


Successfully saved the retrained model

Sucessfully loaded old model
Test performance of old existing model
RMSE is 11.859149460941214
MAE is 8.501115407495538


In [9]:
# finalizing which model to deploy
finalize_model(old_model_metrics,new_model_metrics)

'New Model '

In [16]:
%%writefile utils.py -a


# defining the function to deploy the model
def deploy_model(selector="Old Model"):
    
    if selector!="Old Model":
        
        # LOADING THE OLD MODEL ARTIFACTS
        with open('artifacts/final_features.pkl','rb') as f:
            old_model_features = pickle.load(f)
        with open('artifacts/model_ref_metrics.pkl','rb') as f:
            old_model_metrics = pickle.load(f)
        with open('artifacts/xgb.model','rb') as f:
            old_model = pickle.load(f)
        
        
        # MOVING THE OLD MODEL ARTIFACTS TO THE ARCHIVE FOLDER
        with open('archive/old_model_features.pkl','wb') as f:
            pickle.dump(old_model_features,f)
        with open('archive/old_model_metrics.pkl','wb') as f:
            pickle.dump(old_model_metrics,f)
        with open('archive/old_model.model','wb') as f:
            pickle.dump(old_model,f)
            
        # LOADING THE NEW MODEL
        with open('retraining_artifacts/retrained_final_features.pkl','rb') as f:
            new_model_features = pickle.load(f)
        with open('retraining_artifacts/retrained_model_metrics.pkl','rb') as f:
            new_model_metrics = pickle.load(f)
        with open('retraining_artifacts/xgb_retrained.model','rb') as f:
            new_model = pickle.load(f)
            
        
        # REPLACING THE OLD MODEL WITH THE NEW RETRAINED MODEL
        with open('artifacts/final_features.pkl','wb') as f:
            pickle.dump(new_model_features,f)
        with open('artifacts/model_ref_metrics.pkl','wb') as f:
            pickle.dump(new_model_metrics,f)
        with open('artifacts/xgb.model','wb') as f:
            pickle.dump(new_model,f)
            
        print("Deployment New Model Successfully")
        
    else:
        print("Keeping the same Model")
    
    return "Deployment Succesful"
    
            
    

Appending to utils.py
