In [0]:
%pip install xgboost

In [0]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, precision_recall_curve, recall_score, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from pyspark.sql.functions import regexp_replace, col, datediff, lit, expr, date_format, cast, row_number, sum, count, min,max, when, desc, ceil, log
from pyspark.ml.stat import Correlation
from numpy import argmax
from pyspark.sql import Window
from functools import reduce
from operator import concat
import multiprocessing as mp
import time
import math
from datetime import datetime
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation / SettingWithCopy warnings raised by later cells.
warnings.filterwarnings("ignore")

# Handle to the Snowflake connector's JVM-side `Utils` class, reached through
# `sc._jvm` (presumably the SparkContext provided by the runtime — confirm).
# Later cells use it as `sfUtils.runQuery(options, sql)`.
sfUtils = sc._jvm.net.snowflake.spark.snowflake.Utils

In [0]:
%run /Users/mgal254@safeway.com/mg_connectivity

In [0]:
# Rebuild the cleaned store-feedback table from STORE_PI_VALIDATION:
#   * the four *_pi columns have the letters 'O'/'o' replaced with 0,
#   * `upc` is exposed as `upc_nbr`,
#   * `cal_date` is converted to a date after removing the 'Z ' fragment,
#   * `smic_category_id` is brought in by a left join to d1_upc.
# NOTE(review): the join predicate `upc = upc_nbr` is unqualified while the
# select list also aliases `upc as upc_nbr` — confirm it resolves to
# base.upc = u.upc_nbr as intended.
q1 = """
          create or replace table edm_features_prd.scratch_ds.pi_validation_final_mg as (
          select base.*exclude(cal_date,upc,total_pi,primary_shelf_pi,secondary_shelf_pi,backroom_pi),
          regexp_replace(backroom_pi,'O|o',0) as backroom_pi,
          regexp_replace(primary_shelf_pi,'O|o',0) as primary_shelf_pi,
          regexp_replace(secondary_shelf_pi,'O|o',0) as secondary_shelf_pi,
          regexp_replace(total_pi,'O|o',0) as total_pi,
          upc as upc_nbr,to_date(replace(cal_date,'Z ','')) as cal_date, u.smic_category_id
          from EDM_FEATURES_PRD.SCRATCH_DS.STORE_PI_VALIDATION base
          left join edm_views_prd.dw_views.d1_upc u
          on upc = upc_nbr
          )
"""
# `ITDSreadOptions` is not defined in this notebook's visible cells; presumably
# it comes from the `%run` connectivity notebook — confirm.
sfUtils.runQuery(ITDSreadOptions, q1)

# Turn the literal string 'NaN' in each of the four *_pi columns into SQL NULL.
q2 = """UPDATE edm_features_prd.scratch_ds.pi_validation_final_mg 
        SET total_pi = NULLIF(total_pi, 'NaN'),
        primary_shelf_pi = NULLIF(primary_shelf_pi, 'NaN'),
        secondary_shelf_pi = NULLIF(secondary_shelf_pi, 'NaN'),
        backroom_pi = NULLIF(backroom_pi, 'NaN')
     """
sfUtils.runQuery(ITDSreadOptions, q2)

In [0]:
# Cleaned store-feedback table that supplies the dates, stores and categories.
store_fdbk_table = 'edm_features_prd.scratch_ds.pi_validation_final_mg'

cal_date_q = f"""
                select distinct to_date(cal_date) as cal_date, store_id, smic_category_id
                from {store_fdbk_table}
"""
cal_dt_df = read_snowflake(edm_env,cal_date_q,"regular")


def _distinct_values(frame, column_expr):
    """Return the distinct values of one column expression as a Python list."""
    distinct_rows = frame.select(column_expr).distinct().collect()
    return [record[0] for record in distinct_rows]


# Dates and store ids are cast to strings; category ids keep their native type.
cal_dt_list = _distinct_values(cal_dt_df, col("cal_date").cast("string"))
store_list = _distinct_values(cal_dt_df, col("store_id").cast("string"))
cat_list = _distinct_values(cal_dt_df, col("smic_category_id"))

# Comma-separated store ids, ready to drop into a SQL `in (...)` list.
store_list_str = ",".join(store_list)

In [0]:
# Run parameters consumed by later cells:
#   * table_constant is interpolated into the three stored-procedure calls,
#   * cal_dt is interpolated into the s2_pi_ml_features call and is the date
#     the final cell filters on for scoring.
# NOTE(review): '0321' looks like the MMDD of cal_dt — confirm the two values
# are meant to be changed together.
table_constant = '0321'
cal_dt = '2023-03-21'

In [0]:
# Refresh the feature tables by calling the three Snowflake stored procedures
# in order. CALL takes argument *values*, so the string arguments are passed as
# quoted literals: unquoted, `0321` would be read as a number and `2023-03-21`
# as an arithmetic expression, and a trailing `VARCHAR` is a parameter
# declaration that is not valid inside a CALL.
q1 = f"call edm_features_prd.scratch_ds.s1_pi_ml_features('{table_constant}');"
sfUtils.runQuery(ITDSreadOptions,q1)

q2 = f"call edm_features_prd.scratch_ds.s2_pi_ml_features('{cal_dt}', '{table_constant}');"
sfUtils.runQuery(ITDSreadOptions,q2)

q3 = f"call edm_features_prd.scratch_ds.pi_ml_input('{table_constant}');"
sfUtils.runQuery(ITDSreadOptions,q3)

In [0]:
# Load the full model-input table through `read_snowflake`.
# NOTE(review): the table suffix is hard-coded to 0327 and this f-string has no
# placeholders, while table_constant is '0321' — confirm which table is intended.
q = f"""
      select * from edm_features_prd.scratch_ds.pi_ml_model_input_final_0327
"""

df_all = read_snowflake(edm_env, q,"regular")

In [0]:
# Most recent jda_inventory_store row (by LAST_UPDT_TS) for every
# (U_CIC_CODE, U_STORE) pair, limited to the stores in store_list_str.
ps_query= f"""
             select * 
             from (
             select *,ROW_NUMBER() OVER(PARTITION BY U_CIC_CODE,U_STORE order by LAST_UPDT_TS desc) as rn 
             from scmrep.jda_inventory_store where cast(U_STORE as int) in ({store_list_str})
             ) 
             where rn=1
            """
ps_df_temp = pd.read_sql(ps_query,get_connector())

# Move the pandas result into Spark and make the store key an int so it can be
# compared with df_all.store_id.
ps_df = spark.createDataFrame(ps_df_temp)
ps_df = ps_df.withColumn("U_STORE", ps_df["U_STORE"].cast('int'))

# Left join keeps every model-input row and appends only U_PS_QTY.
same_store = df_all["store_id"] == ps_df["U_STORE"]
same_item = df_all["corporate_item_cd"] == ps_df["U_CIC_CODE"]
df_all_ps = (
    df_all
    .join(ps_df, same_store & same_item, how = 'left')
    .select(df_all["*"], ps_df['U_PS_QTY'])
)

In [0]:
# Bring the joined Spark frame into pandas for feature engineering.
df = df_all_ps.toPandas()

# PI-change aggregates: missing values are filled with 0, then forced numeric.
cols = ['TOTAL_PI_CHANGE', 'PI_CHANGES', 'NEG_PI_CHANGES', 'POS_PI_CHANGES',
       'PI_START', 'PI_END', 'NET_PI_CHANGE']
df[cols] = df[cols].fillna(0).apply(pd.to_numeric)

df['U_PS_QTY'] = df['U_PS_QTY'].astype('float')
df['baseline_onhand'] = df['baseline_onhand'].astype('float')

# Binary flags built with vectorised comparisons instead of row-wise
# `apply(axis=1)` + positional `x[0]`/`x[1]` (slow, and positional access on a
# label-indexed Series is deprecated in recent pandas). A comparison involving
# NaN is False, so rows with a missing quantity get 0, as before.
# Note: baseline_ps_oos uses `<=` while oos_alert_ps uses `<` on the same columns.
df['baseline_ps_oos'] = (df['baseline_onhand'] <= df['U_PS_QTY']).astype('int')
df['baseline_oos'] = (df['baseline_onhand'] == 0).astype('int')

# NOTE(review): these ratios are inf/NaN when day_end_on_hand is 0; inf is not
# removed by a later dropna — confirm that is acceptable for pi_change_pct.
df['baseline_diff_pct'] = df['baseline_diff']/df['day_end_on_hand']
df['pi_change_pct'] = df['TOTAL_PI_CHANGE']/df['day_end_on_hand']

df['oos_res_ps'] = (df['feedback_pi'] < df['U_PS_QTY']).astype('int')
df['oos_alert_ps'] = (df['baseline_onhand'] < df['U_PS_QTY']).astype('int')

In [0]:
# ---- Calendar features ------------------------------------------------------
df['txn_dte'] = pd.to_datetime(df['txn_dte'])
# ISO week number; `Series.dt.weekofyear` no longer exists in pandas 2.x.
df['week_of_year'] = df['txn_dte'].dt.isocalendar().week
df['txn_dt_str'] = df['txn_dte'].dt.strftime('%Y-%m-%d')

# ---- Shuffle and keep complete rows only ------------------------------------
# Seeded so a Restart & Run All reproduces the same row order.
df = df.sample(frac=1, random_state=26).reset_index(drop=True)

model_cols = ['sold_more_than_shipped','baseline_ps_oos','n_sub_sale_pct','oos_res','oos_alert',
              'oos_alert_ps','oos_res_ps','n_dst_sale_pct','n_cat_sale_pct','n_cons_zero_sale_days',
              'item_velocity','oos_final','pi_change_pct','week_of_year','dayofwk','txn_dt_str','feedback_pi']
# Independent copy, so the dtype assignments below never write into a view of df.
df_sub = df[model_cols].dropna().copy()
print(df_sub.shape)
# Keep df and df_sub on the same index labels; every later lookup is label-based.
df = df.loc[df_sub.index]

int_like_cols = ['sold_more_than_shipped','baseline_ps_oos','oos_alert','n_cons_zero_sale_days','oos_final','week_of_year']
float_cols = ['n_sub_sale_pct','n_dst_sale_pct','n_cat_sale_pct','item_velocity','pi_change_pct']
df_sub[int_like_cols] = df_sub[int_like_cols].apply(pd.to_numeric)
df_sub[float_cols] = df_sub[float_cols].astype('float')

# ---- Time-based split -------------------------------------------------------
# txn_dt_str is zero-padded 'YYYY-MM-DD', so string comparison orders by date.
train_end = '2023-03-08'
test_end = '2023-03-21'
df_train = df_sub.loc[df_sub.txn_dt_str <= train_end]
# feedback_pi is already non-null here because df_sub went through dropna().
df_test = df_sub.loc[(df_sub.txn_dt_str > train_end) & (df_sub.txn_dt_str < test_end)]

# Current feature set. A wider set (adding pi_change_pct and week_of_year) and
# an oos_alert-only variant were tried earlier.
feature_cols = ['sold_more_than_shipped','baseline_ps_oos','n_sub_sale_pct','n_dst_sale_pct']
X_train = df_train[feature_cols]
y_train = df_train['oos_res'].astype('int')
X_test = df_test[feature_cols]
y_test = df_test['oos_res'].astype('int')
test_index = X_test.index

# ---- Fit --------------------------------------------------------------------
clf = LogisticRegression(max_iter = 1500, random_state = 26, class_weight = 'balanced')
clf.fit(X_train, y_train)

# ---- Pick the probability threshold that maximises F1 on the test window ----
y_probs = clf.predict_proba(X_test)[:, 1]
pr_precision, pr_recall, thresholds = precision_recall_curve(y_test, y_probs)
pr_sum = pr_precision + pr_recall
# Where precision + recall is 0 the numerator is 0 too; adding 1 to the
# denominator there yields F1 = 0 instead of NaN (argmax would pick a NaN).
f1score = (2 * pr_precision * pr_recall) / (pr_sum + (pr_sum == 0))
# The curve has one more point than there are thresholds; the last point has no
# threshold, so it is excluded from the search.
ix = argmax(f1score[:-1])

y_pred = (y_probs >= thresholds[ix]).astype('int')

# ---- Attach predictions to the full-width rows ------------------------------
df_pred_prob = pd.DataFrame({'pred_prob': y_probs}, index=test_index)
df_y_test = y_test.to_frame()
# Label-based lookup: df was filtered above, so its positions no longer match
# the index labels carried by test_index.
df_final = pd.concat([df.loc[test_index], df_pred_prob], axis = 1)
df_final['oos_pred'] = (df_final['pred_prob'] >= thresholds[ix]).astype('int')

# ---- Metrics at the chosen threshold ----------------------------------------
score = roc_auc_score(y_test, y_probs)
acc_score = accuracy_score(y_test,df_final.oos_pred)
precision = precision_score(y_test,df_final.oos_pred)
recall = recall_score(y_test,df_final.oos_pred)
precision_test = precision_score(y_test,y_pred)
f1_scr = f1_score(y_test,df_final.oos_pred)
print("f1 score ", f1_scr)
print("precision", precision)
print("recall", recall)
print("accuracy", acc_score)
print("threshold",thresholds[ix])


In [0]:
# Score the rows dated cal_dt with the classifier (`clf`) and the threshold
# (`thresholds[ix]`) produced by the training cell.
# The filter compares against the string itself: `{cal_dt}` outside an f-string
# is a one-element set, and comparing the column to it matches no rows.
df_test = df_sub.loc[df_sub.txn_dt_str == cal_dt]
if df_test.empty:
    # predict_proba cannot run on zero rows; fail with a message that says why.
    raise ValueError(f"no rows in df_sub with txn_dt_str == {cal_dt!r}; nothing to score")
X_test = df_test[['sold_more_than_shipped','baseline_ps_oos','n_sub_sale_pct','n_dst_sale_pct']]
test_index = X_test.index

# Predicted probability of the positive class, then the binary call.
y_probs = clf.predict_proba(X_test)[:, 1]
y_pred = (y_probs >= thresholds[ix]).astype('int')

# df_test already holds exactly the scored rows, so it is used as-is; selecting
# with `.iloc[test_index]` would treat index labels as positions.
df_pred_prob = pd.DataFrame({'pred_prob': y_probs}, index=test_index)
df_final = pd.concat([df_test, df_pred_prob], axis = 1)
df_final['oos_pred'] = (df_final['pred_prob'] >= thresholds[ix]).astype('int')

# Rows predicted out-of-stock.
df_output = df_final.loc[df_final.oos_pred == 1]