In [None]:
# This cell is NOT editable. Overwrite variables on your own discretion.
# Any changes other than the script code will NOT BE SAVED!
# All cells are assumed to be script code cells, unless explicitly tagged as 'o9_ignore'

In [None]:
# Query text for the purchase-order fact input: the "AL PO ... TG" measures at
# Supplier Location x Activity2 x Location x OrderlineID x Item x Version x Day grain.
# The training script below reads only the header creation date, goods receipt
# date, goods receipt quantity and net price per unit measures from this input.
_factTable = "Select ([Supplier].[Supplier Location] * [Activity2].[Activity2] * [Location].[Location] * [Documents].[OrderlineID] * [Item].[Item] * [Version].[Version Name] * [Time].[Day] ) on row,  ({Measure.[AL PO % of Total Goods Receipt TG], Measure.[AL PO % of Total Purchase Value TG], Measure.[AL PO % of Total Unique Goods Purchased TG], Measure.[AL PO Commit Creation Date TG], Measure.[AL PO Commit Delivery Date TG], Measure.[AL PO Commit Delivery Qty TG], Measure.[AL PO Goods Receipt Date TG], Measure.[AL PO Goods Receipt Purchase Value TG], Measure.[AL PO Goods Receipt Quantity TG], Measure.[AL PO Header Creation Date TG], Measure.[AL PO Net Price Per Unit TG], Measure.[AL PO Unique Goods Purchased TG]}) on column;"
# Query text for the item-to-cluster input: the Cluster measure per Version x Item.
_clusterItem = "Select ([Version].[Version Name] * [Item].[Item]) on row,  ({Measure.[Cluster]}) on column;"


# Initialize the O9DataLake with the input parameters and dataframes
# Data can be accessed with O9DataLake.get(<Input Name>)
# Overwritten values will not be reflected in the O9DataLake after initialization

from o9_common_utils.O9DataLake import O9DataLake, ResourceType, DataSource
# Register both queries under the input names "factTable" and "clusterItem".
# NOTE(review): the returned objects are used below like pandas DataFrames
# (column indexing, .iloc, .merge) - confirm against the O9DataLake API.
factTable = O9DataLake.register("factTable",DataSource.LS, ResourceType.IBPL, _factTable)
clusterItem = O9DataLake.register("clusterItem",DataSource.LS, ResourceType.IBPL, _clusterItem)

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from o9helpers import user_storage_path
from o9storage import cloud_storage_utils, storage_utils
import pandas as pd
import logging
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import pickle
import os
import shutil
from sklearn.ensemble import GradientBoostingRegressor
import math

logger = logging.getLogger('o9_logger')
logger.info('===== Pickle Script for ActionButton ML Model Pickle =======')

# Measure columns that are referenced repeatedly during feature preparation.
CREATION_DATE_COL = 'AL PO Header Creation Date TG'
RECEIPT_DATE_COL = 'AL PO Goods Receipt Date TG'

# Month-day stamp applied to every PO creation year for each holiday feature.
# NOTE(review): the same month-day is reused for every year although
# Thanksgiving and Easter move between years - confirm this is intended.
HOLIDAY_MONTH_DAY = {
    "thanksgiving": "11-26",
    "easter": "04-12",
    "usIDay": "07-04",
    "christmasDay": "12-25",
}
list_holidays = list(HOLIDAY_MONTH_DAY)


def _add_holiday_features(frame, date_col, holiday_month_day):
    """Add holiday dates and signed day distances to ``frame`` (mutated in place).

    For every holiday a datetime column named after the holiday is created in
    the year of ``date_col``; afterwards a ``date_diff_<holiday>`` column holds
    ``date_col - holiday`` in whole days (negative before the holiday).

    Parameters:
        frame: DataFrame containing the datetime column ``date_col``.
        date_col: name of the reference date column.
        holiday_month_day: mapping of holiday name -> "MM-DD" string.

    Returns:
        The same ``frame`` object with the additional columns.
    """
    year_text = frame[date_col].dt.year.astype(int).astype(str)
    # All holiday date columns are created first, then all differences, so the
    # resulting column order is holiday dates followed by date_diff_* columns.
    for holiday, month_day in holiday_month_day.items():
        frame[holiday] = pd.to_datetime(year_text + "-" + month_day)
    for holiday in holiday_month_day:
        frame["date_diff_" + holiday] = (frame[date_col] - frame[holiday]).dt.days
    return frame


def _inside_lead_time_band(lead_times, lower_q=0.10, upper_q=0.80):
    """Flag values strictly between the ``lower_q`` and ``upper_q`` quantiles.

    Values equal to either quantile are excluded (strict comparisons).
    """
    return (lead_times < lead_times.quantile(upper_q)) & (lead_times > lead_times.quantile(lower_q))


# Fail with an explicit message instead of an opaque IndexError from .iloc[0].
if len(factTable) == 0:
    raise ValueError("factTable input is empty; cannot train the lead time models")

versionName = factTable['Version.[Version Name]'].iloc[0]

logger.info("***** Version Name *******")
logger.info(versionName)

factTable = factTable[["Supplier.[Supplier Location]", "Activity2.[Activity2]",
                       "Location.[Location]", "Item.[Item]", "Time.[Day]",
                       CREATION_DATE_COL,
                       "AL PO Goods Receipt Quantity TG",
                       RECEIPT_DATE_COL,
                       "AL PO Net Price Per Unit TG",
                       ]]

# Rows missing any of the selected columns are discarded.
factTable = factTable.dropna()

factTable[RECEIPT_DATE_COL] = pd.to_datetime(factTable[RECEIPT_DATE_COL])
factTable[CREATION_DATE_COL] = pd.to_datetime(factTable[CREATION_DATE_COL])
# Target variable: whole days between PO header creation and goods receipt.
factTable['LeadTime'] = (factTable[RECEIPT_DATE_COL] - factTable[CREATION_DATE_COL]).dt.days

# Integer creation year; the string form needed to build the holiday dates is
# derived inside the helper instead of being stored and recomputed later.
factTable['year'] = factTable[CREATION_DATE_COL].dt.year
factTable = _add_holiday_features(factTable, CREATION_DATE_COL, HOLIDAY_MONTH_DAY)

factTable['MonthInt'] = factTable[CREATION_DATE_COL].dt.month
# NOTE(review): 3.14 (not math.pi) is kept on purpose - with it, months such as
# January and November get slightly different values, so switching constants
# would change the trained features. Confirm with the prediction-side script.
factTable['cos_month'] = factTable['MonthInt'].apply(lambda x: math.cos(2*3.14*x/12))

factTable['Supplier.[Supplier]'] = factTable['Supplier.[Supplier Location]']

# Default inner merge: rows whose item is absent from clusterItem are dropped.
factTable = factTable.merge(clusterItem, on='Item.[Item]')

factTable = factTable.drop(['Supplier.[Supplier Location]'], axis=1)
factTable = factTable.drop([CREATION_DATE_COL, RECEIPT_DATE_COL], axis=1)

factTable['Time.[Day]'] = pd.to_datetime(factTable['Time.[Day]'])
factTable['Month'] = factTable['Time.[Day]'].dt.strftime("%b")
factTable = factTable.drop(["Time.[Day]"], axis=1)

factTable1 = factTable.copy()
logger.info("***** FactTable ****")
logger.info(factTable1)

# Per cluster, keep only lead times strictly between the 10% and 80% quantiles.
# Rows with a missing Cluster are not part of any group and are dropped as well.
factTable1 = factTable1[
    factTable1.groupby("Cluster").LeadTime.transform(_inside_lead_time_band).eq(1)
]

factTable1 = factTable1.rename(columns={'AL PO Net Price Per Unit TG': 'Price',
                                        'AL PO Goods Receipt Quantity TG': 'Quantity'})


X = factTable1.copy()

####################### QUANTILE LEAD TIME MODELS #######################
# One GradientBoostingRegressor (quantile loss) is trained per percentile, once
# without and once with the 'Price' feature. Every fitted pipeline is pickled,
# pushed to storage, pulled back and re-scored as a round-trip check.

percentileList = [50,75,95]

categorical_features = ["Supplier.[Supplier]","Activity2.[Activity2]","Location.[Location]"]


def _build_preprocessor(numeric_cols, categorical_cols):
    """Return an unfitted ColumnTransformer for the given feature lists.

    Numeric columns are mean-imputed; categorical columns are constant-imputed
    and one-hot encoded, with categories unseen during fit ignored at transform.
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    return ColumnTransformer(transformers=[
        ('numeric', numeric_transformer, numeric_cols),
        ('categorical', categorical_transformer, categorical_cols),
    ])


def _build_quantile_pipeline(numeric_cols, categorical_cols, percentile):
    """Return an unfitted preprocessing + quantile gradient boosting pipeline.

    The estimator step keeps its historical name 'classifier' (although it is a
    regressor) because the step name is stored inside the pickled pipeline.
    """
    return Pipeline([
        ('preprocessor', _build_preprocessor(numeric_cols, categorical_cols)),
        ('classifier', GradientBoostingRegressor(loss="quantile",
                                                 n_estimators=500,
                                                 learning_rate=0.1,
                                                 max_depth=5,
                                                 random_state=0,
                                                 alpha=(percentile/100))),
    ])


def _store_and_verify_model(model, model_name, features, target):
    """Pickle ``model``, push it to storage, pull it back and score the copy.

    ``model_name`` is used both as the storage bucket name and as the pickle
    file name (``<model_name>.pkl`` inside the bucket's ``test`` folder).

    Parameters:
        model: fitted pipeline to persist.
        model_name: bucket / file base name.
        features: feature frame used to score the reloaded model.
        target: 1-D target values used to score the reloaded model.

    Returns:
        The ``score`` of the reloaded model on ``features`` / ``target``.

    Raises:
        FileNotFoundError: if no pickle file is present locally after the
            push / pull round trip.
    """
    bucket = model_name
    local_storage_path = os.path.join(user_storage_path, bucket)
    test_folder_path = os.path.join(local_storage_path, "test")
    # exist_ok keeps re-runs from failing when the folder is already there.
    os.makedirs(test_folder_path, exist_ok=True)
    model_path = os.path.join(test_folder_path, model_name + '.pkl')

    logger.info("****** PICKLE FILE PATH *******")
    logger.info(model_path)
    # Context manager guarantees the pickle is flushed and closed before the push.
    with open(model_path, 'wb') as model_file:
        joblib.dump(model, model_file)
    logger.info("folder contents: {}".format(os.listdir(test_folder_path)))

    #Storage push
    logger.debug('********Before storage_push*******')
    logger.debug(os.listdir(local_storage_path))
    if storage_utils.storage_push(bucket, local_storage_path, overwrite=True):
        logger.debug("storage_push successful")
    else:
        logger.error("storage_push failed")
        # Remove the local copy that could not be pushed; the pull below may
        # still restore a previously stored pickle.
        shutil.rmtree(test_folder_path)

    #Storage pull
    logger.debug('********Before storage_pull*******')
    logger.debug(os.listdir(local_storage_path))
    if storage_utils.storage_pull(bucket, local_storage_path, overwrite=True):
        logger.debug("storage_pull successful")
    else:
        logger.error("storage_pull failed")

    logger.debug('********After storage_pull*******')
    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            "Model pickle not available after storage round trip: " + model_path)
    logger.info(os.listdir(test_folder_path))

    # The pickle was written by this very script and round-tripped through the
    # project's storage; do not reuse this loader for untrusted files.
    with open(model_path, 'rb') as model_file:
        loaded_model = joblib.load(model_file)
    result = loaded_model.score(features, target)
    logger.info("********** Result from Stored Pickle File**********")
    logger.info(result)
    return result


def _train_and_store_models(features, target, numeric_cols, categorical_cols,
                            bucket_prefix, percentiles):
    """Fit, persist and verify one quantile model per entry of ``percentiles``.

    Each model is fitted on the complete ``features`` / ``target`` data (no
    hold-out split) and stored under the name ``<bucket_prefix><percentile>``.
    """
    # Flatten the single-column target to 1-D, the shape the estimator expects.
    target_1d = np.ravel(target)
    for percentile in percentiles:
        logger.info("***** Training for Percentile *******")
        logger.info(percentile)
        model = _build_quantile_pipeline(numeric_cols, categorical_cols, percentile)
        model.fit(features, target_1d)
        _store_and_verify_model(model, bucket_prefix + str(percentile), features, target_1d)
        logger.info("****** PICKLE FILE NUMBER  **********" + str(percentile))


y = X[['LeadTime']]

####################### WITHOUT PRICE MODEL #######################

# NOTE(review): 'Cluster' is handled as a numeric feature - confirm it is
# numeric in the clusterItem input.
numeric_features = ['cos_month','Cluster','Quantity','date_diff_thanksgiving',
        'date_diff_easter',
        'date_diff_usIDay',
        'date_diff_christmasDay']

X1 = X[["Supplier.[Supplier]",
        "cos_month",
        "Cluster",
        "Activity2.[Activity2]",
        "Location.[Location]",
        'Quantity',
        'date_diff_thanksgiving',
        'date_diff_easter',
        'date_diff_usIDay',
        'date_diff_christmasDay']]

_train_and_store_models(X1, y, numeric_features, categorical_features,
                        "MLPipelineModelNOPrice", percentileList)

####################### WITH PRICE MODEL #######################

numeric_features = ['cos_month','Cluster','Price','Quantity','date_diff_thanksgiving','date_diff_easter',
        'date_diff_usIDay',
        'date_diff_christmasDay']

X1 = X[["Supplier.[Supplier]",
        "cos_month",
        "Cluster",
        "Activity2.[Activity2]",
        "Location.[Location]",
        'Price',
        'Quantity',
        'date_diff_thanksgiving',
        'date_diff_easter',
        'date_diff_usIDay',
        'date_diff_christmasDay']]

_train_and_store_models(X1, y, numeric_features, categorical_features,
                        "MLPipelineModelPrice", percentileList)

# The with-price X1, y and an unfitted with-price preprocessor stay available at
# module level because the feature importance step further down reuses them.
preprocessor = _build_preprocessor(numeric_features, categorical_features)

modelpercentileList = [50]

# Feature importance is reported for a single quantile model; only the last
# configured percentile is trained because earlier fits would be discarded.
importance_percentile = modelpercentileList[-1]
logger.info("***** Training for Percentile *******")
logger.info(importance_percentile)
model = Pipeline([('preprocessor', preprocessor),('classifier', GradientBoostingRegressor(loss="quantile",
                                        n_estimators=500,
                                        learning_rate=0.1,
                                        max_depth=5,
                                        random_state=0,
                                        alpha=(importance_percentile/100)))])

# Flatten the single-column target to 1-D, the shape the estimator expects.
target_1d = np.ravel(y)
model.fit(X1, target_1d)

# Permutation importance on the training data: mean score drop over 10 shuffles
# of each input column.
result = permutation_importance(model, X1, target_1d, n_repeats=10, random_state=42, n_jobs=2)

importance = pd.DataFrame({"index": X1.columns, "imp": result.importances_mean})
# Normalise so the importances sum to 1.
# NOTE(review): a zero total would produce NaN/inf shares - not guarded here.
importance["imp1"] = importance["imp"] / importance["imp"].sum()

# Collapse the four date_diff_* columns into one "Holidays" predictor. Only that
# prefix is grouped: splitting every name on "_" also turned "cos_month" into
# "cos", which the later rename to "Month" could never match.
importance["feature_new"] = [
    "Holidays" if name.startswith("date_diff_") else name
    for name in importance["index"]
]

featureImportanceTable = (
    importance.groupby("feature_new")
    .agg({"imp1": "sum"})
    .reset_index()
    .sort_values("imp1", ascending=False)
    .reset_index(drop=True)
)

new_df = featureImportanceTable.copy()

new_df["Version.[Version Name]"] = versionName

# Predictor keys P1..Pn follow the descending importance order.
new_df['Predictor.[Predictor]'] = ["P" + str(rank + 1) for rank in range(new_df.shape[0])]
new_df = new_df.rename(columns={"feature_new":"PredictorName","imp1":"FeatureImportance"})



In [None]:
logger.info("==============DATAFRAME==========")
# NOTE(review): this column name carries leading brackets, unlike
# "Version.[Version Name]" and "Predictor.[Predictor]" - confirm the expected
# output column name before changing it.
new_df["[AIModel].[AIModel]"] = "Individual PO Lead Time Prediction"

# Technical feature names -> display names. "cos" is listed next to
# "cos_month" because the upstream grouping, which splits feature names on "_",
# emits the month feature as "cos"; without that key it was never renamed.
PREDICTOR_DISPLAY_NAMES = {
    "Cluster": "Item Cluster",
    "Supplier.[Supplier]": "Supplier",
    "Location.[Location]": "Location",
    "Activity2.[Activity2]": "TransMode",
    "cos_month": "Month",
    "cos": "Month",
}
# Rename only within PredictorName so no other column can be rewritten.
new_df["PredictorName"] = new_df["PredictorName"].replace(PREDICTOR_DISPLAY_NAMES)

logger.info(new_df.columns)

# Final output schema and column order.
new_df = new_df[["Version.[Version Name]","[AIModel].[AIModel]","Predictor.[Predictor]","PredictorName","FeatureImportance"]]
logger.info(new_df)