In [4]:
_sales = "select ([WalmartTime].[Day] * [Version].[Version Name].[CurrentWorkingView]  * [Department].[Department_ID] * [Store].[Store_ID] * {Measure.[Weekly Sales]}); "
_features = "select( [WalmartTime].[Day] * [Version].[Version Name].[CurrentWorkingView] * [Store].[Store_ID] * { Measure.[Temperature], Measure.[Fuel Price], Measure.[MarkDown1] , Measure.[MarkDown2], Measure.[MarkDown3], Measure.[MarkDown4] , Measure.[MarkDown5] , Measure.[CPI] , Measure.[Unemployment]});"
_stores = "select([Store].[Store_ID] *[Store].[Type] *[Store].[Size]);"


# Initialize the O9DataLake with the input parameters and dataframes
# Data can be accessed with O9DataLake.get(<Input Name>)
# Overwritten values will not be reflected in the O9DataLake after initialization

from o9_common_utils.O9DataLake import O9DataLake, ResourceType, DataSource,PluginSetting
# Input queries: each registration result is bound to the name the model cell reads.
sales = O9DataLake.register(
    "sales",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_sales,
    plugin_setting=PluginSetting.Inputs,
)
features = O9DataLake.register(
    "features",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_features,
    plugin_setting=PluginSetting.Inputs,
)
stores = O9DataLake.register(
    "stores",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_stores,
    plugin_setting=PluginSetting.Inputs,
)

# Slice dimension for the plugin.
O9DataLake.register(
    "Version.[Version Name]",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    plugin_setting=PluginSetting.SliceDimension,
)

# Output table produced by the model cell.
O9DataLake.register(
    "PredictedSales",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    plugin_setting=PluginSetting.Outputs,
)

# Script parameters: registered with an empty dict.
script_params = O9DataLake.register(
    {},
    data_source=DataSource.LS,
    plugin_setting=PluginSetting.ScriptParam,
)

In [5]:
# Display the registry of declared inputs: a dict keyed by input name whose entries
# hold the registration metadata (name, resource type, data source, query) and the
# fetched data under 'df'.
O9DataLake.inputs

{'sales': {'name': 'sales',
  'resource_type': <ResourceType.IBPL: 'ibpl_query'>,
  'data_source': <DataSource.LS: 'liveserver'>,
  'query': 'select ([WalmartTime].[Day] * [Version].[Version Name].[CurrentWorkingView]  * [Department].[Department_ID] * [Store].[Store_ID] * {Measure.[Weekly Sales]}); ',
  'std_count_limit': '200000',
  'df':        WalmartTime.[Day] Version.[Version Name]  Department.[Department_ID]  \
  0             02-05-2010     CurrentWorkingView                           1   
  1             02-05-2010     CurrentWorkingView                           1   
  2             02-05-2010     CurrentWorkingView                           1   
  3             02-05-2010     CurrentWorkingView                           1   
  4             02-05-2010     CurrentWorkingView                           1   
  ...                  ...                    ...                         ...   
  421565        10-26-2012     CurrentWorkingView                          50   
  421566    

In [6]:
# Import packages  
import os
import numpy as np  
import pandas as pd  
import logging   
from sklearn import preprocessing  
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import mean_absolute_error  
 
# Initializing logger
# Every message in this cell goes through this named logger rather than the root
# `logging` module, so header lines and their content are emitted the same way.
logger = logging.getLogger('o9_logger')

# Column names used as join keys.
STORE_KEY = 'Store.[Store_ID]'
DAY_KEY = 'WalmartTime.[Day]'

# Continuous predictors that are min-max scaled; defined once so the scaling step
# and the feature matrix cannot drift apart.
NUMERIC_FEATURES = ['Store.[Size]', 'Temperature', 'Fuel Price', 'MarkDown1', 'MarkDown2',
                    'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']
# One-hot columns kept for the store type (the 'C' column is dropped below).
TYPE_DUMMIES = ['Store.[Type]_A', 'Store.[Type]_B']

# Fixed seed: the tree permutes features at each split, so ties are otherwise
# broken differently from run to run.
RANDOM_SEED = 42

# Keep only the columns the model needs from each registered input.
sales_df = sales[[DAY_KEY, 'Department.[Department_ID]', STORE_KEY, 'Weekly Sales']]
features_df = features[[STORE_KEY, DAY_KEY] + NUMERIC_FEATURES[1:]]
stores_df = stores[[STORE_KEY, 'Store.[Type]', 'Store.[Size]']]

# Left joins with explicit keys: store attributes by store, features by store and day.
dataset = (
    sales_df
    .merge(stores_df, how='left', on=STORE_KEY)
    .merge(features_df, how='left', on=[STORE_KEY, DAY_KEY])
)

# Predictions are written back onto `sales` by position, so the merges must not
# have added rows (which happens when a right-hand table repeats a join key).
if len(dataset) != len(sales):
    raise ValueError(
        "Merged dataset has {} rows but sales has {}; "
        "stores/features contain duplicate join keys".format(len(dataset), len(sales))
    )

# Input dataframe
input_df = dataset
logger.info("LOGGING INPUT DATAFRAME")
logger.info(input_df.head())

# Build new frames at each step instead of mutating in place, so `dataset`
# keeps the raw merged values.
input_df = input_df.fillna(0)
input_df = pd.get_dummies(input_df, columns=["Store.[Type]"])
input_df = input_df.drop(columns="Store.[Type]_C")

# Normalize the independent variables
# The scaler is fit on all rows (training and test rows alike).
min_max_scaler = preprocessing.MinMaxScaler()
input_df_scaled = input_df.copy()
input_df_scaled[NUMERIC_FEATURES] = min_max_scaler.fit_transform(input_df_scaled[NUMERIC_FEATURES])

logger.info("Dataframe after cleanup")
logger.info(input_df_scaled.head())

X = input_df_scaled.loc[:, NUMERIC_FEATURES + TYPE_DUMMIES]
y = input_df_scaled[['Weekly Sales']]

DTReg = DecisionTreeRegressor(max_depth=5, random_state=RANDOM_SEED)
# shuffle=False keeps row order, so train rows followed by test rows line up with `sales`.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, shuffle=False)
DTReg.fit(X_train, y_train)   # training the model
test_pred = DTReg.predict(X_test)  # making predictions
mae = mean_absolute_error(y_test, test_pred)
mse = mae  # legacy name kept for any later cell that reads `mse`; the value is an MAE

logger.warning("Mean absolute error: {}".format(mae))

# In-sample predictions for the training rows, followed by the held-out predictions.
train_pred = DTReg.predict(X_train)
all_predictions = np.hstack((train_pred, test_pred))

# Copy the key columns so the new column is added to an independent frame,
# not to a slice of `sales`.
PredictedSales = sales[['Version.[Version Name]', DAY_KEY, 'Department.[Department_ID]', STORE_KEY]].copy()
PredictedSales["Predicted Sales"] = all_predictions

logger.info("Finishing Plugin Execution")


2024-07-16 10:26:18,134 - o9_logger - ERROR  - LOGGING INPUT DATAFRAME
2024-07-16 10:26:18,135 - o9_logger - INFO  -   WalmartTime.[Day]  Department.[Department_ID]  Store.[Store_ID]  \
0        02-05-2010                           1                 1   
1        02-05-2010                           1                10   
2        02-05-2010                           1                11   
3        02-05-2010                           1                12   
4        02-05-2010                           1                13   

   Weekly Sales Store.[Type]  Store.[Size]  Temperature  Fuel Price  \
0      24924.50            A        151315        42.31       2.572   
1      40212.84            B        126512        57.65       3.963   
2      19611.13            A        207499        84.50       2.653   
3      17426.75            B        112238        64.05       4.121   
4      46761.90            A        219622        82.27       2.797   

   MarkDown1  MarkDown2  MarkDown3  MarkD

