In [ ]:
# This cell is NOT editable. Overwrite variables at your own discretion.
# Any changes other than the script code will NOT BE SAVED!
# All cells are assumed to be script code cells, unless explicitly tagged as 'o9_ignore'

In [ ]:
# IBPL query for the sales input: Day x Version x Department x Store,
# carrying the Weekly Sales measure. The adjacent literals below are joined
# by the parser into one single-line query string.
_sales = (
    "select ([WalmartTime].[Day]"
    " * [Version].[Version Name].[CurrentWorkingView]"
    "  * [Department].[Department_ID]"
    " * [Store].[Store_ID]"
    " * {Measure.[Weekly Sales]}"
    "  );"
)

# IBPL query for the store attributes: Store_ID x Type x Size.
_stores = (
    "select([Store].[Store_ID]"
    " * [Store].[Type]"
    "  * [Store].[Size]);"
)

# IBPL query for the per-store, per-day feature measures.
_features = (
    "select( [WalmartTime].[Day]"
    " * [Version].[Version Name].[CurrentWorkingView]"
    " * [Store].[Store_ID]"
    " * { Measure.[Temperature],"
    " Measure.[Fuel Price],"
    " Measure.[MarkDown1] ,"
    " Measure.[MarkDown2],"
    " Measure.[MarkDown3],"
    " Measure.[MarkDown4] ,"
    " Measure.[MarkDown5] ,"
    " Measure.[CPI] ,"
    " Measure.[Unemployment] } );"
)


# Initialize the O9DataLake with the input parameters and dataframes
# Data can be accessed with O9DataLake.get(<Input Name>)
# Overwritten values will not be reflected in the O9DataLake after initialization

from o9_common_utils.O9DataLake import O9DataLake, ResourceType, DataSource,PluginSetting
# Inputs: each IBPL query is registered against the LS data source; the value
# returned by register() is bound to the name the script code uses later.
sales = O9DataLake.register(
    "sales",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_sales,
    plugin_setting=PluginSetting.Inputs,
)
stores = O9DataLake.register(
    "stores",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_stores,
    plugin_setting=PluginSetting.Inputs,
)
features = O9DataLake.register(
    "features",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    query=_features,
    plugin_setting=PluginSetting.Inputs,
)

# Output: registered by name only (no query); the script code assigns a
# variable called PredictedSales.
O9DataLake.register(
    "PredictedSales",
    data_source=DataSource.LS,
    entity_type=ResourceType.IBPL,
    plugin_setting=PluginSetting.Outputs,
)

# Script parameters: registered with an empty dict.
script_params = O9DataLake.register(
    {},
    data_source=DataSource.LS,
    plugin_setting=PluginSetting.ScriptParam,
)

In [ ]:
import os
import numpy as np
import pandas as pd
import logging 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Script body: join sales with store attributes and per-store/day features,
# fit a depth-limited decision tree on 'Weekly Sales', and build the
# PredictedSales dataframe holding one prediction per input sales row.
# All messages go through the named 'o9_logger' logger rather than the root
# `logging` module, so they share one logger configuration.
logger = logging.getLogger('o9_logger')

# Continuous predictors that are min-max scaled before training.
NUMERIC_FEATURES = [
    'Store.[Size]', 'Temperature', 'Fuel Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment',
]
# One-hot store-type indicators used as predictors (other generated dummy
# columns are not fed to the model).
STORE_TYPE_FEATURES = ['Store.[Type]_A', 'Store.[Type]_B']

sales_df = sales[['WalmartTime.[Day]', 'Department.[Department_ID]', 'Store.[Store_ID]', 'Weekly Sales']]

features_df = features[['Store.[Store_ID]', 'WalmartTime.[Day]'] + NUMERIC_FEATURES[1:]]

stores_df = stores[['Store.[Store_ID]', 'Store.[Type]', 'Store.[Size]']]

# Left joins keep the sales rows in their original order. The keys are spelled
# out; they are exactly the columns the frames have in common.
dataset = (
    sales_df
    .merge(stores_df, how='left', on='Store.[Store_ID]')
    .merge(features_df, how='left', on=['Store.[Store_ID]', 'WalmartTime.[Day]'])
)

# Predictions are written back onto the sales rows by position further down,
# so the joins must not have multiplied rows (duplicate keys in stores/features).
if len(dataset) != len(sales_df):
    raise ValueError(
        "Joining stores/features changed the row count from {} to {}; "
        "predictions cannot be aligned with the sales rows".format(len(sales_df), len(dataset))
    )

input_df = dataset
logger.error("LOGGING INPUT DATAFRAME")
logger.info(input_df.head())

# Missing values (e.g. sales rows without a matching feature row) become 0.
input_df = input_df.fillna(0)
input_df_scaled = pd.get_dummies(input_df, columns=["Store.[Type]"])

# Normalize the independent variables
min_max_scaler = preprocessing.MinMaxScaler()
input_df_scaled[NUMERIC_FEATURES] = min_max_scaler.fit_transform(input_df_scaled[NUMERIC_FEATURES])

logger.info("Dataframe after cleanup")
logger.info(input_df_scaled.head())

# reindex instead of .loc: a store type that does not occur in the data has no
# dummy column, and gets an all-zero indicator here instead of a KeyError.
X = input_df_scaled.reindex(columns=NUMERIC_FEATURES + STORE_TYPE_FEATURES, fill_value=0)
y = input_df_scaled[['Weekly Sales']]

DTReg = DecisionTreeRegressor(max_depth=5)
# shuffle=False keeps row order: the first 75% of rows train, the last 25% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, shuffle=False)
DTReg.fit(X_train, y_train)   # training the model
test_pred = DTReg.predict(X_test)  # making predictions
mae = mean_absolute_error(y_test, test_pred)

logger.warning("Mean abosulte error: {}".format(mae))

# Predictions for the training rows are in-sample. Because the split was not
# shuffled, train followed by test restores the original row order.
train_pred = DTReg.predict(X_train)
all_predictions = np.hstack((train_pred, test_pred))

# .copy() so the new column is added to an independent frame, not to a
# selection of `sales` (avoids pandas' chained-assignment warning).
PredictedSales = sales[['Version.[Version Name]', 'WalmartTime.[Day]', 'Department.[Department_ID]', 'Store.[Store_ID]']].copy()
PredictedSales["Predicted Sales"] = all_predictions

logger.error("Ingesting data")
logger.error(PredictedSales.dtypes)

PredictedSales["Department.[Department_ID]"] = PredictedSales["Department.[Department_ID]"].astype('string')
PredictedSales["Store.[Store_ID]"] = PredictedSales["Store.[Store_ID]"].astype('string')

logger.error(PredictedSales.dtypes)
logger.error("Finishing Plugin Execution")