### Objective
This notebook calculates seasonality indices (SI) using the calculation-based approach, at the granularity specified in the config (data_processing > feature_engineering > calculation_based > higher_level_si_creation).

In [0]:
import yaml
import glob
import numpy as np
import pandas as pd
from distutils.command.config import config
from datetime import datetime
import time
from time import strftime
import os
import dotsi
import calendar

In [0]:
%run ../0_Config.ipynb

In [0]:
# Make sure the folder that will receive the seasonality-index results exists
output_directory = app_config['output_dir_path']
root_dir, category = "Data_Processing", "Seasonality_Index"
algo_path = os.path.join(output_directory, root_dir, category)

algo_dir_missing = not os.path.exists(algo_path)
if algo_dir_missing:
    os.makedirs(algo_path)
    # The path is echoed only when the folder had to be created
    print(algo_path)

#### Broadcasting the required variables
Variables suffixed with "_conf" or "_config" are taken from the config file.

In [0]:
# Settings of the higher-level SI step, looked up once from the config
si_creation_conf = app_config["data_processing"]['feature_engineering']['calculation_based']["higher_level_si_creation"]

# Columns that define the level at which the SI is calculated
granularity_conf = si_creation_conf["granularity"]

# Config entries for the sales variable and the date variable
dv_config = si_creation_conf["sales_variable"]
ds_config = app_config["date_var"]

# Wrap the values in objects exposing `.value`
broadcast_granularity = dotsi.Dict({"value": granularity_conf})
broadcast_sales_var = dotsi.Dict({"value": dv_config})

In [0]:
def create_SI(si_df: pd.DataFrame) -> pd.DataFrame:
    """Calculate calendar seasonality indices (SI) for one granularity slice.

    Sales are first normalised by the mean sales of their year; the
    normalised values are then averaged per calendar level (day and week for
    daily data, week for weekly data, and always month and quarter).

    The granularity columns and the data frequency are read from the
    module-level ``broadcast_granularity.value`` and
    ``broadcast_frequency.value``.

    Parameters
    ----------
    si_df : pd.DataFrame
        Data for a single granularity combination. Must contain ``ds``,
        ``y``, ``Year``, ``Month``, ``Qtr``, the granularity columns, plus
        ``Week`` (daily/weekly data) and ``Day`` (daily data).

    Returns
    -------
    pd.DataFrame
        On success: the input rows sorted by ``ds`` with the ``cal_si_*``
        columns appended and ``status`` set to ``'success'``.
        On failure: a single placeholder row carrying the granularity values
        and the error message in ``status`` (errors are reported through the
        result instead of being raised, so one bad slice does not stop the
        others).
    """
    groupby_cols = list(broadcast_granularity.value)
    frequency = broadcast_frequency.value

    # Classify the frequency once. `frequency` may be None when it could not
    # be inferred, so the substring test is guarded by an isinstance check.
    is_daily = frequency == "D"
    is_weekly = isinstance(frequency, str) and "W-" in frequency
    # "ME"/"MS" are accepted next to "M" so month-end/month-start aliases
    # are treated as monthly data too.
    is_monthly = frequency in ("M", "ME", "MS")

    try:
        if not (is_daily or is_weekly or is_monthly):
            raise ValueError("unsupported data frequency: " + str(frequency))

        # At least one full year of history is needed for a yearly pattern
        dates_nunique = si_df['ds'].nunique()
        if is_daily and dates_nunique < 365:
            raise ValueError("there should be atleast one year of data but the data has " + str(dates_nunique) + " days only.")
        elif is_weekly and dates_nunique < 52:
            raise ValueError("there should be atleast one year of data but the data has " + str(dates_nunique) + " weekly only.")
        elif is_monthly and dates_nunique < 12:
            raise ValueError("there should be atleast one year of data but the data has " + str(dates_nunique) + " months only.")

        # Yearly normalisation: express every observation relative to the
        # mean sales of its own year
        yearly_mean = si_df.groupby(['Year']).agg(mean_sales=('y', 'mean')).reset_index()
        si_df = pd.merge(si_df, yearly_mean, on='Year', how='left')
        si_df['SI'] = si_df['y'] / si_df['mean_sales']

        # Calendar levels to average over, finest first; the order also
        # fixes the order of the output columns
        level_specs = []
        if is_daily:
            level_specs += [('Day', 'cal_si_daily'), ('Week', 'cal_si_weekly')]
        elif is_weekly:
            level_specs += [('Week', 'cal_si_weekly')]
        level_specs += [('Month', 'cal_si_monthly'), ('Qtr', 'cal_si_qtrly')]

        # Start from the normalised data itself so that monthly data (which
        # has no finer level to merge first) also keeps its rows and columns
        si_df_fin = si_df
        for level_col, si_col in level_specs:
            level_si = si_df.groupby([level_col]).agg(**{si_col: ('SI', 'mean')}).reset_index()
            si_df_fin = pd.merge(si_df_fin, level_si, on=level_col, how='left')

        si_df_fin = si_df_fin.drop(columns=['SI', 'mean_sales'])
        si_df_fin = si_df_fin.sort_values(by=['ds'], ascending=True).reset_index(drop=True)

        # To adhere to defined schema
        for col in groupby_cols:
            si_df_fin[col] = si_df_fin[col].astype(str)

        si_df_fin['status'] = 'success'
        return si_df_fin

    except Exception as e:
        # Build a one-row placeholder with the same flat column layout as a
        # successful result. An unsupported frequency falls back to the
        # monthly layout.
        level_cols, level_si_cols = [], []
        if is_daily:
            level_cols = ['Day', 'Week']
            level_si_cols = ['cal_si_daily', 'cal_si_weekly']
        elif is_weekly:
            level_cols = ['Week']
            level_si_cols = ['cal_si_weekly']

        columns = (
            ['ds', 'y'] + level_cols + ['Month', 'Qtr', 'Year']
            + level_si_cols + ['cal_si_monthly', 'cal_si_qtrly', 'status']
            + groupby_cols
        )
        results_pd = pd.DataFrame(columns=columns, index=range(1))

        # Carry over the granularity values of the slice when available
        for col in groupby_cols:
            if col in si_df.columns and len(si_df) > 0:
                results_pd[col] = si_df[col].iloc[0]
            results_pd[col] = results_pd[col].astype(str)
        results_pd['status'] = str(e)

        return results_pd

In [0]:
def data_loading(path):
    """Load a CSV, Parquet or Delta dataset through the Spark session.

    Parameters
    ----------
    path : str
        Local path of a single file or of a directory of files. A leading
        ``/dbfs`` mount prefix is removed before the path is handed to Spark.

    Returns
    -------
    The object returned by ``spark.read...load(...)`` (a Spark DataFrame).

    Raises
    ------
    AssertionError
        If the path does not exist or contains neither ``.csv`` nor
        ``.parquet`` data.
    """
    # Strip only the leading mount prefix: a blanket str.replace would also
    # mangle later path components that merely start with "dbfs".
    dbfs_prefix = "/dbfs"
    if path == dbfs_prefix or path.startswith(dbfs_prefix + "/"):
        spark_path = path[len(dbfs_prefix):]
    else:
        spark_path = path

    if os.path.isdir(path):
        entries = os.listdir(path)
        extensions = {os.path.splitext(entry)[1] for entry in entries}

        if ".csv" in extensions:
            return spark.read.format("csv").options(header="True", inferSchema='True').load(spark_path)
        if ".parquet" in extensions:
            # A "_delta_log" folder next to the parquet files marks a Delta table
            file_format = "delta" if "_delta_log" in entries else "parquet"
            return spark.read.format(file_format).load(spark_path)
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        raise AssertionError("Only .csv or .parquet file types are supported")

    if os.path.isfile(path):
        file_type = os.path.splitext(path)[1]
        if file_type == ".csv":
            return spark.read.format("csv").options(header="True", inferSchema='True').load(spark_path)
        if file_type == ".parquet":
            return spark.read.format("parquet").load(spark_path)
        raise AssertionError("Only .csv or .parquet file types are supported")

    raise AssertionError('Path specified is not correct')

#### Load data

In [0]:
# Choose the input: the latest outlier-treated output when outlier treatment
# is enabled in the config, otherwise the raw input file.
if app_config["data_processing"]["outlier_treatment_needed"] == True:
    outlier_dir = app_config['output_dir_path'] + "/Data_Processing/Outlier_treatment"

    # Result files are named "Outlier_treatment_results (<timestamp>).csv";
    # pick the one with the most recent timestamp.
    outlier_op_files = [
        file.replace(".csv", "")
        for file in os.listdir(outlier_dir)
        if "Outlier_treatment_results (" in file
    ]
    if not outlier_op_files:
        raise FileNotFoundError("No 'Outlier_treatment_results (...)' file found in " + outlier_dir)
    latest_file = max(
        outlier_op_files,
        key=lambda name: datetime.strptime(name.split('(')[1].replace(')', ''), '%Y-%m-%d-%H-%M-%S'),
    )
    outlier_op_file_path = os.path.join(outlier_dir, latest_file + ".csv")

    # Reading the data
    df = pd.read_csv(outlier_op_file_path)
else:
    df = data_loading(app_config["input_file_path"])
    # data_loading reads through Spark, but everything below uses the pandas
    # API, so convert when a Spark DataFrame comes back.
    if hasattr(df, "toPandas"):
        df = df.toPandas()

# Standardise the date and sales column names to "ds" and "y"
df = df.rename(columns={ds_config: "ds", dv_config: "y"})
df['ds'] = pd.to_datetime(df['ds'], format=app_config["date_format_pandas"])

start_date = app_config["data_processing"]['feature_engineering']['calculation_based']["higher_level_si_creation"]["start_date"]
end_date = app_config["data_processing"]['feature_engineering']['calculation_based']["higher_level_si_creation"]["end_date"]

# Restrict the history to the configured window only when both bounds are set
if len(start_date) > 1 and len(end_date) > 1:
    start_date = datetime.strptime(start_date, app_config["date_format_pandas"])
    end_date = datetime.strptime(end_date, app_config["date_format_pandas"])
    print(df.shape)
    df = df[(df['ds'] >= start_date) & (df['ds'] <= end_date)]

# Aggregating the data at granularity level
df1 = df.groupby(granularity_conf + ['ds']).agg(y=('y', 'sum')).reset_index()

# Granularity keys are handled as strings from here on
df1[granularity_conf] = df1[granularity_conf].astype(str)

In [0]:
# Inferring the data frequency from the three most recent distinct dates
pandas_df = df1[['ds']].drop_duplicates()
history_dates = pd.to_datetime(pd.Series(pandas_df['ds'].unique(), name='ds')).sort_values()
frequency = pd.infer_freq(history_dates.tail(3))
if frequency is None:
    # infer_freq returns None for irregularly spaced dates; fail here with a
    # clear message instead of a TypeError on the checks below
    raise ValueError("Could not infer the data frequency from the last three dates; the dates must be regularly spaced.")
broadcast_frequency = dotsi.Dict({"value": frequency})

# Calendar covering every full year touched by the data, at the data frequency
start_date = datetime(pandas_df['ds'].min().year, 1, 1)
end_date = datetime(pandas_df['ds'].max().year, 12, 31)

total_weeks_df = pd.DataFrame(pd.date_range(start_date, end_date, freq=frequency, name="ds"))

total_weeks_df['Qtr'] = total_weeks_df['ds'].dt.quarter
total_weeks_df["Year"] = total_weeks_df['ds'].dt.year

if frequency == 'D':
    total_weeks_df['Day'] = total_weeks_df['ds'].dt.dayofyear
    total_weeks_df['Week'] = total_weeks_df['ds'].dt.strftime('%U').astype(int)
elif 'W-' in frequency:
    # Running week number that restarts at 1 with the first week of each year
    total_weeks_df['Week'] = total_weeks_df.groupby('Year').cumcount() + 1

total_weeks_df['Month'] = total_weeks_df['ds'].dt.month

# Attach the calendar attributes to the aggregated sales
si_df = df1.merge(total_weeks_df, on="ds", how='left')

# Concatenated granularity key, kept as a column of the result
si_df['gran_tempp'] = si_df[granularity_conf].astype(str).sum(axis=1)

# Compute the SI per granularity combination. Grouping on the actual columns
# (not the concatenated key) keeps combinations such as ("1", "23") and
# ("12", "3") apart; sort=False keeps the order of first appearance.
si_parts = [
    create_SI(group_df)
    for _, group_df in si_df.groupby(granularity_conf, sort=False)
]
fin_si_df = pd.concat(si_parts) if si_parts else pd.DataFrame()
# display(fin_si_df)

Div_No,ds,y,Qtr,Year,Week,Month,gran_tempp,cal_si_weekly,cal_si_monthly,cal_si_qtrly,status
24,2018-08-08T00:00:00.000+0000,53.8,3,2018,32,8,24.0,0.8477744828171495,1.1633319850128947,1.066847870635239,success
24,2018-08-15T00:00:00.000+0000,49.02,3,2018,33,8,24.0,1.0876549682223404,1.1633319850128947,1.066847870635239,success
24,2018-08-22T00:00:00.000+0000,60.97,3,2018,34,8,24.0,1.3339413250777847,1.1633319850128947,1.066847870635239,success
24,2018-08-29T00:00:00.000+0000,67.64,3,2018,35,8,24.0,1.3839571639343042,1.1633319850128947,1.066847870635239,success
24,2018-09-05T00:00:00.000+0000,82.3,3,2018,36,9,24.0,1.05008237423429,1.1390087695211772,1.066847870635239,success
24,2018-09-12T00:00:00.000+0000,104.63,3,2018,37,9,24.0,1.257605647543043,1.1390087695211772,1.066847870635239,success
24,2018-09-19T00:00:00.000+0000,89.38,3,2018,38,9,24.0,1.0802927761272385,1.1390087695211772,1.066847870635239,success
24,2018-09-26T00:00:00.000+0000,91.59,3,2018,39,9,24.0,1.2070642485170229,1.1390087695211772,1.066847870635239,success
24,2018-10-03T00:00:00.000+0000,62.57,4,2018,40,10,24.0,1.1104140012329569,0.9837418610772346,0.8246227300886545,success
24,2018-10-10T00:00:00.000+0000,89.51,4,2018,41,10,24.0,0.9915992657527748,0.9837418610772346,0.8246227300886545,success


#### Exporting Seasonality Index results

In [0]:
# Write the results to a file stamped with the time of this run
run_stamp = datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
si_results_path = f"{algo_path}/SI_results ({run_stamp}).csv"
fin_si_df.to_csv(si_results_path, index=False)