In [None]:
# 0-importing necessary packages

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from datetime import datetime
import statsmodels.api as sm
from pycaret.regression import *
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import mean_squared_error, mean_absolute_error
#!pip install imageio
#import imageio
from statsmodels.graphics.tsaplots import plot_acf
print('Importing libraries: Done')

In [None]:
# 1 - Input operations

# Show what is available in the input folder before reading anything
print("Folder's files : ",os.listdir('inputs'), '\n', '_________','\n')

# Column names that replace the header row of each input CSV
train_columns = ['Store','Dept','Date','weeklySales','isHoliday']
feature_columns = ['Store','Date','Temperature','Fuel_Price','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','CPI','Unemployment','IsHoliday']
store_columns = ['Store','Type','Size']

# Read the three input tables; the 'IsHoliday' column of the features table is dropped right after loading
dataset = pd.read_csv("inputs/train.csv", sep=',', header=0, names=train_columns)
features = pd.read_csv("inputs/features.csv", sep=',', header=0, names=feature_columns).drop(columns=['IsHoliday'])
stores = pd.read_csv("inputs/stores.csv", sep=',', header=0, names=store_columns)

# Create the working directories (no error if they already exist)
for folder in ('temp_test', 'input_analysis', 'pred_output', 'output_analysis'):
    os.makedirs(folder, exist_ok=True)

# Flatten the data: left-join the store table and then the feature table onto the sales rows
# (no join keys are given, so pandas joins on the columns the two frames have in common)
dataset = dataset.merge(stores, how='left')
dataset = dataset.merge(features, how='left')
print('Original dataset sample:', '\n', dataset.tail(5))

In [None]:
# Set explicit dtypes in one pass: the Store and Dept identifiers become int16,
# while weeklySales is stored as float64
dataset = dataset.astype({'Store': 'int16', 'Dept': 'int16', 'weeklySales': 'float64'})

In [None]:
# 2 - Data extraction

# Sub-dataset number 1: only the date, store, department and target columns of the flat dataset,
# ordered by row index
dataset_sub1 = dataset[['Date','Store','Dept','weeklySales']].sort_index(axis=0)
dataset_sub1.tail(5)

In [None]:
# Sorted array of the distinct department values in dataset_sub1
dept_list = np.sort(dataset_sub1['Dept'].unique())
dept_list

In [None]:
# Sorted array of the distinct store values in dataset_sub1
store_list = np.sort(dataset_sub1['Store'].unique())
store_list

In [None]:
# Sorted array of the distinct date values in dataset_sub1
date_list = np.sort(dataset_sub1['Date'].unique())
date_list

In [None]:
# This function flags (it does not remove) the store/department pairs whose series is incomplete
# or contains sales values at or below the border value
def outlier_identifier(df, border_value, store_list, dept_list, expected_entries=143):
    """Build a map that flags every (store, department) pair as outlier or not.

    A pair is kept (``outlier_flag == 0``) only when it has exactly
    ``expected_entries`` non-null dates and no ``weeklySales`` value at or
    below ``border_value``; every other pair, including pairs with no rows at
    all, is flagged with ``outlier_flag == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Store', 'Dept', 'Date' and 'weeklySales' columns.
    border_value : number
        Sales values less than or equal to this count as invalid.
    store_list, dept_list : iterable
        Store and department values to evaluate; the result has one row per
        combination, stores in the outer loop and departments in the inner.
    expected_entries : int, default 143
        Number of dated rows a complete series must have. The default keeps
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'store', 'dept', 'number_of_entries', 'target_false_count'
        and 'outlier_flag', one row per (store, department) combination.
    """
    pair_keys = [df['Store'], df['Dept']]

    # One grouped pass over the data instead of re-filtering the whole frame for every pair.
    # count() ignores missing values, exactly like the per-pair counts it replaces.
    entry_counts = df['Date'].groupby(pair_keys).count().to_dict()
    at_or_below = df.loc[df['weeklySales'] <= border_value, 'weeklySales']
    false_counts = at_or_below.groupby([df.loc[at_or_below.index, 'Store'],
                                        df.loc[at_or_below.index, 'Dept']]).count().to_dict()

    records = []
    for store in store_list:
        for dept in dept_list:
            number_of_entries = entry_counts.get((store, dept), 0)
            target_false_count = false_counts.get((store, dept), 0)
            is_clean = (number_of_entries == expected_entries) and (target_false_count == 0)
            records.append({'store': store,
                            'dept': dept,
                            'number_of_entries': number_of_entries,
                            'target_false_count': target_false_count,
                            'outlier_flag': 0 if is_clean else 1})

    # Build the frame once; appending row by row with .loc is quadratic
    return pd.DataFrame(records, columns=['store', 'dept', 'number_of_entries', 'target_false_count', 'outlier_flag'])

In [None]:
# Flag every store/department pair, treating sales at or below 0 as invalid
data_map = outlier_identifier(df=dataset_sub1, border_value=0, store_list=store_list, dept_list=dept_list)
data_map

In [None]:
# Fraction of store/department pairs flagged as outliers. The denominator is the number of pairs
# actually present in data_map rather than a hard-coded 3645, so the figure stays correct when the
# set of stores or departments changes.
print('outlier percentage:', (data_map.outlier_flag == 1).mean() , '\n')

In [None]:
# Mapping frame listing the store/department combinations that should be dropped:
# only the flagged rows of data_map, renumbered from 0
removal_map = (
    data_map.loc[data_map.outlier_flag == 1, ['store','dept','outlier_flag']]
    .reset_index(drop=True)
)

In [None]:
def outlier_remover(df, removal_map):
    """Drop every row of ``df`` that belongs to a flagged (store, department) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Store' and 'Dept' columns. It is modified in place.
    removal_map : pandas.DataFrame
        Frame with 'store' and 'dept' columns; each row names one pair to
        remove. Its index may be anything (rows are read by position, not by
        label).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the rows were dropped.

    Notes
    -----
    Rows are dropped by index label, so ``df`` is expected to have a unique
    index; with duplicate labels every row sharing a dropped label goes too.
    """
    flagged_pairs = list(zip(removal_map['store'], removal_map['dept']))
    for store, dept in flagged_pairs:
        print('Store:', store, 'Department:', dept,' Removed as outlier!','\n')

    # Mark all rows to remove in one pass and drop them with a single call;
    # dropping one label at a time with inplace=True copies the frame per row.
    flagged = set(flagged_pairs)
    doomed = np.array([pair in flagged for pair in zip(df['Store'], df['Dept'])], dtype=bool)
    df.drop(index=df.index[doomed], inplace=True)
    return df

In [None]:
# Remove the flagged store/department pairs (outlier_remover returns the frame it was given)
dataset_sub2 = outlier_remover(df=dataset_sub1, removal_map=removal_map)
dataset_sub2

In [None]:
# Outlier removal is time consuming, so the cleaned data is saved and the cleaning can be skipped next time
dataset_sub2.to_csv(path_or_buf='temp_test/dataset_sub2.csv')

In [None]:
# Reload the cleaned data saved earlier; the file's header row is replaced by the four names below
# NOTE(review): the saved file presumably also holds the row index as a leading unnamed column,
# which pandas would then use as the index because only four names are given - confirm
cleaned_columns = ['Date','Store','Dept','weeklySales']
dataset_sub2 = pd.read_csv("temp_test/dataset_sub2.csv", sep=',', header=0, names=cleaned_columns)

In [None]:
# Sorted array of the distinct department values left after outlier removal.
# np.sort returns the sorted array; assigning the result of ndarray.sort() instead would
# store None, because that method sorts in place and returns nothing.
dept_list = np.sort(dataset_sub2['Dept'].unique())
dept_list

In [None]:
# Model identifiers to train and evaluate for every department
mdls = [
    'dt', 'rf', 'et', 'xgboost', 'catboost', 'lightgbm',
    'gbr', 'huber', 'ada', 'par', 'omp', 'en',
    'lasso', 'llar', 'br', 'ridge', 'lar', 'lr',
]
# excluded: 'dummy', 'knn'
# Reduced settings for a quick trial run (uncomment to use):
#mdls = ['ada','dt']
#dept_list = [1,2]

In [None]:
def slicer(df, dept):
    """Return the rows of ``df`` whose 'Dept' equals ``dept``, without the 'Dept' column.

    The result is a new frame; ``df`` itself is left untouched.
    """
    in_dept = df['Dept'] == dept
    return df.loc[in_dept].drop(columns=['Dept'])


def aggregator(df):
    """Sum all columns of ``df`` per 'Date' and index the result by datetime.

    The grouped 'Date' values become the index, converted with ``pd.to_datetime``.
    """
    totals_per_date = df.groupby(by='Date', as_index=True).sum()
    datetime_index = pd.to_datetime(totals_per_date.index)
    return totals_per_date.set_axis(datetime_index, axis=0)

def create_features(df):
    """Derive calendar features from the datetime index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by dates (a DatetimeIndex, or values convertible to
        one) with a 'weeklySales' column.

    Returns
    -------
    pandas.DataFrame
        A new frame on the same index with the int16 columns 'dayofweek',
        'quarter', 'month', 'year', 'dayofyear', 'dayofmonth' and
        'weekofyear', followed by 'weeklySales'.

    Notes
    -----
    * The week number is the ISO week taken from ``isocalendar()``; the
      ``weekofyear`` accessor used previously no longer exists in pandas 2.x.
    * ``df`` is not modified: no helper columns are added to the caller's
      frame.
    """
    stamps = pd.DatetimeIndex(df.index)
    calendar_parts = {
        'dayofweek': stamps.dayofweek,
        'quarter': stamps.quarter,
        'month': stamps.month,
        'year': stamps.year,
        'dayofyear': stamps.dayofyear,
        'dayofmonth': stamps.day,
        'weekofyear': stamps.isocalendar().week,
    }
    # Plain int16 arrays are assigned by position, so a non-unique index cannot break alignment
    features = pd.DataFrame(
        {name: np.asarray(values).astype('int16') for name, values in calendar_parts.items()},
        index=df.index,
    )
    features['weeklySales'] = df['weeklySales'].to_numpy()
    return features

def split_data(df, split_date):
    """Split ``df`` on its index into a (train, test) pair of independent copies.

    Rows with an index value before ``split_date`` go to the first frame;
    rows on or after ``split_date`` go to the second.
    """
    before_split = df.index < split_date
    from_split_on = df.index >= split_date
    train_part = df[before_split].copy()
    test_part = df[from_split_on].copy()
    return train_part, test_part

def plt_test_train(df_train, df_test):
    """Plot the 'weeklySales' of the train and test frames against their index.

    Both series are drawn on one 20x10 figure, which is shown immediately.
    Returns a short completion message string.
    """
    plt.figure(figsize = (20,10))
    for frame, series_label in ((df_train, 'train'), (df_test, 'test')):
        plt.plot(frame.index, frame['weeklySales'], label = series_label)
    plt.xlabel('date')
    plt.ylabel('weekly sales')
    plt.legend()
    plt.show()
    return 'PLT Done ! \n'

#This function creates, tunes, plots, finalizes, predicts, and evaluates all models in mdls list for a set of data
def create_models(mdls, i, test, result_log, result_log_aggr):
    """Run the full model workflow for every identifier in ``mdls`` and log prediction errors.

    For each model identifier the function calls, in this order: ``create_model``,
    ``tune_model``, ``plot_model`` (three plots), ``predict_model``, ``finalize_model``,
    ``evaluate_model`` and ``predict_model`` again, the last time on ``test``.
    These helpers are not defined in this file; presumably they come from the
    ``from pycaret.regression import *`` star import and rely on an experiment that was
    set up beforehand - confirm against the caller.

    Parameters
    ----------
    mdls : iterable of str
        Model identifiers passed one by one to ``create_model``.
    i : hashable
        Department identifier. Used as the row label written in ``result_log_aggr`` and,
        converted to a string, in progress messages and output file names.
    test : DataFrame-like
        Data passed as ``data=`` to the final ``predict_model`` call.
    result_log : pandas.DataFrame
        Must contain an 'actual' column. Per-model prediction and error columns are added
        to it in place.
    result_log_aggr : pandas.DataFrame
        Receives the per-model summary values in row ``i``; modified in place.

    Returns
    -------
    tuple
        ``(result_log, result_log_aggr)`` - the same two objects that were passed in.

    Side effects
    ------------
    Writes ``pred_output/<i>_<model>.csv`` for every model and prints progress messages.
    """
    j = str(i)
    for mdl in mdls:
        mdll = create_model('{}'.format(mdl))
        print('\n \n >>mdll = create_model(mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        tuned_mdl = tune_model(mdll, fold = None, n_iter = 20)
        print('\n \n >>tuned_mdl = tune_model(mdll) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        # Note: the first two plots use the untuned model, the feature plot uses the tuned one
        plot_model(mdll)
        print('\n \n >>plot_model(mdll) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        plot_model(mdll, plot = 'error')
        print('\n \n >>plot_model(mdll, plot = error) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        plot_model(tuned_mdl, plot = 'feature')
        print('\n \n >>plot_model(tuned_mdl, plot = feature) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        # Return value is discarded; the trailing semicolon suppresses the notebook echo
        predict_model(tuned_mdl);
        print('\n \n >>predict_model(tuned_mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        final_mdl = finalize_model(tuned_mdl)
        print('\n \n >>final_mdl = finalize_model(tuned_mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        print(final_mdl)
        print('\n \n >>print(final_mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        evaluate_model(final_mdl)
        print('\n \n >>evaluate_model(final_mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        predict_model(final_mdl)
        print('\n \n >>predict_model(final_mdl) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        # Predictions on the supplied test data are the ones that get saved and scored
        pred_mdl = predict_model(final_mdl, data=test)
        pred_mdl.to_csv('pred_output/{}_{}.csv'.format(j,mdl))
        print('\n \n >>pred_mdl = predict_model(final_mdl, data=test) for Dept:{} Model:{} IS  DONE! \n \n'.format(j,mdl))
        result_log[mdl] = pred_mdl.prediction_label
        # Percentage error relative to the actual value: absolute (ape) and signed (pe)
        # NOTE(review): both divide by result_log.actual, so an actual value of 0 would not
        # give a finite percentage - confirm such rows cannot reach this point
        result_log['{}_ape'.format(mdl)] = ((result_log.actual - result_log[mdl]) / result_log.actual).abs() 
        result_log['{}_pe'.format(mdl)] = ((result_log.actual - result_log[mdl]) / result_log.actual) 
        # Signed errors split by sign: each column holds values only for the rows that pass its filter
        result_log['{}_pos_pe'.format(mdl)] = result_log[(result_log['{}_pe'.format(mdl)] >= 0)]['{}_pe'.format(mdl)]
        result_log['{}_neg_pe'.format(mdl)] = result_log[(result_log['{}_pe'.format(mdl)] < 0)]['{}_pe'.format(mdl)]
        pos_pe_sum = result_log['{}_pos_pe'.format(mdl)].sum()
        max_pos_pe = result_log['{}_pos_pe'.format(mdl)].max()
        neg_pe_sum = result_log['{}_neg_pe'.format(mdl)].sum()
        # The "maximum" negative error is the most negative value, hence min()
        max_neg_pe = result_log['{}_neg_pe'.format(mdl)].min()
        mape = result_log['{}_ape'.format(mdl)].mean()
        # Store the per-model summary values in the row of this department
        result_log_aggr.at[i, '{}_pos_pe_sum'.format(mdl)] = pos_pe_sum
        result_log_aggr.at[i, '{}_max_pos_pe'.format(mdl)] = max_pos_pe
        result_log_aggr.at[i, '{}_neg_pe_sum'.format(mdl)] = neg_pe_sum
        result_log_aggr.at[i, '{}_max_neg_pe'.format(mdl)] = max_neg_pe
        result_log_aggr.at[i, '{}_mape'.format(mdl)] = mape
        
        #dept_mape_list.append(result_log['{}_ape'.format(mdl)].mean())
        #result_log_aggr = dept_mape_list.add(mape)
        print('\n \n >> Prediction j_mdl.to_csv for Dept:{} Model:{}  IS  DONE! \n \n'.format(j,mdl))
    return result_log, result_log_aggr
    
        
#This function configures the modelling experiment for one department and picks the best baseline model
def mlsetup(train, test, j):
    """Set up the regression experiment on ``train``/``test`` and compare models.

    Calls ``setup`` with 'weeklySales' as the target and every optional preprocessing
    step switched off, then ``compare_models`` sorted by 'MAPE' (keeping one model) and
    ``evaluate_model`` on that model. ``setup``, ``compare_models`` and ``evaluate_model``
    are not defined in this file; presumably they come from the
    ``from pycaret.regression import *`` star import - confirm.

    Parameters
    ----------
    train : DataFrame-like
        Passed as ``data`` to ``setup``; must contain 'weeklySales' and the seven
        calendar feature columns listed in ``numeric_features``.
    test : DataFrame-like
        Passed as ``test_data`` to ``setup``.
    j : str
        Department label; used only in the progress messages.

    Returns
    -------
    The object returned by ``compare_models(sort='MAPE', n_select=1)``.
    """
    reg = setup(data = train,
            test_data = test,
            target = 'weeklySales',
            #categorical_features = ['Dept','Store'],
            numeric_features = ['dayofweek','quarter','month','year','dayofyear','dayofmonth','weekofyear'],
            preprocess = False,
            imputation_type = None, # We do not want to impute missing values because they are already imputed.
            #numeric_imputation = 'mean',
            polynomial_features = False, # We do not want to raise existing features to a power to capture non-linear relationships between the features and the target variable.
            transformation = False,
            normalize = False,
            #normalize_method = 'zscore',
            transform_target = False,
            remove_multicollinearity = False,
            #multicollinearity_threshold = 0.95,
            remove_outliers = False,
            #outliers_method = 'ee' #options are 'ee', 'lof', 'iforest',
            #outliers_threshhold = 0.05,
            feature_selection = False,
            #feature_selection_method = 'sequential',
            #feature_selection_estimator = 'lightgbm',
            #n_features_to_select = 0.2,
            #use_gpu = True,
            #profile = True,
            fold_strategy = 'timeseries', # other options are 'kfold', 'groupkfold', 'timeseries'
            fold = 2,  
            #fold_groups = 'dept',
            data_split_shuffle = False,
            fold_shuffle = False,
           )
    print('\n \n >>ML setup for Dept:{} IS  DONE! \n \n'.format(j))
    # Rank the candidate models by MAPE and keep only the top one
    best = compare_models(sort = 'MAPE', n_select = 1)
    print('\n \n >>best = compare_models for Dept:{} IS  DONE! \n \n'.format(j))
    evaluate_model(best)
    print('\n \n >>evaluate_model(best) for Dept:{} IS  DONE! \n \n'.format(j))
    return best

#This function
#def err_calc():
    



def process(df, num_dict, split_date, mdls):
    """Run the whole modelling pipeline once per department and save the results.

    For every department in ``num_dict`` the data is sliced, aggregated per date, turned
    into calendar features and split at ``split_date``; the split is plotted, the
    experiment is set up, and every model in ``mdls`` is trained and scored.

    Writes one ``output_analysis/result_log_dept<dept>.csv`` per department and a final
    ``output_analysis/result_log_aggr.csv``. Returns nothing.
    """
    # One summary row per department, filled in by create_models
    result_log_aggr = pd.DataFrame(index=num_dict)
    result_log_aggr['dept'] = num_dict

    for dept in num_dict:
        dept_label = str(dept)

        # Prepare the train/test frames of this department
        dept_features = create_features(aggregator(slicer(df, dept)))
        train, test = split_data(dept_features, split_date)
        print('>slicer, aggregator, create features, and split_data func for Dept:', dept,'is Done! \n')

        plt_test_train(train, test)
        print('\n >plt_test_train func for Dept:', dept,'is Done! \n')

        mlsetup(train, test, dept_label)
        print('\n >mlsetup func for Dept:', dept,'is Done! \n')

        # Per-department log: starts with the actual test values, gets one set of columns per model
        result_log = pd.DataFrame({'actual': test['weeklySales']}, index=test.index)
        result_log_dept, result_log_aggr = create_models(mdls, dept, test, result_log, result_log_aggr)
        result_log_dept.to_csv('output_analysis/result_log_dept{}.csv'.format(dept_label))
        print('\n >create_models func for Dept:', dept,'is Done! \n')

    result_log_aggr.to_csv('output_analysis/result_log_aggr.csv')
    print('>>>>>>>>>>> Prediction DONE! <<<<<<<<<<<<')

In [None]:
# Run the pipeline for every remaining department; rows dated on or after the split date form the test set
process(df=dataset_sub2, num_dict=dept_list, split_date='2011-10-19', mdls=mdls)