# Importing the libraries

In [24]:
"""
    Import the necessary libraries
    The numpy and pandas libraries are commonly used for data manipulation and analysis, 
    matplotlib and seaborn are used for data visualization, 
    and sklearn contains a variety of machine learning algorithms. 
    train_test_split and cross_val_score are used for model evaluation, 
    and LinearRegression is a machine learning algorithm for regression tasks.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
        
from scipy.stats import skew,norm,zscore
from scipy.signal import periodogram

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Importing the dataset

In [25]:
"""
    This code reads in several CSV files containing data for a machine learning project on sales forecasting using time series. 
    The holidays_events, oil, stores, and transactions files are read into orig_holidays_events, orig_oil, orig_stores, 
    and orig_transactions DataFrames, respectively. 
    
    The test and train files are also read into DataFrames named orig_test and orig_train, respectively.
    
    The parse_dates option is used when reading the CSV files to specify that the date column should be parsed as a date instead of a string. 
    This allows the date values to be treated as dates in the DataFrames, which makes it easier to work with them and perform time series analysis.
"""

# Read csv files and parse dates for date columns
orig_holidays_events = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv", parse_dates=['date'])
orig_oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv", parse_dates=['date'])
orig_stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
orig_transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv", parse_dates=['date'])

# Read test and train csv files and parse dates for date columns
orig_test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv", parse_dates=['date'])
orig_train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv", parse_dates=['date'])

# Setting dates

In [26]:
"""
    This code is used to set the dates for the train, test, and forecast periods.
    date_start_train and date_end_train specify the start and end dates for the training data, 
    and date_start_test and date_end_test specify the start and end dates for the test data. 
    The date_start_fore date may be used to specify the start of a forecasting period.
"""

date = {
    'date_start_train': '2013-01-01',
    'date_end_train': '2017-08-15',
    'date_start_test': '2017-08-16',
    'date_end_test': '2017-08-31',
    'date_start_fore': '2016-06-01'
}

In [27]:
"""
    This code computes the number of days between two dates for the training and test data. 
    It does this by using the pd.Timestamp class from the pandas library to convert the dates stored in the date dictionary to Timestamp objects. 
    Then, it subtracts the two Timestamp objects to compute the number of days between them.

    The diff_train variable stores the number of days between the date_end_train and date_start_fore dates, 
    and the diff_test variable stores the number of days between the date_end_test and date_start_fore dates.
"""

diff_train = (pd.Timestamp(date['date_end_train']) - pd.Timestamp(date['date_start_fore'])).days
diff_test = (pd.Timestamp(date['date_end_test']) - pd.Timestamp(date['date_start_fore'])).days

# Stores Preprocessing

In [28]:
"""
    This function store_func takes a DataFrame as an argument. 
    The function adds some new features to the DataFrame and then merges it with the orig_train and orig_test DataFrames. 
    The resulting DataFrame is returned by the function.
    
    Inside the function, the uniquestore and newstore features are added to the DataFrame by applying anonymous functions (also known as lambda functions) to the city and store_nbr columns. 
    These features may be useful for the machine learning model because they encode additional information about the stores in the data.
    
    Next, the function uses the concat method to combine the orig_train and orig_test DataFrames into a single DataFrame. 
    Then, it uses the merge method to merge this DataFrame with the DataFrame passed to the store_func function, based on the store_nbr column. 
    Finally, the function renames the type column to store and returns the resulting DataFrame.
"""

def store_func (orig_df):
    
    # Create a copy of the original dataframe
    df = orig_df.copy()
    
    # Add features to the original stores dataframe
    df['uniquestore'] = df.city.apply(lambda x: 0 if x in ['Quito', 'Guayaquil', 'Santo Domingo', 'Cuenca', 'Manta', 'Machala', 'Latacunga', 'Ambato'] else 1)
    df['newstore'] = df.store_nbr.apply(lambda x: 1 if x in [19, 20, 21, 28, 35, 41, 51, 52] else 0)
        
    # Merge the original stores dataframe, original test dataframe and original train dataframe
    df = pd.concat([orig_train, orig_test], axis=0).merge(df, on=['store_nbr'], how='left')
    df = df.rename(columns={'type' : 'store'}) 

    return df

In [29]:
"""
    This code calls the store_func function and passes the orig_stores DataFrame as an argument. 
    The function returns a new DataFrame that includes the original data from orig_stores, 
    as well as some additional features and data from the orig_train and orig_test DataFrames. 
    
    This new DataFrame is stored in the final_df variable.
"""

final_df = store_func(orig_stores)

# Events Preprocessing

In [30]:
"""
    This code defines a function holiday_func that takes a DataFrame as an argument. 
    The function processes the data in the DataFrame and then merges it with the final_df DataFrame, which was created by the store_func function earlier. 
    The resulting DataFrame is returned by the function.

    Inside the function, the first step is to create a copy of the input DataFrame and store it in a new variable named df. 
    Then, the function makes several modifications to the data in df. 
    It removes duplicates and adds new features, such as event_type and isevent.
    
    Next, the function uses the merge method to merge the final_df DataFrame with df. 
    This combines the data from both DataFrames into a single DataFrame that can be used to train a machine learning model.
    
    Finally, the function applies a series of transformations to the data in the resulting DataFrame, such as adding Easter and closure days. 
    The resulting DataFrame is returned by the function.
"""

def holiday_func (orig_df):
    
    df = orig_df.copy()
    
    # Non-transferred events
    df.loc[297, 'transferred'] = df.loc[297, 'transferred'] = False
    df = df.query("transferred!=True")
    
    # Removing duplicates
    df = df.drop(index=orig_holidays_events[orig_holidays_events[['date', 'locale_name']].duplicated()].index.values)

    # Adding event type
    df.loc[df.type=='Event', 'type'] = df.description.apply(lambda x: x[0:7])
     
    # Merging orig_holidays_events and final_df
    nat_df = df.query("locale=='National'")
    loc_df = df.query("locale=='Local'")
    reg_df = df.query("locale=='Regional'")
    
    df = final_df.merge(nat_df, left_on=['date'], right_on=['date'], how='left')
    df = df.merge(loc_df, left_on=['date', 'city'], right_on=['date', 'locale_name'], how='left')
    df = df.merge(reg_df, left_on=['date', 'state'], right_on=['date', 'locale_name'], how='left')
   
    # Adding New Year
    df['firstday'] = df.description_x.apply(lambda x: 1 if x=='Primer dia del ano' else 0)

    # Matching event and store
    df = df.drop(columns=['locale_x', 'locale_name_x', 'description_x', 'transferred_x',
                          'locale_y', 'locale_name_y', 'description_y', 'transferred_y',
                          'locale', 'locale_name', 'description', 'transferred'])
    df.loc[~df.type_x.isnull(), 'event_type'] = df.type_x.apply(lambda x: x)
    df.loc[~df.type_y.isnull(), 'event_type'] = df.type_y.apply(lambda x: x)
    df.loc[~df.type.isnull(), 'event_type'] = df.type.apply(lambda x: x)
    df.loc[df.event_type.isnull(), 'event_type'] = df.event_type.apply(lambda x: 'norm')
    df = df.drop(columns=['type_x', 'type_y', 'type'])

    df['isevent'] = df.event_type.apply(lambda x: 'y' if x!='norm' else 'n')

    # Adding Easter
    df.loc[df.date.isin(['2017-04-16', '2016-03-27', '2015-04-05', '2014-04-20', '2013-03-31']), 'isevent'] = df.isevent.apply(lambda x: 'y')
    df.loc[df.date.isin(['2017-04-16', '2016-03-27', '2015-04-05', '2014-04-20', '2013-03-31']), 'event_type'] = df.event_type.apply(lambda x: 'Holiday')

    # Adding closure days
    df['isclosed'] = df.groupby(by=['date', 'store_nbr'])['sales'].transform(lambda x: 1 if x.sum()==0 else 0)    
    df.loc[(df.date.dt.year==2017) & (df.date.dt.month==8) & (df.date.dt.day>=16) , 'isclosed'] = df.isclosed.apply(lambda x: 0)    
    df.loc[df.date.isin(['2017-01-01']), 'isevent'] = df.isevent.apply(lambda x: 'n')
  
    return df

In [31]:
"""
    This code calls the holiday_func function that was defined earlier and passes the orig_holidays_events DataFrame as an argument. 
    The function returns a new DataFrame that includes the original data from orig_holidays_events, as well as some additional features and data from the final_df DataFrame. 
    This new DataFrame is stored in the final_df variable.
    
    holiday_func function is being used to further preprocess the data in preparation for training a machine learning model. 
    The new features and data added to the DataFrame by the function are useful for the model and improve its performance. 
    This code overwrites the previous value of final_df, which was created by the store_func function.
"""

final_df = holiday_func(orig_holidays_events)

# Oil Preprocessing

In [32]:
"""
    This code reads in the oil.csv file, which contains data about the price of oil. 
    It uses the pd.read_csv function from the pandas library to read the data into a DataFrame, 
    and then it uses the to_datetime method to convert the date column from a string to a datetime format.
    
    Next, the code sets the date column as the index of the DataFrame using the set_index method. 
    This allows the data to be easily accessed and manipulated using the dates as keys.
    
    The code then uses the resample method to resample the data to a daily frequency and compute the mean price of oil for each day. 
    Finally, it uses the isnull and sum methods to compute the number of missing values in the DataFrame. 
    
    This information may be useful for understanding the quality of the data and determining whether any further preprocessing is needed.
"""

# read in the oil data
oil = pd.read_csv('oil.csv')

# change the date column to a datetime format
oil['date'] = pd.to_datetime(oil['date'])

# set the date as the index
oil = oil.set_index('date')

# resample the data to daily and get the mean
oil = oil.resample("D").mean()

# get the number of null values
oil.isnull().sum()


dcoilwtico    529
dtype: int64

In [33]:
"""
    This code defines a function oil_func that takes a DataFrame as an argument. 
    The function processes the data in the DataFrame 
    and then merges it with the final_df DataFrame, which was created by the holiday_func function earlier. 
    The resulting DataFrame is returned by the function.
    
    Inside the function, the first step is to create a copy of the input DataFrame and store it in a new variable named df. 
    Then, the function adds missing values to df by using the set_index method to set the date column as the index, 
    the resample method to resample the data to a daily frequency, the interpolate method to fill in missing values, 
    and the reset_index method to restore the original index of the DataFrame.
    
    Next, the function adds several lag features to the data by using a for loop to shift the dcoilwtico column 
    by a specified number of days and create a new column for each shift. 
    The function also adds moving average features by using the rolling method to compute 
    the average value of the dcoilwtico column over a specified number of days.
    
    Finally, the function uses the merge method to merge the final_df DataFrame with df, 
    and then it returns the resulting DataFrame. This DataFrame includes the original data from both final_df and df, 
    as well as the new features that were added by the oil_func function.
"""

def oil_func (orig_df):
    
    df = orig_df.copy()
    
    # Adding missing values
    df = df.set_index('date').resample("D").mean().interpolate(limit_direction='backward').reset_index()
    
    # Adding lag features
    for i in [1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 90]:
        df['lagoil_' + str(i) + '_dcoilwtico'] = df['dcoilwtico'].shift(i)
    
    # Adding moving average features
    df['oil_week_avg'] = df['dcoilwtico'].rolling(7).mean()
    df['oil_2weeks_avg'] = df['dcoilwtico'].rolling(14).mean()
    df['oil_month_avg'] = df['dcoilwtico'].rolling(30).mean()

    # drop rows with null values
    df.dropna(inplace = True)
    
    # Merging orig_oil and final_df
    df = final_df.merge(df, on=['date'], how='left')
    
    return df

In [34]:
"""
    This code calls the oil_func function that was defined earlier and passes the orig_oil DataFrame as an argument. 
    The function returns a new DataFrame that includes the original data from orig_oil, 
    as well as some additional features and data from the final_df DataFrame. 
    This new DataFrame is stored in the final_df variable.
    
    The oil_func function is being used to further preprocess the data in preparation for training a machine learning model. 
    The new features and data added to the DataFrame by the function are useful for the model and improve its performance. 
    
    This code overwrites the previous value of final_df, which was created by the holiday_func function.
"""

# store the dataframe in the final_df
final_df = oil_func(orig_oil)

# Transactions Preprocessing

In [35]:
"""
    This code defines a function transactions_func that takes a DataFrame as an argument. 
    The function processes the data in the DataFrame and then merges it with the final_df DataFrame, 
    which was created by the oil_func function earlier. The resulting DataFrame is returned by the function.
    
    Inside the function, the first step is to create a copy of the input DataFrame and store it in a new variable named df. 
    Then, the function uses the merge method to merge df with the final_df DataFrame, 
    which includes the data from orig_oil and orig_holidays_events.
    
    Next, the function fills in missing values in the transactions column of df. 
    It does this by using the loc method to identify rows where the transactions column is null and the isclosed column is 1, 
    and then it sets the transactions value to 0 for these rows.
    
    The function then groups the data by store_nbr and date, computes the average transactions for each group, 
    and adds some lag features to the data. It then uses the drop method to remove the transactions column, 
    and then it merges the resulting DataFrame with the original df. 
    
    Finally, the function fills in any remaining missing values in the transactions column 
    by using the average transactions computed earlier, and then it returns the resulting DataFrame.
"""

def transactions_func (orig_df):
    
    df = orig_df.copy()
    
    # Merging orig_transactions and final_df
    df = final_df.merge(df, on=['date', 'store_nbr'], how='left')
    
    # Filling missing values
    df.loc[(df.transactions.isnull()) & (df.isclosed==1), 'transactions'] = df.transactions.apply(lambda x: 0)
    group_df = df.groupby(by=['store_nbr', 'date']).transactions.first().reset_index()
    group_df['avg_tra'] = group_df.transactions.rolling(15, min_periods=10).mean()
    group_df['16_tra'] = group_df.transactions.shift(16)
    group_df['21_tra'] = group_df.transactions.shift(21)
    group_df['30_tra'] = group_df.transactions.shift(30)
    group_df['60_tra'] = group_df.transactions.shift(60)
    
    group_df.drop(columns='transactions', inplace=True)
    
    df = df.merge(group_df, on=['date', 'store_nbr'], how='left')
    df.loc[(df.transactions.isnull()) & (df.isclosed==0), 'transactions'] = df.avg_tra
    
    df.drop(columns='avg_tra', inplace=True)
    df.loc[(df.date.dt.year==2017) & (df.date.dt.month==8) & (df.date.dt.day>=16) , 'transactions'] = df.transactions.apply(lambda x: None)    

    df['tot_store_day_onprom'] = df.groupby(by=['date', 'store_nbr']).onpromotion.transform(lambda x: x.sum())

    return df

In [36]:
"""
    This code calls the transactions_func function that was defined earlier and passes the orig_transactions DataFrame as an argument. 
    The function returns a new DataFrame that includes the original data from orig_transactions, 
    as well as some additional features and data from the final_df DataFrame. 
    This new DataFrame is stored in the final_df variable.

    The transactions_func function is being used to further preprocess the data in preparation for training a machine learning model. 
    The new features and data added to the DataFrame by the function are useful for the model and improve its performance. 
    This code overwrites the previous value of final_df, which was created by the oil_func function
"""

# store the dataframe in the final_df
final_df = transactions_func(orig_transactions)

In [37]:
"""
    This code uses the del keyword to delete several variables that were defined earlier in the code. 
    These variables include the original data that was read in from CSV files at the beginning of the script, 
    as well as some intermediate DataFrames that were created by the various preprocessing functions.
    
    Deleting these variables frees up memory and prevents them from cluttering the workspace. 
    Since the data in these variables has been processed and stored in the final final_df DataFrame, 
    it is no longer necessary to keep the original variables around.
"""

del orig_train
del orig_test
del orig_stores
del orig_holidays_events
del orig_oil
del orig_transactions

# Final dataframe

In [38]:
"""
    This code uses the set_index and loc methods to create a new DataFrame that includes only the rows in the final_df DataFrame 
    that have a date greater than or equal to the date_start_fore value specified in the date dictionary. 
    The set_index method sets the date column as the index for the DataFrame, 
    and the loc method is used to filter the rows based on their date values. 
    The resulting DataFrame is then stored back in the final_df variable.
    
    This code is further preprocessing the data in preparation for training a machine learning model. 
    By limiting the data to only those rows with dates greater than or equal to the date_start_fore value, 
    the code is creating a training set for the model that includes only the most recent data. 
    This could improve the performance of the model and make it more accurate.
"""

final_df = final_df.set_index('date').loc[date['date_start_fore']:,:]

# Additional Support functions

In [39]:
"""
    This code defines a split_func function that takes four arguments: orig_df, X, y, end_date, and test_size. 
    orig_df is a DataFrame that contains the original data, 
    X is a DataFrame that contains the input features for the machine learning model, 
    y is a DataFrame that contains the target variable for the model, 
    end_date is a date that specifies the end of the training period, 
    and test_size is a float that indicates the proportion of the data that should be used for testing.

    The function uses the train_test_split function from the sklearn.model_selection module to split the data into training and test sets. 
    This function takes the X and y DataFrames as arguments and returns four DataFrames: X_train, y_train, X_test, and y_test. 
    These DataFrames contain the input features and target variable for the training and test sets, respectively. 
    The split_func function then returns these four DataFrames.
    
    This function is used to split the data into training and test sets in preparation for training a machine learning model. 
    The train_test_split function ensures that the data is split into mutually exclusive sets, 
    so that the training set is used to train the model and the test set is used to evaluate its performance. 
    
    This is an important step in the machine learning process, 
    as it helps to prevent overfitting and ensures that the model will generalize well to new data.
"""

def split_func (orig_df, X, y, end_date, test_size):
    """
    This function splits the original dataframe into a training set and testing set
    using a specified test size. The original dataframe is split into X and y dataframes
    and the X_train and X_test dataframes are returned along with the y_train and y_test
    dataframes.
    
    Parameters
    ----------
    orig_df: pandas DataFrame
        The original dataframe that will be split into train and test sets.
    X: pandas DataFrame
        The X portion of the original dataframe.
    y: pandas DataFrame
        The y portion of the original dataframe.
    end_date: datetime
        The end date of the train and test split.
    test_size: float
        The percentage of the original dataframe to be used for testing.
    
    Returns
    -------
    X_train: pandas DataFrame
        The X portion of the training dataframe.
    y_train: pandas DataFrame
        The y portion of the training dataframe.
    X_test: pandas DataFrame
        The X portion of the testing dataframe.
    y_test: pandas DataFrame
        The y portion of the testing dataframe.
    """
    
    # Splitting train and test
    idx_train, idx_test = train_test_split(orig_df.index, test_size=test_size, shuffle=False)
    X_train, X_test = X.loc[idx_train, :], X.loc[idx_test, :]
    y_train, y_test = y.loc[idx_train], y.loc[idx_test]
    
    return X_train, y_train, X_test, y_test

In [40]:
"""
    The my_split_func() function takes a dataframe and creates a train set, validation set, and the corresponding target sets. 
    The train set and validation set are created by filtering the dataframe by the specified start and end dates for each set. 
    The target sets are created by selecting the 'sales' column from the corresponding sets.
    
    The start and end dates for the train and validation sets are specified as function parameters. 
    The function also calculates the number of days from the start of the year 2013 to each date in the original dataframe 
    and adds this as a new column called 'days_from_2013' to the dataframe. 
    
    This new column is used to filter the dataframe to create the train and validation sets.
    After creating the sets, the function drops the 'sales' column from the feature sets and returns all four sets.

    train_start_date is the first date in the training set
    train_end_date is the last date in the training set
    val_start_date is the first date in the validation set
    val_end_date is the last date in the validation set
"""

def my_split_func(df, train_start_date='2013-01-01', train_end_date='2017-08-30',
               val_start_date='2017-09-01', val_end_date='2020-01-01'):
    train_start_date = (pd.to_datetime(train_start_date) - pd.to_datetime('2013-01-01')).days
    train_end_date = (pd.to_datetime(train_end_date) - pd.to_datetime('2013-01-01')).days
    val_start_date = (pd.to_datetime(val_start_date) - pd.to_datetime('2013-01-01')).days
    val_end_date = (pd.to_datetime(val_end_date) - pd.to_datetime('2013-01-01')).days
    
    train = df[(df['days_from_2013'] >= train_start_date) & (df['days_from_2013'] <= train_end_date)]
    val = df[(df['days_from_2013'] >= val_start_date) & (df['days_from_2013'] <= val_end_date)]
    return [train.drop(columns=['sales']), val.drop(columns=['sales']), train['sales'], val['sales']]

In [41]:
"""
    This function takes in a number tp and a series of dates, and returns an array of weights. 
    The value of the weight depends on the value of tp:
    
    If tp is 1, the function returns an array of ones with the same length as the dates series.
    
    If tp is 2, the function returns an array of weights that decrease exponentially as the date gets closer to August 16, 2017. 
    The rate of the decay is determined by the formula np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 100), 
    where 400 is a constant that determines the starting weight, '2017-08-16' is the reference date, and dates is the series of dates.
    
    If tp is 3, the function returns an array of weights that decrease exponentially as the date gets closer to August 16, 2017, 
    but at a slower rate than for tp = 2. The rate of the decay is determined by the formula 
    np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 200).
    
    If tp is 4, the function returns an array of weights that decrease exponentially as the date gets closer to August 16, 2017, 
    but at an even slower rate than for tp = 3. 
    The rate of the decay is determined by the formula np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 300).
    
    If tp is 5, the function returns an array of weights that decrease exponentially as the date gets closer to August 16, 2017, 
    but at the slowest rate of all the possible values of tp. 
    The rate of the decay is determined by the formula np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 400).
"""

def get_weights_distribution(tp, dates):
    if tp == 1:
        return np.ones(dates.shape)
    if tp == 2:
        return np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 100)
    if tp == 3:
        return np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 200)
    if tp == 4:
        return np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 300)
    if tp == 5:
        return np.exp((400 - (pd.to_datetime('2017-08-16') - pd.to_datetime(dates)).days) / 400)

In [42]:
"""
    The tags_to_dict() function takes no arguments and returns a list of five dictionaries. 
    The first dictionary maps store item tags to an integer value representing the average percentage of male shoppers that buy the items with that tag. 
    The second dictionary maps store item tags to an integer value representing the average percentage of shoppers that buy luxury items with that tag. 
    The third dictionary maps store item tags to an integer value representing the average age of shoppers that buy items with that tag. 
    The fourth dictionary maps store item tags to an integer value representing the variance in the age of shoppers that buy items with that tag. 
    The fifth dictionary maps store item tags to a string value representing the type of items with that tag (e.g., food, family, other).
"""

def tags_to_dict():
    tags = {
     'AUTOMOTIVE': [4, 7, 30, 10, 'family'],
     'BABY CARE':  [-8, 2, 25, 5, 'family'],
     'BEAUTY': [-8, 7, 25, 5, 'other'],
     'BEVERAGES': [0, 0, 40, 40, 'food'],
     'BOOKS': [0, 0, 55, 15, 'other'],
     'BREAD/BAKERY': [-3, 0, 30, 30, 'food'],
     'CELEBRATION': [-5, 5, 50, 20, 'family'],
     'CLEANING': [-8, 3, 40, 20, 'food'],
     'DAIRY': [-4, 0, 40, 40, 'food'],
     'DELI': [3, 6, 40, 20, 'food'],
     'EGGS': [-4, -5, 40, 20, 'food'],
     'FROZEN FOODS': [-4, -3, 40, 20, 'food'],
     'GROCERY I': [-4, 3, 40, 20, 'food'],
     'GROCERY II': [-4, 3, 40, 20, 'food'],
     'HARDWARE': [10, 10, 30, 20, 'other'],
     'HOME AND KITCHEN I': [-10, 4, 40, 20, 'family'],
     'HOME AND KITCHEN II': [-10, 4, 40, 20, 'family'],
     'HOME APPLIANCES': [0, 4, 40, 20, 'family'],
     'HOME CARE': [-10, 4, 40, 20, 'family'],
     'LADIESWEAR': [-10, 4, 40, 20, 'other'],
     'LAWN AND GARDEN': [-10, 4, 40, 20, 'family'],
     'LINGERIE': [-10, 4, 40, 2, 'other'],
     'LIQUOR,WINE,BEER': [4, 8, 40, 20, 'food'],
     'MAGAZINES': [-6, -7, 50, 20, 'other'],
     'MEATS': [-4, 5, 40, 20, 'food'],
     'PERSONAL CARE': [-5, 5, 40, 20, 'family'],
     'PET SUPPLIES': [-5, 0, 40, 20, 'family'],
     'PLAYERS AND ELECTRONICS': [5, 5, 25, 10, 'other'],
     'POULTRY': [-7, -4, 40, 20, 'food'],
     'PREPARED FOODS': [0, 6, 30, 10, 'food'],
     'PRODUCE': [0, 0, 40, 40, 'other'],
     'SCHOOL AND OFFICE SUPPLIES': [3, 3, 25, 15, 'family'],
     'SEAFOOD': [-5, 8, 40, 20, 'food']
    }
    
    sex_dict = {}
    luxury_dict = {}
    age_mean_dict = {}
    age_var_dict = {}
    type_dict = {}
    for i in tags.keys():
        sex_dict[i] = tags[i][0]
        luxury_dict[i] = tags[i][1]
        age_mean_dict[i] = tags[i][2]
        age_var_dict[i] = tags[i][3]
        type_dict[i] = tags[i][4]
    return [sex_dict, luxury_dict, age_mean_dict, age_var_dict, type_dict]

In [43]:
"""
    The code creates a function called get_oil_dict() that takes in a DataFrame called oil as an input. 
    The function estimates the prices of gaps (times when the market was not open) in the oil prices data, 
    and then creates a dictionary where the keys are the day numbers from the start of 2013 
    (as indicated by the days_from_2013 column in the oil DataFrame) and the values are the estimated oil prices for each day. 
    
    The function then returns this dictionary.
"""

def get_oil_dict(oil):
    # estimate price of gaps (market don't work on weekends and holidays)
    price_estim = [-1] * (oil['days_from_2013'][oil.shape[0] - 1] + 1)
    price_estim[0] = 93.14
    for i in range(1, oil.shape[0]):
        price_estim[oil['days_from_2013'][i]] = oil['dcoilwtico'][i]

    for i in range (len(price_estim)):
        if price_estim[i] == -1 or math.isnan(price_estim[i]):
            tj = -1
            for j in range(i + 1, len(price_estim)):
                if price_estim[j] != -1 and (not math.isnan(price_estim[j])):
                    tj = j
                    break

            for j in range(i, tj):
                price_estim[j] = ((tj - j) * price_estim[i - 1] + (j - i) * price_estim[tj]) / (tj - i)

            i = tj

    oil_dict = dict(zip(np.arange(len(price_estim)), price_estim))
    return oil_dict

# Time Preprocessing columns

In [44]:
"""
    The add_features function is used to add new features to the original dataframe. 
    The new features include the year, quarter, month, day, day of week, week of year, and whether the day is a weekend or not. 
    The function also adds one-hot encoded features for the year, quarter, day of week, store, event type, and state. 
    It also adds features generated by the DeterministicProcess class. 
    Finally, the function adds a new feature called outliers which indicates whether a sales value is an outlier (defined as a value greater than 30,000). 
    The function then drops the daysinmonth, month, and city columns from the dataframe.
"""

def add_features (orig_df):
    
    # Make a copy of the original DataFrame
    df = orig_df.copy()
    
    # Add new features
    df['year'] = df.index.year.astype('int')
    df['quarter'] = df.index.quarter.astype('int')
    df['month'] = df.index.month.astype('int')
    df['day'] = df.index.day.astype('int')
    df['dayofweek'] = df.index.day_of_week.astype('int')
    df['weekofyear'] = df.index.week.astype('int')
    df['isweekend'] = df.dayofweek.apply(lambda x: 1 if x in (5,6) else 0)
    df['startschool'] = df.month.apply(lambda x: 1 if x in (4,5,8,9) else 0)
    
    df['daysinmonth'] = df.index.days_in_month.astype('int')
    
    # Add one-hot encoded features
    df = pd.get_dummies(df, columns=['year'], drop_first=True)
    df = pd.get_dummies(df, columns=['quarter'], drop_first=True)
    df = pd.get_dummies(df, columns=['dayofweek'], drop_first=True)
    df = pd.get_dummies(df, columns=['store'], drop_first=True)
    df = pd.get_dummies(df, columns=['event_type'], drop_first=True)
    df = pd.get_dummies(df, columns=['isevent'], drop_first=True)
    df = pd.get_dummies(df, columns=['state'], drop_first=True)
    
    # Add DeterministicProcess features
    fourierA = CalendarFourier(freq='A', order=5)
    fourierM = CalendarFourier(freq='M', order=2)
    fourierW = CalendarFourier(freq='W', order=4)

    dp = DeterministicProcess(index=df.index,
                          order=1,
                          seasonal=False,
                          constant=False,
                          additional_terms=[fourierA, fourierM, fourierW],
                          drop=True)
    dp_df = dp.in_sample()
    df = pd.concat([df, dp_df], axis=1)
    
    # Add outliers
    df['outliers'] = df.sales.apply(lambda x: 1 if x>30000 else 0)
    
    # Remove unnecessary features
    df.drop(columns=['daysinmonth', 'month', 'city'], inplace=True)
    
    return df

In [45]:
"""
    The add_my function is used to add new features to the original dataframe.
"""
import math

def add_my(df):
    # read
    train = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv')
    oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv')
    trans = pd.read_csv('../input/store-sales-time-series-forecasting/transactions.csv')
    
    # add 'days_from_2013' for easy shifting
    df['days_from_2013'] = (pd.to_datetime(df.index.get_level_values(2)) - pd.to_datetime('2013-01-01')).days
    train['days_from_2013'] = (pd.to_datetime(train['date']) - pd.to_datetime('2013-01-01')).dt.days
    oil['days_from_2013'] = (pd.to_datetime(oil['date']) - pd.to_datetime('2013-01-01')).dt.days
    trans['days_from_2013'] = (pd.to_datetime(trans['date']) - pd.to_datetime('2013-01-01')).dt.days
    
    # groupby features
    gr_day = train.groupby('days_from_2013')['sales'].mean()
    gr_store = train.groupby('store_nbr')['sales'].mean()
    gr_family = train.groupby('family')['sales'].mean()

    days = [16, 18, 20, 21, 25, 28, 30, 35, 42, 60, 90, 120, 180, 365]
    for i in days:
        df['days_' + str(i)] = df['days_from_2013'] - i
        df['days_lagged' + str(i)] = df['days_' + str(i)].map(gr_day).fillna(0)
        df = df.drop(columns=['days_' + str(i)])

    df['store_gb'] = df.index.get_level_values(0).map(gr_store)
    df['family_gb'] = df.index.get_level_values(1).map(gr_family)
    
    oil_dict = get_oil_dict(oil)

    # lagged oil
    days = [0, 1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 90, 120, 180, 360]
    for i in days:
        df['days_' + str(i)] = df['days_from_2013'] - i
        df['oil_lagged' + str(i)] = df['days_' + str(i)].map(oil_dict)
        df = df.drop(columns=['days_' + str(i)])
        
    # lagged transactions
    # # fill trans dict
    trans_dict = {}
    for ii in range(trans.shape[0]):
        i = trans.loc[ii]
        trans_dict[tuple([i['store_nbr'], i['days_from_2013']])] = i['transactions']
    
    def transaction_get_value(a, b):
        try:
            return trans_dict[tuple([a, (pd.to_datetime(b) - pd.to_datetime('2013-01-01').dt.days)])]
        except:
            return 0
    # Create a list with the number of days that need to be lagged
    # For each of the days, create a new column with the number of days lagged and another column with the oil price
    # lagged to the number of days
    days = [16, 18, 20, 21, 25, 28, 30, 35, 42, 60, 90, 120, 180, 365]
    for i in days:
        df['days_' + str(i)] = df['days_from_2013'] - i
        df['oil_lagged' + str(i)] = df['days_' + str(i)].map(oil_dict)
        df['trans_lagged' + str(i)] = [transaction_get_value(*a) for a in tuple(zip(df.index.get_level_values(0),
                                                                        df.index.get_level_values(2)))]
        df = df.drop(columns=['days_' + str(i)])

    sex_dict, luxury_dict, age_mean_dict, age_var_dict, type_dict = tags_to_dict()
    df['tag_sex'] = df.index.get_level_values(1).map(sex_dict)
    df['tag_luxury'] = df.index.get_level_values(1).map(luxury_dict)
    df['tag_age_mean'] = df.index.get_level_values(1).map(age_mean_dict)
    df['tag_age_var'] = df.index.get_level_values(1).map(age_var_dict)
    df['tag_type'] = df.index.get_level_values(1).map(type_dict)
    df = pd.get_dummies(df, columns=['tag_type'])
    
    df['tag_age_min'] = df['tag_age_mean'] - df['tag_age_var']
    df['tag_age_max'] = df['tag_age_mean'] + df['tag_age_var']
    return df

In [46]:
df = add_features(final_df).loc[:date['date_end_test'],:].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()

# Fill in missing data with zeros
df['16_tra'] = df['16_tra'].fillna(0)
df['21_tra'] = df['21_tra'].fillna(0)
df['30_tra'] = df['30_tra'].fillna(0)
df['60_tra'] = df['60_tra'].fillna(0)

# Add additional features using the `add_my` function
df = add_my(df)

# Display the resulting DataFrame
display(df)

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales,onpromotion,cluster,uniquestore,newstore,firstday,isclosed,dcoilwtico,lagoil_1_dcoilwtico,...,trans_lagged365,tag_sex,tag_luxury,tag_age_mean,tag_age_var,tag_type_family,tag_type_food,tag_type_other,tag_age_min,tag_age_max
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,AUTOMOTIVE,2016-06-01,2216808,3.0,0,13,0,0,0,0,49.070000,49.100000,...,0,4,7,30,10,1,0,0,20,40
1,AUTOMOTIVE,2016-06-02,2218590,1.0,0,13,0,0,0,0,49.140000,49.070000,...,0,4,7,30,10,1,0,0,20,40
1,AUTOMOTIVE,2016-06-03,2220372,4.0,0,13,0,0,0,0,48.690000,49.140000,...,0,4,7,30,10,1,0,0,20,40
1,AUTOMOTIVE,2016-06-04,2222154,9.0,0,13,0,0,0,0,49.030000,48.690000,...,0,4,7,30,10,1,0,0,20,40
1,AUTOMOTIVE,2016-06-05,2223936,2.0,0,13,0,0,0,0,49.370000,49.030000,...,0,4,7,30,10,1,0,0,20,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,SEAFOOD,2017-08-27,3022139,,0,3,1,0,0,0,46.816667,47.233333,...,0,-5,8,40,20,0,1,0,20,60
54,SEAFOOD,2017-08-28,3023921,,0,3,1,0,0,0,46.400000,46.816667,...,0,-5,8,40,20,0,1,0,20,60
54,SEAFOOD,2017-08-29,3025703,,0,3,1,0,0,0,46.460000,46.400000,...,0,-5,8,40,20,0,1,0,20,60
54,SEAFOOD,2017-08-30,3027485,,0,3,1,0,0,0,45.960000,46.460000,...,0,-5,8,40,20,0,1,0,20,60


# Sklearn Random Forest Regression

In [47]:
from sklearn.linear_model import LinearRegression
# Linear Regression model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Random Forest and Gradient Boosting models
from sklearn.metrics import mean_squared_log_error as msle
# Metric for regression problems
from tqdm import tqdm
# Progress bar

from xgboost import XGBRegressor
# XGBoost model
from lightgbm import LGBMRegressor
# LightGBM model
from catboost import Pool, CatBoostRegressor
# CatBoost model

import optuna
# Optuna for hyperparameter tuning

In [48]:
"""
This function is used to optimize the hyperparameters of the model
"""

def objective(trial):
    # Define parameters
    params = {
        'criterion': 'squared_error',
        'bootstrap': trial.suggest_categorical('bootstrap',['True','False']),
        'max_depth': trial.suggest_int('max_depth', 1, 10000),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt','log2']),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 1, 10000),
        'n_estimators': trial.suggest_int('n_estimators', 30, 5000),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50)
    }
    weights_distribution = trial.suggest_categorical('weights_distribution', [1, 2, 3, 4, 5])
    
    # Define variables to store the result
    sm_rf = 0
    cnt = 0
    
    # Iterate through each store_id
    for i in df.index.get_level_values(0).unique():
        # Iterate through each dept_id
        for j in df.index.get_level_values(1).unique():
            # Select rows for a store_id-dept_id pair
            df_ = df.loc[(i, j)]
            # Drop 'id', 'transactions' columns
            df_ = df_.drop(columns=['id', 'transactions'])

            # Split train and test
            train = df_[~df_['sales'].isna()]
            X_test = df_[df_['sales'].isna()].drop(columns=['sales'])
            X_train, X_val, y_train, y_val = my_split_func(train)

            # Apply log transformation
            y_train = np.log1p(y_train)
            y_val = np.log1p(y_val)
            
            # Create model
            model = RandomForestRegressor(**params)
            # Define weights
            weights = get_weights_distribution(weights_distribution, X_train.index)
            # Fit the model
            model.fit(X_train, y_train, sample_weight=weights)
            # Make predictions
            preds = model.predict(X_val)
            
            # Calculate MSLE
            sm_rf += msle(np.exp(y_val) - 1, (np.exp(preds) - 1).clip(0))
            cnt += 1

    # Calculate RMSLE
    logs.append([(sm_rf / cnt)**0.5, params, weights_distribution])
    return (sm_rf / cnt)**0.5

In [49]:
# # 1. Create a study to track each trial.
# study = optuna.create_study(direction='minimize')

# # 2. Run the optimization task.
# study.optimize(objective, n_trials=1000)

In [50]:
# Read the sample submission file

ss = pd.read_csv('../input/store-sales-time-series-forecasting/sample_submission.csv')

In [52]:
# Best parameters from the trails.
good_p = [{'criterion': 'squared_error',
  'bootstrap': 'False',
  'max_depth': 9733,
  'max_features': 'auto',
  'max_leaf_nodes': 4730,
  'n_estimators': 700,
  'min_samples_split': 3,
  'min_samples_leaf': 8},] 

# Training

In [53]:
"""
    Creating a dictionary named fin_pred which will store the predictions for the test data. 
    Then, for each unique store and day, you are selecting the rows for that store and day, dropping the 'id' 
    and 'transactions' columns, splitting the data into train and validation sets, 
    fitting a random forest regressor on the train data and making predictions on the test data. 
    These predictions are then added to the fin_pred dictionary.
"""

# Set the random state for reproducibility
random_state = 0

# Initialize the counters
sm_rf = 0
cnt = 0

# Create a dictionary to store the predictions
fin_pred = {}

# Loop over the unique stores and days
for i in tqdm(df.index.get_level_values(0).unique()):
    for j in df.index.get_level_values(1).unique():
        
        # Get the data
        df_ = df.loc[(i, j)]
        
        # Get the ids
        test_id = df_[df_['sales'].isna()]['id']
        
        # Drop the id and transaction columns
        df_ = df_.drop(columns=['id', 'transactions'])

        # Get the train data
        train = df_[~df_['sales'].isna()]
        
        # Get the test data
        X_test = df_[df_['sales'].isna()].drop(columns=['sales'])
        
        # Split the data into train and validation sets
        X_train, X_val, y_train, y_val = my_split_func(train)

        # Log transform the sales
        y_train = np.log1p(y_train)
        # y_val = np.log1p(y_val)

        # Initialize the model
        model = RandomForestRegressor(**good_p[0], random_state=random_state)
        
        # Get the weights
        weights = get_weights_distribution(5, X_train.index)
        
        # Fit the model
        model.fit(X_train, y_train, sample_weight=weights)

        # Get the predictions
        # preds = model.predict(X_val)
        preds_ = model.predict(X_test)

        # Loop over the predictions
        for q in range(preds_.shape[0]):
            fin_pred[test_id[q]] = preds_[q]
        

100%|██████████| 54/54 [3:10:03<00:00, 211.17s/it]  


# Submission

In [54]:
"""
    This takes in a dictionary of sales predictions with store IDs as keys and
    sales predictions as values, and a dataframe of store IDs and their geographic
    locations, and returns a map of sales predictions by store locations.
"""

ss['sales'] = ss['id'].map(fin_pred)

In [55]:
"""applying the exponential transformation to the sales column."""

ss['sales'] = np.exp(ss['sales']) - 1

In [56]:
"""save the final predictions to the disk as submission.csv file."""

ss.to_csv('submission.csv', index=False)