# Store Sales - Time Series Forecasting
*Use machine learning to predict grocery sales*

In [2166]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid", palette="muted")

RANDOM_STATE = 101


### Data collection

In [2167]:
df_train = pd.read_csv(
    "./data/train.csv",
    usecols=["id", "date", "store_nbr", "family", "sales", "onpromotion"],
    dtype={
        "store_nbr": "category",
        "family": "category",
        "sales": "float32",
        "onpromotion": "uint32",
    },
    parse_dates=["date"],
)

df_train.shape

(3000888, 6)

In [2168]:
pd.concat([df_train.head(2), df_train.tail(2)])

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0


In [2169]:
df_test = pd.read_csv(
    "./data/test.csv",
    usecols=["id", "date", "store_nbr", "family", "onpromotion"],
    dtype={
        "store_nbr": "category",
        "family": "category",
        "onpromotion": "uint32",
    },
    parse_dates=["date"],
)

df_test.shape

(28512, 5)

In [2170]:
pd.concat([df_test.head(2), df_test.tail(2)])

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9
28511,3029399,2017-08-31,9,SEAFOOD,0


In [2171]:
df_holidays = pd.read_csv(
    "./data/holidays_events.csv",
    # dtype={
    #     "type": "category",
    #     "locale": "category",
    # },
    parse_dates=["date"],
    # index_col="date",
)

df_holidays.shape

(350, 6)

In [2172]:
pd.concat([df_holidays.head(2), df_holidays.tail(2)])

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False
349,2017-12-26,Additional,National,Ecuador,Navidad+1,False


In [2173]:
df_stores = pd.read_csv(
    "./data/stores.csv",
    dtype={
        "store_nbr": "category",
        "city": "category",
        "state": "category",
        "type": "category",
        "cluster": "category",
    },
)

df_holidays.shape

(350, 6)

In [2174]:
pd.concat([df_stores.head(2), df_stores.tail(2)])

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
52,53,Manta,Manabi,D,13
53,54,El Carmen,Manabi,C,3


In [2175]:
df_oil = pd.read_csv(
    "./data/oil.csv",
    parse_dates=["date"],
    # index_col="date",
)

df_oil.shape

(1218, 2)

In [2176]:
pd.concat([df_oil.head(2), df_oil.tail(2)])

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
1216,2017-08-30,45.96
1217,2017-08-31,47.26


In [2177]:
df_txns = pd.read_csv(
    "./data/transactions.csv",
    dtype={
        "store_nbr": "category",
        "transactions": "uint32",
    },
    parse_dates=["date"],
    # index_col="date",
)

df_txns.shape

(83488, 3)

In [2178]:
pd.concat([df_txns.head(2), df_txns.tail(2)])

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
83486,2017-08-15,53,932
83487,2017-08-15,54,802


### EDA

#### Check nulls

In [2179]:
def show_missing_data(df, df_name):
    """
    Display number and percentage of columns with any missing value
    """
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (
        ((df.isnull().sum() / df.isnull().count()) * 100)
        .sort_values(ascending=False)
        .round(2)
    )
    missing_data = pd.concat([total, percent], axis=1, keys=["# missing", "% missing"])
    
    if missing_data["# missing"].max() > 0:
        print(f'`{df_name}` has null values:')
        print(missing_data[missing_data["# missing"] > 0])
    else:
        print(f'`{df_name}` does not have null values')

In [2180]:
show_missing_data(df_train, 'df_train')
show_missing_data(df_test, 'df_test')
show_missing_data(df_txns, 'df_txns')
show_missing_data(df_holidays, 'df_holidays')
show_missing_data(df_oil, 'df_oil')

`df_train` does not have null values
`df_test` does not have null values
`df_txns` does not have null values
`df_holidays` does not have null values
`df_oil` has null values:
            # missing  % missing
dcoilwtico         43       3.53


### Data cleaning & Preprocessing

#### Manage nulls

In [2181]:
# Careful!! there can be 3 days in a row without values.
# -> TO BE UPDATED TO FIND ITERATIVELLY THE NEAREST VALUE.

def fill_missing_with_surrounding_mean(df, target_column, min_periods=2, decimals=2):
    """
    fill null values on field `target_column` with their surrounding `min_periods` peers
    """
    # Calculate the rolling mean with the specified minimum number of periods
    rolling_mean = (
        df[target_column]
        .rolling(window=min_periods * 2 + 1, min_periods=min_periods, center=True)
        .mean()
    )

    # Round the rolling mean to the specified number of decimal places
    rounded_mean = rolling_mean.round(decimals)

    # Fill missing values in the target column with the rolling mean
    df[target_column].fillna(rounded_mean, inplace=True)

    return df

# Alternatively:
# df_oil['dcoilwtico'].fillna(method='backfill', inplace=True)

### Process dataset

In [2182]:
# df_oil = fill_missing_with_surrounding_mean(df_oil, 'dcoilwtico', 1)
# show_missing_data(df_oil, 'df_oil')

### EDA

In [2183]:
# analyses will be done at (entire) day level, where specific time is not relevant,
# so converting to Period object

# but... according to gpt: in the context of time series modeling, especially when 
# using functions like DeterministicProcess from statsmodels, it's generally more 
# straightforward to keep dates as a DateTime type rather than converting them to
#  a Period. The DateTimeIndex in pandas is more widely supported for time series 
# analysis and makes certain operations more straightforward.

# df_train['date'] = df_train.date.dt.to_period('D')
# df_test['date'] = df_test.date.dt.to_period('D')
# df_holidays['date'] = df_holidays.date.dt.to_period('D')
# df_oil['date'] = df_oil.date.dt.to_period('D')
# df_txns['date'] = df_txns.date.dt.to_period('D')


In [2184]:
# sns.histplot(np.log1p(df_train['sales']), kde=True)

In [2185]:
# df_train['sales'].plot(figsize=(10, 6))
# plt.title('Sales over Time')
# plt.show()

In [2186]:
# plt.figure(figsize=(5,3))
# sns.lineplot(data=df_oil, x='date', y='dcoilwtico')

### Feature engineering

In [2187]:
def lag_features(df, lags):
    for lag in lags:
        # Create column 'sales_t-lag' by taking previous values ​​of column 'sales' based on columns 'store_nbr' and 'family'
        df[f"sales_t-{lag}"] = df.groupby(["store_nbr", "family"])["sales"].transform(
            lambda x: x.shift(lag)
        )
    return df

In [2188]:
def fill_na(df):
    df["holiday_type"] = df["holiday_type"].fillna("Common")
    df["locale"] = df["locale"].fillna("Common")
    df["description"] = df["description"].fillna("Unknown")
    df["transferred"] = df["transferred"].fillna(False)
    # TODO: to be replaced by fill_missing_with_surrounding_mean()
    df["dcoilwtico"] = df["dcoilwtico"].fillna(method="backfill")
    df["transactions"] = df["transactions"].fillna(0)

    return df

In [2189]:
def merge_data(df):
    df = (
        df.merge(df_stores, left_on="store_nbr", right_on="store_nbr", how="left")
        .rename(columns={"type": "store_type"})
        .merge(
            df_txns,
            left_on=["date", "store_nbr"],
            right_on=["date", "store_nbr"],
            how="left",
        )
        .merge(df_holidays, left_on="date", right_on="date", how="left")
        .drop_duplicates(subset="id")
        .rename(columns={"type": "holiday_type"})
        .merge(df_oil, left_on="date", right_on="date", how="left")
    )
    return df.drop('id', axis=1)

In [2190]:
def create_date_features(df):
    df["month"] = df["date"].dt.month
    df["day_of_month"] = df["date"].dt.day
    df["day_of_year"] = df["date"].dt.dayofyear
    # df["week_of_year"] = df["date"].dt.isocalendar().week
    df["day_of_week"] = df["date"].dt.dayofweek
    df["year"] = df["date"].dt.year
    return df

In [2191]:
def create_date_features2(df):
    # Convert PeriodIndex to DateTimeIndex
    datetime_index = df.index.to_timestamp()

    # Create date features
    df["month"] = datetime_index.month
    df["day_of_month"] = datetime_index.day
    df["day_of_year"] = datetime_index.dayofyear
    # df["week_of_year"] = datetime_index.isocalendar().week.astype('int')
    df["day_of_week"] = datetime_index.dayofweek
    df["year"] = datetime_index.year
    df["new_year"] = datetime_index.dayofyear == 1

    return df

In [2192]:
def random_noise(dataframe):
    """
    Generate random noise with a normal distribution (mean=0, std=2)
    matching the length of the dataframe.
    """
    return np.random.normal(scale=2.0, size=(len(dataframe),))

In [2193]:
def roll_mean_features(dataframe, windows):
    """
    Calculate rolling mean with triangular window and custom
    noise for 'sales' within 'store_nbr' and 'family' groups.
    """
    def add_noise(x):
        return x + np.random.normal(size=len(x))

    for window in windows:
        dataframe["sales_roll_mean_" + str(window)] = dataframe.groupby(
            ["store_nbr", "family"]
        )["sales"].transform(
            lambda x: x.shift(16)
            .rolling(window=window, min_periods=7, win_type="triang")
            .mean()
        )
        dataframe["sales_roll_mean_" + str(window)] = dataframe.groupby(
            ["store_nbr", "family"]
        )["sales_roll_mean_" + str(window)].transform(add_noise)
    return dataframe

In [2194]:
def ewma_features(dataframe, alphas, lags):
    """
    Create Exponentially Weighted Moving Average (EWMA) features for 'sales'
    with specified alphas and lags, grouped by 'store_nbr' and 'family'.
    """
    for alpha in alphas:
        for lag in lags:
            feature_name = (
                "sales_ewm_alpha_" + str(alpha).replace(".", "") + "_lag_" + str(lag)
            )
            dataframe[feature_name] = dataframe.groupby(["store_nbr", "family"])[
                "sales"
            ].transform(lambda x: x.shift(lag).ewm(alpha=alpha, min_periods=1).mean())
    return dataframe

In [2195]:
# choose relevant features for regression models
col = [
    "date",
    "store_nbr",
    "family",
    "sales",
    "onpromotion",
    "cluster",
    "holiday_type",
    "locale",
    # "description",
    "transferred",
    "dcoilwtico",
]

In [2196]:
df_train['sales'] = np.log1p(df_train['sales'])

df_both = pd.concat([df_train, df_test], axis=0)
df_both = merge_data(df_both)
df_both = fill_na(df_both)
df_both = create_date_features(df_both)
# df_both = lag_features(
#     df_both, lags=[*range(1, 16), 16, 17, 18, 19, 20, 21, 22, 30, 31, 90, 180, 364]
# )
# df_both = roll_mean_features(df_both, [16, 17, 18, 30])
df_both = df_both[col]

  df["dcoilwtico"] = df["dcoilwtico"].fillna(method="backfill")


In [2197]:
# alphas = [0.95, 0.9, 0.8, 0.5]
# lags =[1, 7,30]
# all_df = ewma_features(df_all, alphas, lags)

In [2198]:
# df_all["store_nbr"] = df_all["store_nbr"].astype("category")
# df_all["family"] = df_all["family"].astype("category")
# df_all["store_nbr"] = df_all["store_nbr"].astype("category")
# df_all["cluster"] = df_all["cluster"].astype("category")
# df_all["family"] = df_all["family"].astype("category")
# df_all["holiday_type"] = df_all["holiday_type"].astype("category")
# df_all["locale"] = df_all["locale"].astype("category")
# df_all["description"] = df_all["description"].astype("category")

In [2199]:
# df_both = df_both[df_both['date'] > '2013-12-31']
# X = df_both[df_both['date'] <= '2017-08-15'].drop('date', axis=1)
# X_test = df_both[df_both['date'] > '2017-08-15'].drop('date', axis=1)
# corr = X.select_dtypes(include=['number']).corr()
# corr["sales"].sort_values(ascending=False)
# y = X['sales']
# X = X.drop(['sales'], axis=1)
# X_test = X_test.drop(['sales'], axis=1)


In [2200]:
# TBC: start from 2014 to have the preceeding Y-1 ?
df_both_filtered = df_both[df_both['date'] > '2013-12-31']

### Encoding categorical data

In [2201]:
def encode_features(df):
    """Encode categorical features"""
    columns_to_encode = df.select_dtypes(
        include=["object", "category"]
    ).columns.tolist()
    encoded_df_train = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

    # replace spaces in df columns for lightxgb model
    encoded_df_train.columns = encoded_df_train.columns.str.replace(" ", "_")

    return encoded_df_train

In [2202]:
df_both_encoded = encode_features(df_both_filtered)

In [2203]:
# X = df_all_encoded[df_all_encoded["date"] <= "2017-08-15"].drop(
#     ["sales", "date"], axis=1
# )
# y = df_all_encoded[df_all_encoded["date"] <= "2017-08-15"]["sales"]

df_train_encoded = df_both_encoded[df_both_encoded["date"] <= "2017-08-15"]
df_test_encoded = df_both_encoded[df_both_encoded["date"] > "2017-08-15"]

In [2204]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=RANDOM_STATE
# )

In [2205]:
def rmsle(y_true_log, y_pred_log):
    # """
    # Compute Root Mean Squared Logarithmic Error for log-transformed targets.
    # Parameters:
    # - y_true_log: Log-transformed actual values
    # - y_pred_log: Log-transformed predicted values
    # """
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))

### Model v2

### Pre-processing

In [2206]:
# convert field `date` from type datetime64[ns] to period[D]
df_train['date'] = df_train.date.dt.to_period('D')

# remove 'id'ArithmeticError
df_train.drop('id', axis=1, inplace=True)

# create index
df_train = df_train.set_index(["store_nbr", "family", "date"]).sort_index()

df_train.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.0,0
1,AUTOMOTIVE,2013-01-02,1.098612,0


### Feature engineering

In [2207]:
"""
1) Unstack converts levels of a multi-index into column headers, transforming the
data from a long to a wide format.
2) if the index is a datetime or period index, you can use .loc[] with a string
that representsa date or a part of a date (like just the year) to filter the data.
"""
y = df_train.unstack(['store_nbr', 'family']).loc["2017"]
y.head(2)

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2017-01-02,1.791759,0.0,0.0,7.26892,0.0,5.122886,0.0,5.808143,5.932245,3.828207,...,0,0,13,0,0,2,1,4,0,0


In [2208]:
"""
Create seasonal features using DeterministicProcess. In this case, 2 seasonal features:
1) Weekly -> seasonal=True
2) Monthly -> CalendarFourier
"""
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

fourier = CalendarFourier(
    freq="M", order=3
)  # 3 sin/cos pairs for "M"onthly seasonality

dp = DeterministicProcess(
    index=y.index,
    constant=False,  # dummy feature for bias (y-intercept)
    order=5,  # 5th degree polynomial trend
    seasonal=True,  # weekly seasonality (indicators), because date is at day level
    additional_terms=[fourier],  # monthly seasonality (fourier)
    drop=True,  # drop terms to avoid collinearity
)

X = dp.in_sample()

In [2209]:
X.head(2)

Unnamed: 0_level_0,trend,trend_squared,trend_cubed,trend**4,trend**5,"s(1,7)","s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2017-01-02,2.0,4.0,8.0,16.0,32.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.97953,0.394356,0.918958,0.571268,0.820763


In [2220]:
X = create_date_features2(X)
X.head(3)

Unnamed: 0_level_0,trend,trend_squared,trend_cubed,trend**4,trend**5,"s(1,7)","s(2,7)","s(3,7)","s(4,7)","s(5,7)",...,"sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)",month,day_of_month,day_of_year,day_of_week,year,new_year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1,1,1,6,2017,True
2017-01-02,2.0,4.0,8.0,16.0,32.0,0.0,1.0,0.0,0.0,0.0,...,0.394356,0.918958,0.571268,0.820763,1,2,2,0,2017,False
2017-01-03,3.0,9.0,27.0,81.0,243.0,0.0,0.0,1.0,0.0,0.0,...,0.724793,0.688967,0.937752,0.347305,1,3,3,1,2017,False


### Feature engineering

In [2211]:
# # Create date features in df_train
# df_train = create_date_features_from_index(df_train)

# # Ensure df_train is indexed by 'date' to match X
# df_train_reset = df_train.reset_index().set_index('date')

# # Merge X with the modified df_train
# X = X.merge(
#     df_train_reset[
#         ["month", "day_of_month", "day_of_year", "week_of_year", "day_of_week", "year"]
#     ],
#     left_index=True, 
#     right_index=True,
#     how="left"
# )

# # Check for NaNs after merging
# print("NaNs in X after merge:", X.isna().sum().sum())

In [2212]:
y.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 227 entries, 2017-01-01 to 2017-08-15
Freq: D
Columns: 3564 entries, ('sales', '1', 'AUTOMOTIVE') to ('onpromotion', '9', 'SEAFOOD')
dtypes: float32(1782), uint32(1782)
memory usage: 3.1 MB


### Split Train/Test

In [2213]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)

In [2214]:
from sklearn.preprocessing import RobustScaler

transformerL = RobustScaler().fit(X_train)

X_test = transformerL.transform(X_test)
X_train = transformerL.transform(X_train)

In [2215]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_models(regressors, X, y):
    results = []
    for regressor in regressors:
        y_pred = regressor.predict(X)
        # TODO: shouldn't happen, but...
        y_pred[y_pred < 0] = 0
        mae = mean_absolute_error(y, y_pred)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        results.append(
            {
                "Model": type(regressor).__name__,
                "MAE": mae,
                "RMSE": rmse,
            }
        )
    sorted_results = sorted(results, key=lambda x: x["RMSE"], reverse=False)
    print(
        f"{'Model':<25} {'RMSE':<11} {'MAE':<8}"
    )
    print("-" * 72)
    for result in sorted_results:
        metrics = f"{result['RMSE']:<11.5f} {result['MAE']:<8.2f}"
        print(f"{result['Model']:<25} {metrics}")

### Lasso & Ridge

In [2216]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import metrics

lasso = Lasso(alpha=1, fit_intercept=True, max_iter=7000).fit(X_train, y_train)

ridge = Ridge(alpha=0.4, fit_intercept=True, max_iter=7000).fit(X_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [2217]:
models = [ridge, lasso]
evaluate_models(models, X_test, y_test)

Model                     RMSE        MAE     
------------------------------------------------------------------------
Ridge                     3.21603     1.05    
Lasso                     3.67274     1.34    


Model                     RMSE        MAE     
------------------------------------------------------------------------
Ridge                     3.23429     1.06    
Lasso                     3.67253     1.34   

In [2218]:
"""
Based on your description, df_test has 28,512 records, spanning dates from 2017-08-16 to 2017-08-31,
with approximately 1,782 records per day. This structure suggests that for each day, there are 1,782
combinations of 'store_nbr' and 'family' products. Given that there are 16 days in your test date range,
this aligns with the total record count (16 days * 1,782 records/day = 28,512 records).
"""

"\nBased on your description, df_test has 28,512 records, spanning dates from 2017-08-16 to 2017-08-31,\nwith approximately 1,782 records per day. This structure suggests that for each day, there are 1,782\ncombinations of 'store_nbr' and 'family' products. Given that there are 16 days in your test date range,\nthis aligns with the total record count (16 days * 1,782 records/day = 28,512 records).\n"

In [2219]:
import copy

df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = (X_test.index.dayofyear == 1)

X_test_copy = copy.deepcopy(X_test)

X_test = transformerL.transform(X_test)

predictionL = lasso.predict(X_test)
predictionL[predictionL < 0] = 0

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- NewYear
Feature names seen at fit time, yet now missing:
- day_of_month
- day_of_week
- day_of_year
- month
- new_year
- ...


In [None]:
y_submit = pd.DataFrame(np.expm1(predictionL), index=X_test_copy.index, columns=y.columns)
# y_submit = pd.DataFrame(predictionL, index=X_test_copy.index, columns=y.columns)
y_submit = y_submit.stack(['store_nbr', 'family'])
y_submit = y_submit.join(df_test.id).reindex(columns=['id', 'sales'])

In [None]:
y_submit

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,sales
date,store_nbr,family,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-16,1,AUTOMOTIVE,3000888,3.283707
2017-08-16,1,BABY CARE,3000889,0.000000
2017-08-16,1,BEAUTY,3000890,2.689846
2017-08-16,1,BEVERAGES,3000891,2004.868932
2017-08-16,1,BOOKS,3000892,0.320290
...,...,...,...,...
2017-08-31,9,POULTRY,3029395,418.607620
2017-08-31,9,PREPARED FOODS,3029396,117.379339
2017-08-31,9,PRODUCE,3029397,1579.561665
2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,3029398,3.194469


In [None]:
# y_submit.to_csv('submission.csv', index=False)

In [None]:
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=13)

# from sklearn.preprocessing import RobustScaler

# transformerKERAS = RobustScaler().fit(X_train)

# X_val = transformerKERAS.transform(X_val)
# X_train = transformerKERAS.transform(X_train)

# from tensorflow.keras.callbacks import ReduceLROnPlateau
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, min_lr=0.000001, verbose=1, mode='min')

# def create_model():

#     model = Sequential()
#     model.add(Dense(units=500, activation='relu', input_dim=19))
#     model.add(Dense(units=2000, activation='relu'))
#     model.add(Dense(units=1500, activation='relu'))
#     model.add(Dense(units=1782, activation='swish'))

#     model.compile(loss='mae', optimizer='adam')
    
#     return model

# model = create_model()
# model.fit(X_train, y_train, epochs=5000, batch_size=2000, validation_data=(X_val, y_val),callbacks=[reduce_lr])

# y_pred1 = model.predict(X_val)
# #print(model1.score(X_val, y_val))

# y_pred1[y_pred1 < 0] = 0

# print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred1))
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred1)))

# print('======')