# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Loading Dataset

In [2]:
holidays_events = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv", parse_dates=['date'])
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv", parse_dates=['date'])
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv", parse_dates=['date'])

test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv", parse_dates=['date'])
train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv", parse_dates=['date'])

# Preparing Dataset

In [3]:
families = train["family"].unique()
print(f"Unique families: {families}")

Unique families: ['AUTOMOTIVE' 'BABY CARE' 'BEAUTY' 'BEVERAGES' 'BOOKS' 'BREAD/BAKERY'
 'CELEBRATION' 'CLEANING' 'DAIRY' 'DELI' 'EGGS' 'FROZEN FOODS' 'GROCERY I'
 'GROCERY II' 'HARDWARE' 'HOME AND KITCHEN I' 'HOME AND KITCHEN II'
 'HOME APPLIANCES' 'HOME CARE' 'LADIESWEAR' 'LAWN AND GARDEN' 'LINGERIE'
 'LIQUOR,WINE,BEER' 'MAGAZINES' 'MEATS' 'PERSONAL CARE' 'PET SUPPLIES'
 'PLAYERS AND ELECTRONICS' 'POULTRY' 'PREPARED FOODS' 'PRODUCE'
 'SCHOOL AND OFFICE SUPPLIES' 'SEAFOOD']


In [4]:
def get_time_features():
    calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))
    calendar['year'] = calendar.index.year.astype('int')
    calendar['quarter'] = calendar.index.quarter.astype('int')
    calendar['month'] = calendar.index.month.astype('int')
    calendar['day'] = calendar.index.day.astype('int')
    calendar['dayofweek'] = calendar.index.day_of_week.astype('int')
    calendar['weekofyear'] = calendar.index.week.astype('int')
    calendar['isweekend'] = calendar.dayofweek.apply(lambda x: 1 if x in (5,6) else 0)
    calendar['startschool'] = calendar.month.apply(lambda x: 1 if x in (4,5,8,9) else 0)

    calendar['daysinmonth'] = calendar.index.days_in_month.astype('int')

    calendar.index.rename("date", inplace=True)
    calendar = pd.get_dummies(calendar, columns=['year'], drop_first=True)
    calendar = pd.get_dummies(calendar, columns=['quarter'], drop_first=True)
    calendar = pd.get_dummies(calendar, columns=['dayofweek'], drop_first=True)

    fourierA = CalendarFourier(freq='A', order=5)
    fourierM = CalendarFourier(freq='M', order=2)
    fourierW = CalendarFourier(freq='W', order=4)

    dp = DeterministicProcess(index=calendar.index,
                              order=1,
                              seasonal=True,
                              constant=False,
                              additional_terms=[fourierA, fourierM, fourierW],
                              drop=True)
    dp_df = dp.in_sample()
    calendar = pd.concat([calendar, dp_df], axis=1)
    return calendar
    

In [5]:
def get_oil_features(calendar):
    oil_df = oil.copy()
    oil_df = pd.merge(calendar.reset_index(), oil_df, left_on='date', how='left', right_on='date')
    oil_df.fillna(method='bfill', inplace=True)

    moving_average_periods = [7, 14, 30, 120, 180, 365]
    for mv in moving_average_periods:
        oil_df[f'mavg_oil_{mv}'] = oil_df['dcoilwtico'].rolling(mv).mean()

    for i in [1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 90]:
        oil_df['lagoil_' + str(i) ] = oil_df['dcoilwtico'].shift(i)
    oil_df.dropna(inplace=True)
    oil_df["date"] = oil_df["date"].dt.to_period('D')
    oil_df = oil_df.set_index("date")
    return oil_df

In [6]:
def get_family_sales(dataframe):
    df = dataframe.copy()
    df['date'] = df.date.dt.to_period('D')
    df = df.set_index(['store_nbr', 'family', 'date']).sort_index()
    return df.groupby(['family', 'date']).mean().drop(columns=["id", "onpromotion"])

In [7]:
calendar = get_time_features()
calendar

  


Unnamed: 0_level_0,month,day,weekofyear,isweekend,startschool,daysinmonth,year_2014,year_2015,year_2016,year_2017,...,"sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","cos(3,freq=W-SUN)","sin(4,freq=W-SUN)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1,1,1,0,0,31,0,0,0,0,...,0.000000,1.000000,0.000000,1.000000,0.781831,0.623490,0.974928,-0.222521,-0.900969,-0.433884
2013-01-02,1,2,1,0,0,31,0,0,0,0,...,0.201299,0.979530,0.394356,0.918958,0.974928,-0.222521,-0.433884,-0.900969,0.623490,0.781831
2013-01-03,1,3,1,0,0,31,0,0,0,0,...,0.394356,0.918958,0.724793,0.688967,0.433884,-0.900969,-0.781831,0.623490,-0.222521,-0.974928
2013-01-04,1,4,1,0,0,31,0,0,0,0,...,0.571268,0.820763,0.937752,0.347305,-0.433884,-0.900969,0.781831,0.623490,-0.222521,0.974928
2013-01-05,1,5,1,1,0,31,0,0,0,0,...,0.724793,0.688967,0.998717,-0.050649,-0.974928,-0.222521,0.433884,-0.900969,0.623490,-0.781831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-27,8,27,34,1,1,31,0,0,0,1,...,-0.848644,0.528964,-0.897805,-0.440394,-0.781831,0.623490,-0.974928,-0.222521,-0.900969,0.433884
2017-08-28,8,28,35,0,1,31,0,0,0,1,...,-0.724793,0.688967,-0.998717,-0.050649,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000
2017-08-29,8,29,35,0,1,31,0,0,0,1,...,-0.571268,0.820763,-0.937752,0.347305,0.781831,0.623490,0.974928,-0.222521,-0.900969,-0.433884
2017-08-30,8,30,35,0,1,31,0,0,0,1,...,-0.394356,0.918958,-0.724793,0.688967,0.974928,-0.222521,-0.433884,-0.900969,0.623490,0.781831


In [8]:
oil_df = get_oil_features(calendar)
oil_df

Unnamed: 0_level_0,month,day,weekofyear,isweekend,startschool,daysinmonth,year_2014,year_2015,year_2016,year_2017,...,lagoil_4,lagoil_5,lagoil_6,lagoil_7,lagoil_10,lagoil_14,lagoil_21,lagoil_30,lagoil_60,lagoil_90
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-31,12,31,1,0,0,31,0,0,0,0,...,99.94,99.18,99.18,98.87,98.62,96.99,98.32,93.61,94.56,104.15
2014-01-01,1,1,1,0,0,31,1,0,0,0,...,98.90,99.94,99.18,99.18,98.62,97.59,97.25,93.61,94.58,103.29
2014-01-02,1,2,1,0,0,31,1,0,0,0,...,98.90,98.90,99.94,99.18,98.62,98.40,97.21,95.83,94.58,103.83
2014-01-03,1,3,1,0,0,31,1,0,0,0,...,98.90,98.90,98.90,99.94,98.87,99.11,96.27,96.97,94.58,103.07
2014-01-04,1,4,1,1,0,31,1,0,0,0,...,98.17,98.90,98.90,98.90,99.18,98.62,97.18,97.14,93.40,103.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-27,8,27,34,1,1,31,0,0,0,1,...,48.45,47.65,47.39,47.39,47.07,47.59,49.37,49.72,44.74,49.63
2017-08-28,8,28,35,0,1,31,0,0,0,1,...,47.24,48.45,47.65,47.39,48.59,47.59,49.37,50.21,44.88,49.63
2017-08-29,8,29,35,0,1,31,0,0,0,1,...,47.65,47.24,48.45,47.65,47.39,47.57,49.07,50.21,46.02,48.29
2017-08-30,8,30,35,0,1,31,0,0,0,1,...,46.40,47.65,47.24,48.45,47.39,46.80,49.59,50.21,45.11,48.32


In [9]:
family_sales = get_family_sales(train)
family_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,date,Unnamed: 2_level_1
AUTOMOTIVE,2013-01-01,0.000000
AUTOMOTIVE,2013-01-02,4.722222
AUTOMOTIVE,2013-01-03,2.981481
AUTOMOTIVE,2013-01-04,3.129630
AUTOMOTIVE,2013-01-05,6.333333
...,...,...
SEAFOOD,2017-08-11,23.566963
SEAFOOD,2017-08-12,19.037593
SEAFOOD,2017-08-13,20.704574
SEAFOOD,2017-08-14,17.975556


In [10]:
merged_df = family_sales.reset_index().merge(oil_df.reset_index(), left_on='date', right_on='date')
merged_df = merged_df.set_index("family").sort_index()
merged_df

Unnamed: 0_level_0,date,sales,month,day,weekofyear,isweekend,startschool,daysinmonth,year_2014,year_2015,...,lagoil_4,lagoil_5,lagoil_6,lagoil_7,lagoil_10,lagoil_14,lagoil_21,lagoil_30,lagoil_60,lagoil_90
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AUTOMOTIVE,2013-12-31,4.425926,12,31,1,0,0,31,0,0,...,99.94,99.18,99.18,98.87,98.62,96.99,98.32,93.61,94.56,104.15
AUTOMOTIVE,2014-06-17,5.055556,6,17,25,0,0,30,1,0,...,107.49,107.20,105.04,105.02,105.09,103.34,104.78,102.95,104.35,100.71
AUTOMOTIVE,2016-05-11,5.833333,5,11,19,0,1,31,0,0,...,43.45,44.58,44.33,43.77,44.75,45.29,42.72,40.46,37.20,26.19
AUTOMOTIVE,2015-04-06,5.314815,4,6,15,0,1,30,0,1,...,49.13,50.12,47.72,48.66,48.83,47.40,43.93,49.95,50.48,47.98
AUTOMOTIVE,2016-05-12,5.055556,5,12,19,0,1,31,0,0,...,43.45,43.45,44.58,44.33,44.75,46.03,43.18,42.12,37.20,29.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SEAFOOD,2017-04-25,14.884852,4,25,17,0,1,30,0,0,...,49.64,50.26,50.49,52.46,52.62,53.38,50.99,47.02,53.99,52.14
SEAFOOD,2014-04-21,18.443093,4,21,17,0,1,30,1,0,...,104.33,103.71,103.70,104.05,103.68,100.43,101.57,100.05,103.20,94.51
SEAFOOD,2016-10-13,15.217407,10,13,41,0,0,31,0,0,...,49.76,49.76,49.76,50.44,48.80,47.72,46.10,44.91,45.72,45.93
SEAFOOD,2015-01-07,24.158204,1,7,2,0,0,31,0,1,...,50.05,52.72,52.72,53.45,53.46,55.70,56.43,63.13,77.43,85.76


In [11]:
test_df = test.copy()
test_df["date"] = test_df["date"].dt.to_period('D')
test_df = test_df.drop(columns=["id", "store_nbr", "onpromotion"])
test_df = test_df.drop_duplicates()
test_df = test_df.merge(oil_df.reset_index(), left_on='date', right_on='date').set_index(["family", "date"]).sort_index()
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,month,day,weekofyear,isweekend,startschool,daysinmonth,year_2014,year_2015,year_2016,year_2017,...,lagoil_4,lagoil_5,lagoil_6,lagoil_7,lagoil_10,lagoil_14,lagoil_21,lagoil_30,lagoil_60,lagoil_90
family,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AUTOMOTIVE,2017-08-16,8,16,33,0,1,31,0,0,0,1,...,47.59,48.81,48.54,49.59,49.37,49.60,48.58,46.02,44.24,49.36
AUTOMOTIVE,2017-08-17,8,17,33,0,1,31,0,0,0,1,...,47.59,47.59,48.81,48.54,49.37,49.03,49.05,46.40,44.24,50.32
AUTOMOTIVE,2017-08-18,8,18,33,0,1,31,0,0,0,1,...,47.59,47.59,47.59,48.81,49.07,49.57,49.72,47.10,44.24,50.81
AUTOMOTIVE,2017-08-19,8,19,33,1,1,31,0,0,0,1,...,47.57,47.59,47.59,47.59,49.59,49.37,50.21,46.73,43.34,50.81
AUTOMOTIVE,2017-08-20,8,20,33,1,1,31,0,0,0,1,...,46.80,47.57,47.59,47.59,48.54,49.37,50.21,45.78,42.48,50.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SEAFOOD,2017-08-27,8,27,34,1,1,31,0,0,0,1,...,48.45,47.65,47.39,47.39,47.07,47.59,49.37,49.72,44.74,49.63
SEAFOOD,2017-08-28,8,28,35,0,1,31,0,0,0,1,...,47.24,48.45,47.65,47.39,48.59,47.59,49.37,50.21,44.88,49.63
SEAFOOD,2017-08-29,8,29,35,0,1,31,0,0,0,1,...,47.65,47.24,48.45,47.65,47.39,47.57,49.07,50.21,46.02,48.29
SEAFOOD,2017-08-30,8,30,35,0,1,31,0,0,0,1,...,46.40,47.65,47.24,48.45,47.39,46.80,49.59,50.21,45.11,48.32


# Modeling

In [12]:
RF_param = {
    'criterion': 'squared_error',
    'bootstrap': 'False',
    'max_depth': 9733,
    'max_features': 'auto',
    'max_leaf_nodes': 4730,
    'n_estimators': 159,
    'min_samples_split': 3,
    'min_samples_leaf': 8
}
default_models = {
    "linear_reg": LinearRegression(),
    "random_forest": RandomForestRegressor(**RF_param, random_state=0),
    "xgb": XGBRegressor(n_estimators=500)
}

def get_model(name):
    return default_models.get(name, default_models.get("random_forest"))

In [13]:
def train_model(X, y, model):
    X_train, X_val = train_test_split(X, test_size=0.1, shuffle=False)
    y_train, y_val = train_test_split(y, test_size=0.1, shuffle=False)
    
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train).clip(0.0)
    pred_val = model.predict(X_val).clip(0.0)
    train_perf = mean_squared_log_error(y_train, pred_train)
    val_perf = mean_squared_log_error(y_val, pred_val)
    print(f"train_perf: {train_perf}; val_perf: {val_perf}") 
    
    model.fit(X, y)
    return model
        
      
def get_prediction(X, model):
    preds = pd.DataFrame()
    pred = model.predict(X).clip(0.0)
    return np.exp(pred) - 1

In [14]:
import warnings; warnings.simplefilter('ignore')

In [15]:
all_preds = {}
for family in tqdm(families):
    X = merged_df.loc[family].reset_index().drop(columns=['family', 'date'])
    X_test = test_df.loc[family].reset_index().drop(columns=['date'])
    y = np.log1p(X['sales'])
    X = X.drop(columns=['sales'])
    model = get_model("random_forest")
    train_model(X, y, model)
    all_preds[family] = get_prediction(X_test, model)

  0%|          | 0/33 [00:00<?, ?it/s]

train_perf: 0.002836370285414381; val_perf: 0.01234325117701703


  3%|▎         | 1/33 [00:05<03:11,  5.99s/it]

train_perf: 0.001565459672415841; val_perf: 0.0031449816419343046


  6%|▌         | 2/33 [00:11<02:59,  5.79s/it]

train_perf: 0.003427353362530399; val_perf: 0.0027759407130513655


  9%|▉         | 3/33 [00:17<02:51,  5.71s/it]

train_perf: 0.000661105160317459; val_perf: 0.0029469596132325284


 12%|█▏        | 4/33 [00:23<02:46,  5.74s/it]

train_perf: 0.000489830952152268; val_perf: 0.0006082945449292241


 15%|█▌        | 5/33 [00:25<02:06,  4.53s/it]

train_perf: 0.0012855136935318967; val_perf: 0.00018158720872911739


 18%|█▊        | 6/33 [00:31<02:17,  5.09s/it]

train_perf: 0.007156304390445262; val_perf: 0.019432047802060218


 21%|██        | 7/33 [00:37<02:15,  5.20s/it]

train_perf: 0.0015180822787427665; val_perf: 0.0051566025869239475


 24%|██▍       | 8/33 [00:43<02:20,  5.62s/it]

train_perf: 0.001324227123327964; val_perf: 0.00023155032634680078


 27%|██▋       | 9/33 [00:49<02:17,  5.71s/it]

train_perf: 0.0020851607571535924; val_perf: 0.00026863610545076053


 30%|███       | 10/33 [00:55<02:14,  5.83s/it]

train_perf: 0.0014010399830272034; val_perf: 0.01387818289663156


 33%|███▎      | 11/33 [01:01<02:10,  5.94s/it]

train_perf: 0.0010663704142422057; val_perf: 0.00045433084784689384


 36%|███▋      | 12/33 [01:08<02:07,  6.06s/it]

train_perf: 0.0005633657946104845; val_perf: 0.007355967056909426


 39%|███▉      | 13/33 [01:14<02:02,  6.12s/it]

train_perf: 0.004038012332486539; val_perf: 0.0008840056785970715


 42%|████▏     | 14/33 [01:21<02:02,  6.44s/it]

train_perf: 0.002593893553570102; val_perf: 0.0037681449240614603


 45%|████▌     | 15/33 [01:27<01:53,  6.30s/it]

train_perf: 0.01124034614649347; val_perf: 0.021322011748408438


 48%|████▊     | 16/33 [01:33<01:46,  6.25s/it]

train_perf: 0.006975756002309805; val_perf: 0.03440193880502169


 52%|█████▏    | 17/33 [01:39<01:39,  6.22s/it]

train_perf: 0.0019884882790415907; val_perf: 0.004378995594383105


 55%|█████▍    | 18/33 [01:45<01:31,  6.09s/it]

train_perf: 0.009465891980087779; val_perf: 0.040877848026927434


 58%|█████▊    | 19/33 [01:51<01:22,  5.89s/it]

train_perf: 0.00675085768162422; val_perf: 0.020490084686854246


 61%|██████    | 20/33 [01:56<01:13,  5.68s/it]

train_perf: 0.0041662790404152945; val_perf: 0.003956219917093607


 64%|██████▎   | 21/33 [02:02<01:08,  5.75s/it]

train_perf: 0.002699148546785242; val_perf: 0.008953882397006856


 67%|██████▋   | 22/33 [02:07<01:02,  5.70s/it]

train_perf: 0.030357907544144285; val_perf: 0.03796556134140852


 70%|██████▉   | 23/33 [02:13<00:58,  5.87s/it]

train_perf: 0.005238467871588082; val_perf: 0.013322435525815171


 73%|███████▎  | 24/33 [02:19<00:51,  5.78s/it]

train_perf: 0.0018573017164648905; val_perf: 0.003613418405143675


 76%|███████▌  | 25/33 [02:25<00:46,  5.86s/it]

train_perf: 0.002840121968178886; val_perf: 0.0005715064465923886


 79%|███████▉  | 26/33 [02:31<00:41,  5.98s/it]

train_perf: 0.004329748739384657; val_perf: 0.019312122134849025


 82%|████████▏ | 27/33 [02:37<00:34,  5.74s/it]

train_perf: 0.006976503614697476; val_perf: 0.02400081604949962


 85%|████████▍ | 28/33 [02:42<00:27,  5.59s/it]

train_perf: 0.002477688172662764; val_perf: 0.00018618042303505697


 88%|████████▊ | 29/33 [02:48<00:23,  5.88s/it]

train_perf: 0.003002533277402836; val_perf: 0.000334760911433276


 91%|█████████ | 30/33 [02:54<00:17,  5.97s/it]

train_perf: 0.005613803211392335; val_perf: 0.019430688981244854


 94%|█████████▍| 31/33 [03:00<00:11,  5.86s/it]

train_perf: 0.006731401985292704; val_perf: 0.01776781019772799


 97%|█████████▋| 32/33 [03:06<00:05,  5.76s/it]

train_perf: 0.004678626467285942; val_perf: 0.0009233518564971678


100%|██████████| 33/33 [03:12<00:00,  5.83s/it]


In [16]:
test_pred = test.copy()
test_pred = test_pred.set_index(['store_nbr', 'family'])

test_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,id,date,onpromotion
store_nbr,family,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,3000888,2017-08-16,0
1,BABY CARE,3000889,2017-08-16,0
1,BEAUTY,3000890,2017-08-16,2
1,BEVERAGES,3000891,2017-08-16,20
1,BOOKS,3000892,2017-08-16,0
...,...,...,...,...
9,POULTRY,3029395,2017-08-31,1
9,PREPARED FOODS,3029396,2017-08-31,0
9,PRODUCE,3029397,2017-08-31,1
9,SCHOOL AND OFFICE SUPPLIES,3029398,2017-08-31,9


In [17]:
predictions = pd.DataFrame()

for index in tqdm(test_pred.index.unique()):
    df = test_pred.loc[index].reset_index().drop(columns=['date','onpromotion', 'family', 'store_nbr'])
    df = df.set_index('id')
    df['sales'] = all_preds[index[1]]
    predictions = pd.concat([predictions, df], axis=0)
    
predictions = predictions.sort_index()
predictions

100%|██████████| 1782/1782 [00:05<00:00, 310.89it/s]


Unnamed: 0_level_0,sales
id,Unnamed: 1_level_1
3000888,6.099217
3000889,0.155762
3000890,5.712249
3000891,3095.881590
3000892,0.017292
...,...
3029395,273.785656
3029396,81.532507
3029397,1660.206415
3029398,39.290185


# Submission

In [18]:
predictions.reset_index().to_csv('/kaggle/working/rf_per_fam_avg.csv', index=False)