In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Collect, then print, the full path of every file found under the Kaggle input directory.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)

# Run mode flag: False -> hold out a validation split from the training data;
# True -> train on all training rows (submission run).
SUB = False

In [None]:
# Load the competition files; the `date` column of train/test is parsed to datetime.
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=['date'],
)
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
    parse_dates=['date'],
)
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# (rows, columns) of the training and test frames.
train_df.shape, test_df.shape

In [None]:
def train_test_split(df, n):
    """Split ``df`` by row position into a leading part and a trailing hold-out.

    No shuffling is done: the hold-out is always the *last* rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    n : int or float
        Size of the hold-out. An ``int`` is a number of rows; a ``float`` is a
        fraction of ``len(df)`` (truncated to an integer number of rows).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(head, tail)`` where ``tail`` holds the last ``n`` rows and ``head``
        everything before them.

    Raises
    ------
    ValueError
        If the resolved ``n`` is negative or larger than ``len(df)``. Without
        this check a too-large ``n`` turns into a negative slice bound and
        silently yields a hold-out of the wrong size.
    """
    n_rows = df.shape[0]
    if isinstance(n, float):
        n = int(n_rows * n)

    if not 0 <= n <= n_rows:
        raise ValueError(
            f'hold-out size must be between 0 and {n_rows} rows, got {n}'
        )

    cut = n_rows - n
    return df.iloc[:cut], df.iloc[cut:]


def split_X_y(df, target='num_sold'):
    """Separate a frame into its feature columns and its target column.

    Returns ``(X, y)``: ``X`` is ``df`` without the ``target`` column and
    ``y`` is the ``target`` column itself.
    """
    features = df.drop(columns=target)
    labels = df[target]
    return features, labels


def add_datepart(X, feat):
    """Replace a date column with numeric calendar features.

    Adds the columns ``year``, ``month``, ``day``, ``dayofweek``, ``quarter``
    and ``dayofyear`` derived from ``X[feat]`` and drops ``feat`` itself.

    The input frame is left untouched: the work is done on a copy, so the
    function has no side effects on the caller's data.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the date column.
    feat : str
        Name of the date column; anything ``pd.to_datetime`` accepts.

    Returns
    -------
    pandas.DataFrame
        New frame with the calendar columns appended and ``feat`` removed.
    """
    out = X.copy()
    dates = pd.to_datetime(out[feat])

    for attr in ('year', 'month', 'day', 'dayofweek', 'quarter', 'dayofyear'):
        out[attr] = getattr(dates.dt, attr)

    return out.drop(columns=feat)

In [None]:
# Replace the `date` column with year/month/day/dayofweek/quarter/dayofyear columns.
train_df = add_datepart(train_df, 'date')

### Features I think can improve predictions:
1. Sales on the same day last year
2. Total sales last year
3. Min sales last year
4. Max sales last year
5. Average sales last year
6. Average quarter sales last year
7. Average monthly sales last year

### I think experimenting with these features can improve model performance, in addition to using the features of the different products together.

In [None]:
# Per-series yearly statistics of `num_sold`, as dicts keyed by
# (country, store, product, year).
year_sales = train_df.groupby(['country', 'store', 'product', 'year'])['num_sold']
_year_stats = year_sales.agg(['mean', 'max', 'min'])
mean_year_sales = _year_stats['mean'].to_dict()
max_year_sales = _year_stats['max'].to_dict()
min_year_sales = _year_stats['min'].to_dict()

# Per-series monthly statistics of `num_sold`, as dicts keyed by
# (country, store, product, year, month).
month_sales = train_df.groupby(['country', 'store', 'product', 'year', 'month'])['num_sold']
_month_stats = month_sales.agg(['sum', 'mean', 'max', 'min'])
total_month_sales = _month_stats['sum'].to_dict()
mean_month_sales = _month_stats['mean'].to_dict()
max_month_sales = _month_stats['max'].to_dict()
min_month_sales = _month_stats['min'].to_dict()

# Per-series daily grouping (no aggregate is taken from it in this cell).
day_sales = train_df.groupby(['country', 'store', 'product', 'year', 'month', 'day'])['num_sold']


# Attach last year's monthly sales statistics to every training row.
#
# For each (country, store, product) series and each calendar month m = 1..12
# of the previous year, four columns are added:
#   total_last_year_{m}_sales, mean_last_year_{m}_sales,
#   max_last_year_{m}_sales,   min_last_year_{m}_sales
# Rows whose previous year has no statistics keep NaN in these columns.
#
# This uses one aggregate and one merge instead of per-cell
# `train_df.loc[i, ...] = ...` writes. Those writes were slow (rows x 48 scalar
# assignments) and used the positional counter `i` as an index *label*, which
# is only correct while `train_df` carries a default RangeIndex.
_series_keys = ['country', 'store', 'product', 'year']
_stat_prefix = {'sum': 'total', 'mean': 'mean', 'max': 'max', 'min': 'min'}

# One row per (country, store, product, year); columns are (statistic, month).
_last_year = month_sales.agg(list(_stat_prefix)).unstack('month')
_last_year.columns = [
    f'{_stat_prefix[stat]}_last_year_{m}_sales' for stat, m in _last_year.columns
]

# Month-major column order (total, mean, max, min of month 1, then month 2, ...);
# a month with no data at all becomes an all-NaN column.
_feature_cols = [
    f'{prefix}_last_year_{m}_sales'
    for m in range(1, 13)
    for prefix in _stat_prefix.values()
]
_last_year = _last_year.reindex(columns=_feature_cols).astype('float64').reset_index()

# Only statistics from 2015 onwards are used; the statistics of year Y are the
# "last year" features of rows dated Y + 1.
_last_year = (
    _last_year[_last_year['year'] >= 2015]
    .assign(year=lambda stats: stats['year'] + 1)
)

train_df = (
    train_df
    .drop(columns=_feature_cols, errors='ignore')  # keeps this cell re-runnable
    .merge(_last_year, on=_series_keys, how='left', validate='many_to_one')
)

In [None]:
# Disabled experiment: add a `<col>_na` indicator for every column containing NaN.
# for col in train_df.columns:
#     if train_df[col].isna().any():
#         train_df.loc[:, f'{col}_na'] = train_df[col].isna().astype(int)

# Rows that received no last-year features hold NaN in those columns;
# replace every NaN in the frame with 0.
train_df = train_df.fillna(0)

In [None]:
# Inspect the last rows of the training frame after feature engineering.
train_df.tail()

In [None]:
# Submission run: train on everything. Otherwise hold out the last
# `len(test_df)` training rows as a validation set.
if SUB:
    dev_df = train_df
    print(dev_df.shape)
else:
    dev_df, val_df = train_test_split(train_df, test_df.shape[0])
    print(dev_df.shape, val_df.shape)

In [None]:
# Sanity check: development and validation sets share no row_id.
if not SUB:
    assert set(dev_df.row_id).isdisjoint(val_df.row_id)

In [None]:
# Keep only rows dated after 2015 in the training set.
after_2015 = dev_df['year'] > 2015
dev_df = dev_df.loc[after_2015]

In [None]:
# Separate features from the `num_sold` target for the development set ...
X_train, y_train = split_X_y(dev_df)

# ... and for the validation set, which only exists outside submission runs.
if not SUB:
    X_val, y_val = split_X_y(val_df)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


def preprocess_dataset(X, encoders=None):
    """Label-encode the categorical columns ``country``, ``store`` and ``product``.

    The input frame is not modified; encoding is applied to a copy. This keeps
    the calling cell re-runnable (encoding an already-encoded frame fails with
    unseen labels).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the three categorical columns.
    encoders : dict, optional
        Mapping ``column name -> fitted encoder`` (objects exposing
        ``transform``). When omitted or empty, a fresh ``LabelEncoder`` is
        fitted on each column of ``X``.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The encoded copy of ``X`` and the encoders that were used, so they can
        be reused on validation / test data.
    """
    feats = ['country', 'store', 'product']
    if not encoders:
        encoders = {feat: LabelEncoder().fit(X[feat]) for feat in feats}

    X = X.copy()
    for feat in feats:
        X[feat] = encoders[feat].transform(X[feat])

    return X, encoders

In [None]:
# Fit the label encoders on the training features and encode them.
X_train, encoders = preprocess_dataset(X_train)

# Reuse the same fitted encoders for the validation features.
if not SUB:
    X_val, _ = preprocess_dataset(X_val, encoders)

In [None]:
def print_scores(model, X_train, y_train, X_val, y_val, scorer):
    """Print a list of train / validation scores for a fitted model.

    The printed list is, in order: ``model.score`` on the training data,
    ``scorer(y_train, predictions)``, ``model.score`` on the validation data,
    ``scorer(y_val, predictions)``, and finally ``model.oob_score_`` when the
    model has that attribute.
    """
    results = []
    for features, target in ((X_train, y_train), (X_val, y_val)):
        results.append(model.score(features, target))
        results.append(scorer(target, model.predict(features)))

    if hasattr(model, 'oob_score_'):
        results.append(model.oob_score_)

    print(results)
    

    
def SMAPE_score(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``.
    A term where the true and the predicted value are both 0 counts as 0
    instead of producing 0/0 = NaN, which would otherwise turn the whole score
    into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values of equal length, compared position by position.

    Returns
    -------
    float
        SMAPE in the range [0, 200].
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # scale == 0 only when both values are 0, i.e. a perfect prediction.
    terms = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    return 100 / len(y_true) * np.sum(terms)

In [None]:
# Recency weights: within each (country, store, product) series the last row
# gets age 1, the one before it age 2, and so on; the sample weight decays
# exponentially with that age.
_row_age = (
    dev_df.groupby(['country', 'store', 'product'])
    .cumcount(ascending=False)
    .add(1)
    .rename('row_id')
)
weights = np.exp(-0.007 * _row_age)

In [None]:
# Visual check of the sample weights (trailing `;` suppresses the Axes repr).
weights.plot();

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

# Baseline random forest on all features, trained with the recency sample weights.
# `random_state` is fixed so that the scores and the feature importances
# are reproducible from one run to the next.
model = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=30,
    max_features=0.9,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train, sample_weight=weights)

In [None]:
# Train / validation scores of the baseline model (validation runs only).
if not SUB:
    print_scores(model, X_train, y_train, X_val, y_val, SMAPE_score)

In [None]:
# Feature importances of the fitted model, most important first.
ft_imp = (
    pd.DataFrame({'features': X_train.columns, 'score': model.feature_importances_})
    .sort_values(by='score', ascending=False)
)

In [None]:
# Horizontal bar chart of the importance score per feature.
ft_imp.plot(x='features', y='score', kind='barh', figsize=(15, 10));

### Drop features below 0.01 importance

In [None]:
# model = RandomForestRegressor(n_estimators=500, min_samples_leaf=20, n_jobs=-1)

# ft_keep = ft_imp.query('score > 0.01').features

# model.fit(X_train[ft_keep], y_train)
# if not SUB: print_scores(model, X_train[ft_keep], y_train, X_val[ft_keep], y_val, SMAPE_score)

### The model has improved.

In [None]:
# model = RandomForestRegressor(n_estimators=500, min_samples_leaf=15, n_jobs=-1)

# ft_keep = ft_imp.query('score > 0.01').features

# model.fit(X_train[ft_keep], y_train)
# print_scores(model, X_train[ft_keep], y_train, X_val[ft_keep], y_val, SMAPE_score)

In [None]:
# Final model: a larger forest restricted to the features whose importance in
# the previous fit exceeds 0.01. `random_state` is fixed so the result is
# reproducible from one run to the next.
model = RandomForestRegressor(
    n_estimators=2000,
    min_samples_leaf=30,
    n_jobs=-1,
    random_state=42,
)

ft_keep = ft_imp.query('score > 0.01').features

model.fit(X_train[ft_keep], y_train, sample_weight=weights)
if not SUB:
    print_scores(model, X_train[ft_keep], y_train, X_val[ft_keep], y_val, SMAPE_score)

In [None]:
# ft_imp2 = pd.DataFrame({'features': X_train[ft_keep].columns, 
#               'score': model.feature_importances_}).sort_values('score', ascending=False)

In [None]:
# ft_imp2.plot(x='features', y='score', kind='barh', figsize=(15, 10));

## Submission

In [None]:
# Replace the `date` column of the test set with the same calendar columns as the training set.
test_df = add_datepart(test_df, 'date')

In [None]:
# Attach last year's monthly sales statistics to every test row. The statistics
# come from the training data (`month_sales`) and use the column names
#   total_last_year_{m}_sales, mean_last_year_{m}_sales,
#   max_last_year_{m}_sales,   min_last_year_{m}_sales      for m = 1..12.
#
# This uses one aggregate and one merge instead of per-cell
# `test_df.loc[i, ...] = ...` writes. Those writes were slow (rows x 48 scalar
# assignments), used the positional counter `i` as an index *label* (only
# correct with a default RangeIndex), and raised KeyError for any series/year
# absent from the training statistics. Such rows now simply keep NaN.
_series_keys = ['country', 'store', 'product', 'year']
_stat_prefix = {'sum': 'total', 'mean': 'mean', 'max': 'max', 'min': 'min'}

# One row per (country, store, product, year); columns are (statistic, month).
_last_year = month_sales.agg(list(_stat_prefix)).unstack('month')
_last_year.columns = [
    f'{_stat_prefix[stat]}_last_year_{m}_sales' for stat, m in _last_year.columns
]

# Month-major column order (total, mean, max, min of month 1, then month 2, ...);
# a month with no data at all becomes an all-NaN column.
_feature_cols = [
    f'{prefix}_last_year_{m}_sales'
    for m in range(1, 13)
    for prefix in _stat_prefix.values()
]
_last_year = _last_year.reindex(columns=_feature_cols).astype('float64').reset_index()

# Only statistics from 2015 onwards are used; the statistics of year Y are the
# "last year" features of rows dated Y + 1.
_last_year = (
    _last_year[_last_year['year'] >= 2015]
    .assign(year=lambda stats: stats['year'] + 1)
)

test_df = (
    test_df
    .drop(columns=_feature_cols, errors='ignore')  # keeps this cell re-runnable
    .merge(_last_year, on=_series_keys, how='left', validate='many_to_one')
)

In [None]:
# Disabled experiment: add a `<col>_na` indicator to the test set for every
# training column containing NaN.
# for col in test_df.columns:
#     if train_df[col].isna().any():
#         test_df.loc[:, f'{col}_na'] = test_df[col].isna().astype(int)

# Replace every NaN in the test frame with 0, as was done for the training frame.
test_df = test_df.fillna(0)

In [None]:
# Encode the categorical test columns with the encoders fitted on the training data.
X_test, _ = preprocess_dataset(test_df, encoders)
X_test.head()

In [None]:
# X_test.shape[1] == X_train.shape[1]

In [None]:
# Predict with the final model on the selected features and store the result
# in the submission frame.
# NOTE(review): assumes the rows of `submission` are in the same order as `test_df` — confirm.
submission['num_sold'] = model.predict(X_test[ft_keep])
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# import itertools

# countries = X_train.country.unique()
# stores = X_train.store.unique()
# products = X_train['product'].unique()

# combinations = list(itertools.product(countries, stores, products))

# preds = {}

# for comb in combinations:
#     y_train_avg = y_train[
#         (X_train['country'] == comb[0]) & 
#         (X_train['store'] == comb[1]) & 
#         (X_train['product'] == comb[2])
# #         (X_train['Month'] == 12)
#     ].values[-500:].mean()
# #     print(y_train_avg); break
    
#     preds[comb] = y_train_avg
    

In [None]:
# preds_val = []

# for row in X_val[['country', 'store', 'product']].values:
#     preds_val.append(preds[tuple(row)])
    
# preds_val = np.array(preds_val)