In [None]:
import pandas as pd
import numpy as np

import pickle
import itertools
import gc
import math
import matplotlib.pyplot as plt
import dateutil.easter as easter
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso
from sklearn.compose import TransformedTargetRegressor

In [None]:
train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

In [None]:
train_df

In [None]:
train_df.dtypes

In [None]:
train_df.country.value_counts()

In [None]:
train_df.store.value_counts()

In [None]:
train_df['product'].value_counts()

In [None]:
train_df.isnull().values.any()

In [None]:
train_df.isnull().sum()

In [None]:
test_df

In [None]:
test_df.isnull().sum()

In [None]:
# Convert the raw `date` strings to timestamps and make them the index of
# both frames. drop=False keeps `date` available as a regular column too,
# because later cells use both `df.date` and the DatetimeIndex.
for df in (train_df, test_df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', drop=False, inplace=True)

In [None]:
train_df

In [None]:
train_df.dtypes

In [None]:
test_df

In [None]:
test_df.dtypes

In [None]:
print(train_df.groupby(['country', 'store', 'product']).date.count())
print('FIRST DAY:', train_df.date.min(), 'LAST DAY:', train_df.date.max())
print('NUMBER OF DAYS IN FOUR YEARS:', 365 * 4 + 1)
print(18 * 1461, train_df.shape, train_df.date.isna().sum())

In [None]:
train_df.groupby(['country', 'store', 'product']).num_sold.agg(['min', 'max', 'mean'])

In [None]:
test_df.date.min(), test_df.date.max()

In [None]:
kk = train_df.groupby(['country', 'store', 'product']).num_sold.mean()
kk

In [None]:
kk = kk.unstack(level='store')
kk

In [None]:
kk['KaggleRama:KaggleMart'] = kk.KaggleRama / kk.KaggleMart
kk

In [None]:
kk = train_df.groupby(['country', 'store', 'product', train_df.date.dt.year]).num_sold.mean()
kk

In [None]:
kk = kk.unstack(level='product')
kk

In [None]:
kk['Mugs:Stickers'] = kk['Kaggle Mug'] / kk['Kaggle Sticker']
kk['Hats:Stickers'] = kk['Kaggle Hat'] / kk['Kaggle Sticker']
kk

In [None]:
kk = train_df.groupby(['product', train_df.date.dt.month]).num_sold.mean().unstack(level='product')
kk['Mugs:Stickers'] = kk['Kaggle Mug'] / kk['Kaggle Sticker']
kk['Hat:Stickers'] = kk['Kaggle Hat'] / kk['Kaggle Sticker']
kk

In [None]:
# en = train_df.groupby(['country', 'store', 'product'])
# list(en)

In [None]:
# enum = enumerate(train_df.groupby(['country', 'store', 'product']))
# list(enum)

In [None]:
# Histogram of daily num_sold, one panel per (country, store, product) group
# laid out on a 6x3 grid.
plt.figure(figsize=(18, 12))
grouped = train_df.groupby(['country', 'store', 'product'])
for panel, (combo, group_df) in enumerate(grouped, start=1):
    axis = plt.subplot(6, 3, panel, ymargin=0.5)
    axis.hist(group_df.num_sold, bins=50, color='pink')
    # A log x-axis can be switched on here with axis.set_xscale('log').
    axis.set_title(combo)
plt.suptitle('HISTOGRAMS_NUM_SOLD', y=1.03)
plt.tight_layout(h_pad=0.3)
plt.show()

In [None]:
# Daily sales as a time series, one panel per (country, store, product) group.
# The leftover debug print of every group's raw values was removed: it wrote
# 18 large arrays into the cell output ahead of the figure.
plt.figure(figsize=(18, 12))
for i, (combi, df) in enumerate(train_df.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=0.5)
    # df is indexed by date, so plotting the column puts time on the x-axis.
    ax.plot(df.num_sold)
    ax.set_title(combi)
plt.tight_layout(h_pad=3.0)
plt.suptitle('DAILY SALES FOR 2015-2018', y=1.03)
plt.show()

In [None]:
# Mean sales per day of December (averaged over the years), one panel per
# (country, store, product) group. The first 25 bars are blue, the last 6
# orange.
plt.figure(figsize=(18, 12))
bar_colors = ['b'] * 25 + ['orange'] * 6
for panel, (combo, group_df) in enumerate(train_df.groupby(['country', 'store', 'product']), start=1):
    axis = plt.subplot(6, 3, panel, ymargin=0.5)
    in_december = group_df.date.dt.month == 12
    day_of_month = group_df.date.dt.day
    december_mean = group_df.num_sold[in_december].groupby(day_of_month).mean()
    axis.bar(range(1, 32), december_mean, color=bar_colors)
    axis.set_title(combo)
    axis.set_xticks(ticks=range(5, 31, 5))
plt.tight_layout(h_pad=3.0)
plt.suptitle('DAILY SALES FOR DECEMBER', y=1.03)
plt.show()

In [None]:
# Total sales per calendar month over the whole training period, one panel
# per (country, store, product) group. The y-axis is clipped to the observed
# range so the month-to-month variation is visible.
plt.figure(figsize=(18, 12))
january_ticks = range(0, 48, 12)
january_labels = [f'JAN {y}' for y in range(2015, 2019)]
for panel, (combo, group_df) in enumerate(train_df.groupby(['country', 'store', 'product']), start=1):
    axis = plt.subplot(6, 3, panel, ymargin=0.5)
    monthly_total = group_df.resample('MS').num_sold.sum()
    axis.bar(range(len(monthly_total)), monthly_total)
    axis.set_title(combo)
    axis.set_ylim(monthly_total.min(), monthly_total.max())
    axis.set_xticks(january_ticks, january_labels)
plt.suptitle('MONTHLY SALES FOR 2015-2018', y=1.03)
plt.tight_layout(h_pad=0.3)
plt.show()

In [None]:
# Average monthly sales by calendar month (January..December), one panel per
# (country, store, product) group.
plt.figure(figsize=(18, 12))
for i, (combi, df) in enumerate(train_df.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=0.5)
    # Aggregate only num_sold: summing/averaging the whole frame also touches
    # the string and datetime columns, which pandas 2.x no longer drops
    # silently (it raises instead).
    monthly_totals = df.resample('MS').num_sold.sum()
    resampled = monthly_totals.groupby(monthly_totals.index.month).mean()
    ax.bar(range(1, 13), resampled)
    ax.set_xticks(ticks=range(1, 13), labels='JFMAMJJASOND')
    ax.set_title(combi)
    ax.set_ylim(resampled.min(), resampled.max())
plt.suptitle('MONTHLY SALES FOR 2015-2018', y=1.03)
plt.tight_layout(h_pad=3.0)
plt.show()

In [None]:
# Total sales per year, one panel per (country, store, product) group.
plt.figure(figsize=(18, 12))
for i, (combi, df) in enumerate(train_df.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=0.5)
    # Sum only num_sold (a whole-frame .sum() would also hit the string and
    # datetime columns). 'YS' is the year-start alias; 'AS' is deprecated in
    # recent pandas releases.
    resampled = df.resample('YS').num_sold.sum()
    ax.bar(range(2015, 2019), resampled, color='red')
    ax.set_title(combi)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_ylim(0, resampled.max())
plt.suptitle('ANNUAL GROWTH FOR 2015-2018', y=1.03)
plt.tight_layout(h_pad=3.0)
plt.show()

In [None]:
# For every (country, store, product) group: compare yearly sales outside the
# Dec 25-31 window with yearly sales inside it, both normalised to their
# first year, and fit an exponential trend (linear regression on the log of
# the target) to each series.
plt.figure(figsize=(12, 90))
for i, (combi, df) in enumerate(train_df.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(18, 1, i+1, ymargin=0.5)
    # Everything except Dec 25-31, summed per year and divided by the first year.
    resampled = df[(df.date.dt.month<12) | (df.date.dt.day<25)].resample('AS').num_sold.sum()
    resampled /= resampled.iloc[0]
    # Only Dec 25-31, summed per year and divided by the first year.
    resampled_end_of_year = df[(df.date.dt.month==12) & (df.date.dt.day>=25)].resample('AS').num_sold.sum()
    resampled_end_of_year /= resampled_end_of_year.iloc[0]
    ax.bar(range(2015, 2019), resampled, color='green')
    ax.bar(range(2015, 2019), resampled_end_of_year, color='yellow', width=0.4)
    X = np.arange(2015, 2019).reshape(-1, 1)
    # The regressor is fit on log(y) and predictions are mapped back with exp.
    lr = TransformedTargetRegressor(LinearRegression(), func=np.log, inverse_func=np.exp)
    lr.fit(X, resampled)
    # The legend shows the fitted 2016 value minus 1; since the series is
    # normalised to 2015 this reads as an approximate yearly growth rate.
    ax.plot(range(2015, 2019), lr.predict(X), color='blue', label=f'ENTIRE YEAR: {lr.predict([[2016]]).squeeze() - 1:.1%}')
    # The same regressor object is refit on the end-of-year series.
    lr.fit(X, resampled_end_of_year)
    ax.plot(range(2015, 2019), lr.predict(X), color='cyan', label=f'END OF YEAR: {lr.predict([[2016]]).squeeze() - 1:.1%}')
    ax.legend()
    ax.set_title(f'ANNUAL SALES FOR {combi}')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.tight_layout(h_pad=3.0)
plt.show()

In [None]:
# Mean sales per day of week (Monday..Sunday), one panel per
# (country, store, product) group; Friday is red, the weekend orange.
plt.figure(figsize=(18, 12))
for i, (combi, df) in enumerate(train_df.groupby(['country', 'store', 'product'])):
    ax = plt.subplot(6, 3, i+1, ymargin=0.5)
    # Average only num_sold: a whole-frame .mean() also touches the string
    # columns, which pandas 2.x rejects instead of silently dropping.
    resampled = df.groupby(df.index.dayofweek).num_sold.mean()
    ax.bar(range(7), resampled, color=['pink']*4+['red']+['orange']*2)
    ax.set_title(combi)
    ax.set_xticks(ticks=range(7), labels=['M', 'T', 'W', 'T', 'F', 'S', 'S'])
    ax.set_ylim(0, resampled.max())
plt.suptitle('SALES PER DAY OF WEEK', y=1.03)
plt.tight_layout(h_pad=3.0)
plt.show()

In [None]:
# Mean daily sales in April for each year, with Easter highlighted:
# red = Easter Sunday, fuchsia = the five days after it,
# blue = other weekdays, green = other weekend days.
plt.figure(figsize=(18, 12))
for i, (year, df) in enumerate(train_df.groupby(train_df.date.dt.year)):
    df = df.reset_index(drop=True)
    ax = plt.subplot(4, 1, i+1, ymargin=0.5)
    april = df.num_sold[df.date.dt.month == 4].groupby(df.date.dt.day).mean()
    date_range = pd.date_range(start=f'{year}-04-01', end=f'{year}-04-30', freq='D')
    easter_date = easter.easter(year)
    color = []
    for d in date_range:
        # Compare date with date: a pd.Timestamp never compares equal to a
        # datetime.date on recent pandas, so `d == easter_date` would leave
        # Easter Sunday without its red highlight.
        days_after_easter = (d.date() - easter_date).days
        if days_after_easter == 0:
            color.append('red')
        elif days_after_easter in range(6):
            color.append('fuchsia')
        elif d.dayofweek < 5:
            color.append('blue')
        else:
            color.append('green')
    ax.bar(range(1, 31), april, color=color)
    ax.set_title(str(year))
    ax.set_xticks(ticks=range(5, 31, 5))
plt.tight_layout(h_pad=3.0)
plt.suptitle('DAILY SALES FOR APRIL', y=1.03)
plt.show()

In [None]:
original_train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
original_test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')

gdp_df.set_index('year', inplace=True)

for df in [original_train_df, original_test_df]:
    df['date'] = pd.to_datetime(df.date)
    
original_train_df

In [None]:
def smape_loss(y_true, y_pred):
    """Element-wise symmetric mean absolute percentage error, in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` per element.
    Where both values are zero the loss is defined as 0 instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with broadcast-compatible shapes.

    Returns
    -------
    numpy.ndarray
        Loss per element (range 0..200); average it to get the SMAPE score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Both magnitudes belong in the denominator; without abs() on y_true a
    # negative actual value shrinks (or zeroes) the denominator.
    denominator = np.abs(y_true) + np.abs(y_pred)
    both_zero = denominator == 0
    # Substitute 1 where the denominator is 0 to avoid a 0/0 warning; those
    # positions are overwritten with 0 below.
    safe_denominator = np.where(both_zero, 1, denominator)
    loss = np.abs(y_true - y_pred) / safe_denominator * 200
    return np.where(both_zero, 0.0, loss)

In [None]:
def engineer(df):
    """Build the base feature frame for the linear model.

    Reads the columns `date`, `country`, `store` and `product` of `df` and
    the module-level `gdp_df` (indexed by year, one 'GDP_<country>' column
    per country). Returns a new frame on the same index with:

    * gdp        - log of the GDP of the row's country in the row's year
    * wd4, wd56  - Friday flag and weekend flag
    * Finland, Norway, KaggleRama, Kaggle Mug, Kaggle Hat - indicator columns
    * sin{k}, cos{k} for k = 1, 2 - Fourier terms of the day of year, plus
      their products with the mug and hat indicators
    """
    # One GDP lookup per row, keyed by (year, 'GDP_' + country).
    gdp_values = [
        gdp_df.loc[when.year, 'GDP_' + where]
        for when, where in zip(df.date, df.country)
    ]
    weekday = df.date.dt.weekday

    new_df = pd.DataFrame({
        'gdp': np.log(pd.Series(gdp_values, index=df.index)),
        'wd4': weekday == 4,
        'wd56': weekday >= 5,
    })

    # Indicator columns; the category left out of each group is the baseline.
    for country in ('Finland', 'Norway'):
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ('Kaggle Mug', 'Kaggle Hat'):
        new_df[product] = df['product'] == product

    # Seasonal terms, also interacted with the product indicators so mugs
    # and hats can each have their own seasonal curve.
    dayofyear = df.date.dt.dayofyear
    for k in (1, 2):
        angle = dayofyear / 365 * 2 * math.pi * k
        new_df[f'sin{k}'] = np.sin(angle)
        new_df[f'cos{k}'] = np.cos(angle)
        for short_name, product_column in (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat')):
            for wave in ('sin', 'cos'):
                new_df[f'{short_name}_{wave}{k}'] = new_df[f'{wave}{k}'] * new_df[product_column]

    return new_df
   
# Rebind train_df / test_df to the engineered feature frames. From here on
# these names no longer refer to the raw frames used in the EDA cells above.
train_df = engineer(original_train_df)
# Keep date and target next to the features; they are not model inputs.
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)

# The model inputs are exactly the columns of the engineered test frame
# (which has neither `date` nor `num_sold`).
features = test_df.columns

# Cast the feature columns (booleans included) to float32.
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
print(list(features))

In [None]:
def fit_model(X_tr, X_va=None, outliers=False):
    """Fit a Ridge regression on log(num_sold) and optionally validate it.

    Parameters
    ----------
    X_tr : DataFrame
        Training rows; must contain the columns listed in the module-level
        `features` plus `num_sold`.
    X_va : DataFrame, optional
        Validation rows with the same columns. When given, the function also
        relies on module-level state: predictions are written into `oof`,
        the fold SMAPE is appended to `score_list`, and `run` / `fold` are
        read for the progress line and to decide whether to draw the plot.
    outliers : bool
        Not referenced anywhere in the body.

    Returns
    -------
    (preproc, model) : the fitted StandardScaler and the fitted Ridge model.
    """
    start_time = datetime.now()
    
    X_tr_f = X_tr[features]
    # Standardise the features; the scaler is fit on the training rows only.
    preproc = StandardScaler()
    X_tr_f = preproc.fit_transform(X_tr_f)
    y_tr = X_tr.num_sold.values.reshape(-1, 1)
    #model = LinearRegression()
    #model = HuberRegressor(epsilon=1.20, max_iter=500)
    model = Ridge()
    # The target is modelled in log space; predictions are mapped back with exp.
    model.fit(X_tr_f, np.log(y_tr).ravel())
    if X_va is not None:
        X_va_f = X_va[features]
        X_va_f = preproc.transform(X_va_f)
        y_va = X_va.num_sold.values.reshape(-1, 1)
        
        y_va_pred = np.exp(model.predict(X_va_f)).reshape(-1, 1)
        # Store the out-of-fold predictions in the global `oof`, aligned on index.
        oof.update(pd.Series(y_va_pred.ravel(), index=X_va.index))
        
        smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
        # With the correction below commented out, both SMAPE values are identical.
        #y_va_pred *= LOSS_CORRECTION
        smape = np.mean(smape_loss(y_va, y_va_pred))
        print(f'FOLD {run}.{fold} | {str(datetime.now() - start_time)[-12:-7]}'
              f' | SMAPE: {smape:.5f}    (BEFORE CORRECTION: {smape_before_correction:.5f})')
        score_list.append(smape)
        
        # Predicted-vs-true scatter with a dashed diagonal, first fold only.
        if fold == 0:
            plt.figure(figsize=(10, 10))
            plt.scatter(y_va, y_va_pred, s=1, color='magenta')
            #plt.scatter(np.log(y_va), np.log(y_va_pred), s=1, color='g')
            plt.plot([plt.xlim()[0], plt.xlim()[1]], [plt.xlim()[0], plt.xlim()[1]], '--', color='yellow')
            plt.gca().set_aspect('equal')
            plt.xlabel('y_true')
            plt.ylabel('y_pred')
            plt.title('OOF PREDICTIONS')
            plt.show()
            
    return preproc, model

preproc, model = fit_model(train_df)

def plot_five_years_combination(engineer, country='Norway', store='KaggleMart', product='Kaggle Hat'):
    """Plot model predictions for 2015-2019 against the known sales of one combination.

    `engineer` is the feature-building function to apply (it shadows the
    module-level function of the same name on purpose). Uses the module-level
    `model`, `preproc`, `features`, `train_df` and `original_train_df`.
    """
    # Synthetic daily frame for the requested combination, indexed by date.
    all_days = pd.date_range('2015-01-01', '2019-12-31', freq='D')
    raw_demo = pd.DataFrame({
        'row_id': 0,
        'date': all_days,
        'country': country,
        'store': store,
        'product': product
    })
    raw_demo.set_index('date', inplace=True, drop=False)

    demo_df = engineer(raw_demo)
    scaled_features = preproc.transform(demo_df[features])
    demo_df['num_sold'] = np.exp(model.predict(scaled_features))

    # Training rows of the same combination; the mask comes from the raw frame
    # because the engineered train_df no longer has the categorical columns.
    is_combination = ((original_train_df.country == country)
                      & (original_train_df.store == store)
                      & (original_train_df['product'] == product))
    train_subset = train_df[is_combination]

    plt.figure(figsize=(20, 6))
    plt.plot(np.arange(len(demo_df)), demo_df.num_sold, label='prediction')
    plt.scatter(np.arange(len(train_subset)), train_subset.num_sold, label='true', alpha=0.5, color='red', s=3)
    plt.legend()
    plt.title('PREDICTIONS AND TRUE NUM_SOLD FOR FIVE YEARS')
    plt.show()
    
plot_five_years_combination(engineer)

In [None]:
# In-sample predictions of the model fitted above, stored next to the target.
train_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
# Per-date residual over all combinations summed together, expressed like a
# signed SMAPE: (pred - true) / (pred + true) * 200.
by_date = train_df.groupby(train_df['date'])
residuals = (by_date.pred.sum() - by_date.num_sold.sum()) / (by_date.pred.sum() + by_date.num_sold.sum()) * 200

def plot_all_residuals(residuals):
    """Scatter the date-indexed residual series, with vertical guide lines.

    Guide lines are drawn at every 'M' and every 'Y' frequency date between
    2015-01-01 and 2019-01-01, spanning the y-range current at draw time.
    """
    plt.figure(figsize=(20, 6))
    plt.scatter(residuals.index, residuals, s=1, color='m')
    for guide_freq in ('M', 'Y'):
        # Re-read the limits for each set of lines, as the axes may have
        # rescaled after the previous draw.
        y_low = plt.ylim()[0]
        y_high = plt.ylim()[1]
        guide_dates = pd.date_range('2015-01-01', '2019-01-01', freq=guide_freq)
        plt.vlines(guide_dates, y_low, y_high, alpha=0.5)
    plt.title('RESIDUALS FOR FOUR YEARS')
    plt.show()
    
plot_all_residuals(residuals)

def plot_around(residuals, m, d, w):
    """Overlay, per year, the residuals within `w` days of month `m`, day `d`.

    Parameters
    ----------
    residuals : Series indexed by date.
    m, d : month and day of the anchor date.
    w : half-width of the window in days (both bounds exclusive).

    The x-axis is the offset in days from the anchor date. Years for which
    `residuals` has no data in the window are skipped, so they add neither
    an empty line nor a legend entry.
    """
    plt.figure()
    plt.title(f'RESIDUALS AROUND M={m} D={d}')
    
    for y in np.arange(2015, 2020):
        d0 = pd.Timestamp(date(int(y), m, d))
        residual_range = residuals[(residuals.index > d0 - timedelta(w)) &
                                   (residuals.index < d0 + timedelta(w))]
        if residual_range.empty:
            # e.g. 2019 when only the training years are available
            continue
        plt.plot([(r - d0).days for r in residual_range.index],
                 residual_range,
                 label=str(y))
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.legend()
    plt.show()
    
plot_around(residuals, 1, 1, 20)
plot_around(residuals, 5, 1, 50)
plot_around(residuals, 5, 21, 10)
plot_around(residuals, 5, 31, 15)
plot_around(residuals, 6, 10, 10)
plot_around(residuals, 6, 30, 10)
plot_around(residuals, 11, 5, 10)

In [None]:
def engineer_more(df):
    """Extend `engineer(df)` with one-hot indicators for individual days.

    Each added column is a boolean flag (cast to float32 on return) that is
    set on one specific calendar day, or on one specific offset from a
    per-year reference date, optionally restricted to one country. The
    choice of days presumably follows the residual plots above (holiday
    effects) - confirm against those plots.

    Reads `df.date` and `df.country`; returns a new float32 frame on the
    same index as `engineer(df)`.
    """
    
    new_df = engineer(df)
    
    # End of December and beginning of January.
    #   dec24..dec31     : all countries
    #   n-dec24..n-dec31 : Norway only
    #   f-jan1..f-jan13  : Finland only
    #   jan1..jan9       : Norway only (despite the un-prefixed name)
    #   s-jan1..s-jan14  : Sweden only
    new_df = pd.concat([new_df,
                        pd.DataFrame({
                            f'dec{d}': (df.date.dt.month == 12) & (df.date.dt.day == d) for d in range(24, 32)
                        }),
                        pd.DataFrame({
                            f'n-dec{d}': (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway') for d in range(24, 32)
                        }),
                        pd.DataFrame({
                            f'f-jan{d}': (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland') for d  in range(1, 14)
                        }),
                        pd.DataFrame({
                            f'jan{d}': (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway') for d in range(1, 10)
                        }),
                        pd.DataFrame({
                            f's-jan{d}': (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden') for d in range(1, 15)
                        })],
                       axis=1)
    
    # May: may1..may9 for all countries, may19..may25 for Norway only.
    # Both groups share the 'may' prefix; the day ranges do not overlap, so
    # the column names stay unique.
    new_df = pd.concat([new_df,
                        pd.DataFrame({
                            f'may{d}': (df.date.dt.month == 5) & (df.date.dt.day == d) for d in list(range(1, 10))
                        }),
                        pd.DataFrame({
                            f'may{d}': (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway') for d in list(range(19, 26))
                        })],
                        axis=1)
    
    # June: june8..june13 for Sweden only. The Norway-specific June/July
    # indicators below are disabled.
    new_df = pd.concat([new_df,
                       pd.DataFrame({
                           f'june{d}': (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden') for d in list(range(8, 14))
                       }),
                       #pd.DataFrame({
                           #f'june{d}': (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Norway') for d in list(range(22, 31))
                       #}),
                       #pd.DataFrame({
                           #f'july{d}': (df.date.dt.month == 7) & (df.date.dt.day == d) & (df.country == 'Norway') for d in list(range(1, 3))
                       #})
                       ],
                       axis=1)
    
    # Per-year reference date in late June (the listed dates look like the
    # last Wednesday of June - TODO confirm). Years not in the mapping yield
    # NaT, so none of the indicators fire for them.
    wed_june_date = df.date.dt.year.map({
        2015: pd.Timestamp(('2015-06-24')),
        2016: pd.Timestamp(('2016-06-29')),
        2017: pd.Timestamp(('2017-06-28')),
        2018: pd.Timestamp(('2018-06-27')),
        2019: pd.Timestamp(('2019-06-26'))
    })
    
    # Offsets -4..+5 days from that date, for every country except Norway.
    new_df = pd.concat([new_df,
                        pd.DataFrame({
                            f'wed_june{d}': (df.date - wed_june_date == np.timedelta64(d, 'D')) & (df.country != 'Norway') for d in list(range(-4, 6))
                        })],
                        axis=1)

    # Per-year reference date in early November (the listed dates look like
    # the first Sunday of November - TODO confirm).
    sun_nov_date = df.date.dt.year.map({
        2015: pd.Timestamp(('2015-11-1')),
        2016: pd.Timestamp(('2016-11-6')),
        2017: pd.Timestamp(('2017-11-5')),
        2018: pd.Timestamp(('2018-11-4')),
        2019: pd.Timestamp(('2019-11-3'))
    })
    
    # Offsets 0..+8 days from that date, for every country except Norway.
    new_df = pd.concat([new_df,
                       pd.DataFrame({
                           f'sun_nov{d}': (df.date - sun_nov_date == np.timedelta64(d, 'D')) & (df.country != 'Norway') for d in list(range(0, 9))
                       })],
                       axis = 1)
    
    # dec6..dec13 for Finland only; same 'dec' prefix as the dec24..dec31
    # columns above, with non-overlapping day numbers.
    new_df = pd.concat([new_df,
                       pd.DataFrame({
                           f'dec{d}': (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland') for d in list(range(6, 14))
                       })],
                       axis = 1)
    # Easter Sunday of each row's year; indicators for the offsets -2..10,
    # 40..47 and 50..58 days from it (the later ranges presumably cover the
    # movable holidays that follow Easter - TODO confirm).
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df,
                        pd.DataFrame({
                            f'easter{d}': (df.date - easter_date == np.timedelta64(d, 'D')) for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))
                        })],
                      axis = 1)
    
    return new_df.astype(np.float32)

train_df = engineer_more(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer_more(original_test_df)

features = list(test_df.columns)
print(list(features))

In [None]:
preproc, model = fit_model(train_df)
train_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
with open('train_pred.pickle', 'wb') as handle:
    pickle.dump(train_df.pred, handle)
by_date = train_df.groupby(train_df['date'])
residuals = (by_date.pred.sum() - by_date.num_sold.sum()) / (by_date.pred.sum() + by_date.num_sold.sum()) * 200

plot_five_years_combination(engineer_more)

plot_all_residuals(residuals)

plot_around(residuals, 1, 1, 20)
plot_around(residuals, 5, 1, 50)
plot_around(residuals, 5, 21, 10)
plot_around(residuals, 5, 31, 15)
plot_around(residuals, 6, 10, 10)
plot_around(residuals, 6, 30, 10)
plot_around(residuals, 11, 5, 10)

In [None]:
residuals = np.log(train_df.pred) - np.log(train_df.num_sold)
plt.figure(figsize=(18, 4))
plt.scatter(np.arange(len(residuals)), residuals, s=1)
plt.title('ALL RESIDUALS BY ROW INDEX')
plt.ylabel('residual')
plt.show()
plt.figure(figsize=(18, 4))
plt.hist(residuals, bins=200)
plt.title('HISTOGRAM OF ALL RESIDUALS')
plt.show()
print(f'STANDARD DEVIATION OF LOG RESIDUALS: {residuals.std():.3f}')

In [None]:
# Leap-year-aligned day of year: in non-leap years every day from March on is
# shifted by one, so the same calendar date gets the same `dayfix` each year
# (Feb 29 keeps number 60, which only occurs in 2016).
train_df['dayfix'] = train_df.date.dt.dayofyear
train_df.loc[(train_df.date.dt.year != 2016) & (train_df.date.dt.month >= 3), 'dayfix'] += 1

from scipy.stats import norm
print('LOOK FOR RESIDUALS BEYOND', norm.ppf([0.5/365, 364.5/365]))

# NOTE: `residuals` must be the per-row log residuals computed in the cell
# just above, not the per-date series from the earlier residual cells.
rr = residuals.groupby(train_df.dayfix).mean()
rrstd = rr.std()
# The f prefix was missing here, so the placeholder was printed literally.
print(f'STANDARD DEVIATION GROUPING BY DAYOFYEAR: {rrstd:.5f}')

# One row per aligned day of year; 2016 supplies a calendar with all 366 days.
rrdf = pd.DataFrame({
    'residual': rr,
    'z_score': rr / rrstd,
    'date': pd.date_range('2016-01-01', '2016-12-31')
})
rrdf[rrdf.z_score.abs() > 3]

In [None]:
# Same outlier scan as the previous cell, but split by country.
rr = residuals.groupby([original_train_df.country, train_df.dayfix]).mean()
rrstd = rr.std()
# The f prefix was missing here, so the placeholder was printed literally.
print(f'STANDARD DEVIATION GROUPING BY COUNTRY AND DAYOFYEAR: {rrstd:.5f}')
rrdf = pd.DataFrame({
    'residual': rr,
    'z_score': rr / rrstd,
    # Map the aligned day number back to a date in 2016 (day 1 -> 2016-01-01).
    'date': np.datetime64('2015-12-31') + pd.to_timedelta(rr.index.get_level_values(1), 'D')
})
rrdf[rrdf.z_score.abs() > 3]

In [None]:
%%time
# Cross-validation settings. OUTLIERS and TRAIN_VAL_CUT are not read by any
# code visible in this notebook; LOSS_CORRECTION is applied to the test
# predictions in the submission cell.
RUNS = 1
OUTLIERS = True
TRAIN_VAL_CUT = datetime(2018, 1, 1) 
LOSS_CORRECTION = 1

np.random.seed(202100)

total_start_time = datetime.now()
# Globals that fit_model() writes to when it is given a validation frame.
oof = pd.Series(0.0, index=train_df.index)
score_list = []
for run in range(RUNS):
    # Four folds grouped by calendar year: rows of one year never appear in
    # both the training and the validation part of a fold.
    kf = GroupKFold(n_splits=4)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, groups=train_df.date.dt.year)):
        X_tr = train_df.iloc[train_idx]
        X_va = train_df.iloc[val_idx]
        print(f'FOLD {run}.{fold}')
        # fit_model reads the loop variables `run` and `fold` as globals.
        preproc, model = fit_model(X_tr, X_va)
        
print(f'AVERAGE SMAPE: {sum(score_list) / len(score_list):.5f}')
# Persist the out-of-fold predictions.
with open('oof.pickle', 'wb') as handle:
    pickle.dump(oof, handle)

In [None]:
# Refit on all training rows (no validation frame), replacing the
# `preproc` / `model` left over from the last cross-validation fold.
train_idx = np.arange(len(train_df))
X_tr = train_df.iloc[train_idx]
preproc, model = fit_model(X_tr, None)

plot_five_years_combination(engineer_more)

# Test predictions are kept in a list and averaged below; with a single
# model the average equals that model's prediction.
test_pred_list = []
test_pred_list.append(np.exp(model.predict(preproc.transform(test_df[features]))) * LOSS_CORRECTION)

sub = original_test_df[['row_id']].copy()
sub['num_sold'] = sum(test_pred_list) / len(test_pred_list)
sub.to_csv('submission_linear_model_01.csv', index=False)

# Sanity check: compare the distribution of the test predictions with the
# distribution of the training target.
plt.figure(figsize=(16, 3))
plt.hist(train_df['num_sold'], bins=np.linspace(0, 3000, 201), density=True, label='TRAINING')
plt.hist(sub['num_sold'], bins=np.linspace(0, 3000, 201), density=True, rwidth=0.5, label='TEST PREDICTIONS')

plt.xlabel('NUM_SOLD')
plt.ylabel('FREQUENCY')
plt.legend()
plt.show()

In [None]:
sub_rounded = sub.copy()
sub_rounded['num_sold'] = sub_rounded['num_sold'].round()
sub_rounded.to_csv('submission_linear_model_rounded_01.csv', index=False)
sub_rounded

In [None]:
w = pd.Series(model.coef_, features)
ws = w / preproc.scale_

def plot_feature_weights_numbered(prefix):
    """Bar-plot the weights of all features named `<prefix><integer>`.

    Selects every entry of the module-level `features` that starts with
    `prefix`, uses the remainder of the name (parsed as an int) as the x
    position and the matching entry of the module-level `ws` as the height.
    """
    matching = list(filter(lambda name: name.startswith(prefix), features))
    plt.figure(figsize=(12, 2))
    day_numbers = [int(name[len(prefix):]) for name in matching]
    plt.bar(day_numbers, ws[matching])
    plt.title(f'FEATURE WEIGHTS FOR {prefix}')
    plt.ylabel('WEIGHT')
    plt.xlabel('DAY')
    plt.show()
    
plot_feature_weights_numbered('easter')
plot_feature_weights_numbered('dec')
plot_feature_weights_numbered('jan')

In [None]:
# The 30 features with the largest absolute (unscaled) weight, largest at
# the top; blue bars are non-negative weights, yellow bars negative ones.
ws_sorted = ws.iloc[np.argsort(-np.abs(ws))]
ws_plot = ws_sorted.head(30)
bar_positions = np.arange(len(ws_plot))
bar_colors = ws_plot.apply(lambda weight: 'b' if weight >= 0 else 'y')

plt.figure(figsize=(9, len(ws_plot) / 3))
plt.barh(bar_positions, ws_plot, color=bar_colors)

plt.yticks(bar_positions, ws_plot.index)
plt.gca().invert_yaxis()
plt.title('MOST IMPORTANT FEATURES')
plt.show()

In [None]:
gdp_exponent = ws['gdp']
gdp_exponent