In [None]:
#Doc1
import pandas as pd
import numpy as np


from sklearn import model_selection
from sklearn import metrics, ensemble
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore') 
import statistics



#Doc 3
import datetime
from datetime import timedelta


Inspired by, and largely borrowed from, a much more thorough, educational notebook: https://www.kaggle.com/code/maxdiazbattan/wallmart-sales-top-5-eda-feature-engineering
The model and the simple blend in my notebook were lifted from there. The EDA is wonderful and helped guide my adjustments.

This notebook shows how a Christmas adjustment and a few additional holidays can take 75-100 points off your score.

In [None]:
# Load the five competition files; the unpacking order matches the path order below.
stores_data, test_data, train_data, features_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/walmart-recruiting-store-sales-forecasting/stores.csv',
        '../input/walmart-recruiting-store-sales-forecasting/test.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/train.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/features.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip',
    )
)

In [None]:
# Join per-store attributes onto the features table, then parse every
# 'Date' column to datetime so the frames can later be merged on it.
feature_store = pd.merge(features_data, stores_data, on="Store", how="inner")
for dated_frame in (feature_store, train_data, test_data):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

In [None]:
# Calendar features derived from 'Date'.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the documented replacement (same ISO week number).
# isocalendar() returns a nullable UInt32 column, so cast back to int64 to keep
# the dtype the legacy accessor produced.
# NOTE(review): the cast assumes 'Date' contains no NaT values - confirm.
iso_calendar = feature_store['Date'].dt.isocalendar()
feature_store['Week'] = iso_calendar['week'].astype('int64')
feature_store['Year'] = feature_store['Date'].dt.year  # calendar year, not ISO year (unchanged)
feature_store['Day'] = feature_store['Date'].dt.day

In [None]:
# Build the modelling frames: attach features to train/test rows, then order
# both by Store, Dept, Date with a fresh 0..n-1 index.
merge_keys = ['Store', 'Date', 'IsHoliday']
sort_keys = ['Store', 'Dept', 'Date']

train_df = (
    pd.merge(train_data, feature_store, how='inner', on=merge_keys)
    .sort_values(by=sort_keys)
    .reset_index(drop=True)
)
test_df = (
    pd.merge(test_data, feature_store, how='inner', on=merge_keys)
    .sort_values(by=sort_keys)
    .reset_index(drop=True)
)

In [None]:
# Easter marking: flag the chosen (year, week) pairs as holidays.
for easter_year, easter_week in ((2010, 13), (2011, 16), (2012, 14)):
    is_easter_week = (train_df['Year'] == easter_year) & (train_df['Week'] == easter_week)
    train_df.loc[is_easter_week, 'IsHoliday'] = True

# The test period only needs the 2013 entry.
test_df.loc[(test_df['Year'] == 2013) & (test_df['Week'] == 13), 'IsHoliday'] = True

In [None]:
# Cinco De Mayo / Mother's Day: week 18 is flagged as a holiday in every year.
for may_year in (2010, 2011, 2012):
    is_may_week = (train_df['Year'] == may_year) & (train_df['Week'] == 18)
    train_df.loc[is_may_week, 'IsHoliday'] = True

test_df.loc[(test_df['Year'] == 2013) & (test_df['Week'] == 18), 'IsHoliday'] = True

In [None]:
# July 4th: flag the chosen (year, week) pairs as holidays.
for july_year, july_week in ((2010, 26), (2011, 26), (2012, 27)):
    is_july_week = (train_df['Year'] == july_year) & (train_df['Week'] == july_week)
    train_df.loc[is_july_week, 'IsHoliday'] = True

test_df.loc[(test_df['Year'] == 2013) & (test_df['Week'] == 27), 'IsHoliday'] = True

In [None]:
def type_conversion_full(final_data):
    """Encode the ``Type`` column of ``final_data`` as integers, in place.

    Mapping: ``'A'`` -> 3, ``'B'`` -> 2, any other value -> 1.

    Values that are already 1, 2 or 3 are passed through unchanged, so
    re-running the cell on an already-encoded frame no longer collapses
    every row to 1.

    Parameters
    ----------
    final_data : pandas.DataFrame
        Frame with a ``Type`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``final_data`` object, for call chaining.
    """
    type_codes = {'A': 3, 'B': 2}
    already_encoded = (1, 2, 3)
    # Bracket assignment instead of attribute assignment: always targets the column.
    final_data['Type'] = final_data['Type'].map(
        lambda label: label if label in already_encoded else type_codes.get(label, 1)
    )
    return final_data

# Encode 'Type' on both modelling frames with the same helper.
train_df, test_df = map(type_conversion_full, (train_df, test_df))

In [None]:
# Model inputs: a fixed subset of columns as features, Weekly_Sales as target,
# then a seeded 90/10 split.
feature_columns = ['Store', 'Dept', 'IsHoliday', 'Size', 'Type', 'Week', 'Year', 'Day']
train_min = train_df.loc[:, feature_columns].copy()
y = train_df.loc[:, ['Weekly_Sales']].copy()
X_train, X_test, y_train, y_test = train_test_split(
    train_min,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Random forest model.
# random_state is pinned so a Restart & Run All reproduces the same predictions,
# consistent with the seeded ExtraTreesRegressor cell.
RF = RandomForestRegressor(random_state=0)
# y_train is a single-column frame; flatten it to 1-D, the shape scikit-learn
# expects for a single-output regressor.
RF.fit(X_train, np.ravel(y_train))
# Test-set features: same columns, same order as the training features.
test = test_df[['Store', 'Dept', 'IsHoliday', 'Size', 'Type', 'Week', 'Year', 'Day']].copy()
predict_rf = RF.predict(test)

In [None]:
# Extra-trees model (bootstrapped, seeded), scored on the same test features.
ETR = ensemble.ExtraTreesRegressor(random_state=0, bootstrap=True).fit(X_train, y_train)
predict_etr = ETR.predict(test)

In [None]:
# Simple blend: element-wise mean of the two models' predictions.
avg_preds = np.add(predict_rf, predict_etr) / 2

In [None]:
# Slim frame holding the identifiers plus the blended prediction.
# .copy() makes this an independent frame, so adding a column below is not an
# assignment onto a slice of test_df (the SettingWithCopy pattern).
test_strip = test_df[['Store', 'Dept', 'Date', 'Week', 'Year']].copy()
test_strip['Weekly_Sales'] = avg_preds

In [None]:
def week_51_adj(row, data=None, ratio=1.5, scale=0.85):
    """Scale down a week-51 prediction that is far above the same series' week 52.

    Looks up the week-52 rows in ``data`` that share ``row``'s Store and Dept.
    If ``row.Weekly_Sales`` exceeds ``ratio`` times the median of those rows'
    ``Weekly_Sales``, returns ``row.Weekly_Sales * scale``; otherwise returns
    ``row.Weekly_Sales`` unchanged. Also returns it unchanged when no matching
    week-52 rows exist.

    Parameters
    ----------
    row : object with ``Store``, ``Dept`` and ``Weekly_Sales`` attributes
        Typically a row passed by ``DataFrame.apply(..., axis=1)``.
    data : pandas.DataFrame, optional
        Frame with ``Store``, ``Dept``, ``Week`` and ``Weekly_Sales`` columns.
        Defaults to the notebook-level ``test_strip`` frame (original behavior).
    ratio : float, optional
        Threshold multiplier applied to the week-52 median (default 1.5).
    scale : float, optional
        Factor applied to the prediction when the threshold is exceeded
        (default 0.85).

    Returns
    -------
    float
        The (possibly adjusted) weekly sales value.
    """
    if data is None:
        # Backward-compatible default: read the notebook-level predictions frame.
        data = test_strip
    same_series_wk52 = data[
        (data['Store'] == row.Store) & (data['Dept'] == row.Dept) & (data['Week'] == 52)
    ]
    if same_series_wk52.empty:
        return row.Weekly_Sales
    if row.Weekly_Sales > ratio * same_series_wk52.Weekly_Sales.median():
        return row.Weekly_Sales * scale
    return row.Weekly_Sales
    
def week_52_adj(row, data=None, ratio=1.275, scale=1.2):
    """Scale up a week-52 prediction that is far below the same series' week 51.

    Looks up the week-51 rows in ``data`` that share ``row``'s Store and Dept.
    If ``row.Weekly_Sales * ratio`` is below the median of those rows'
    ``Weekly_Sales``, returns ``row.Weekly_Sales * scale``; otherwise returns
    ``row.Weekly_Sales`` unchanged. Also returns it unchanged when no matching
    week-51 rows exist.

    Parameters
    ----------
    row : object with ``Store``, ``Dept`` and ``Weekly_Sales`` attributes
        Typically a row passed by ``DataFrame.apply(..., axis=1)``.
    data : pandas.DataFrame, optional
        Frame with ``Store``, ``Dept``, ``Week`` and ``Weekly_Sales`` columns.
        Defaults to the notebook-level ``test_strip`` frame (original behavior).
    ratio : float, optional
        Multiplier applied to the row's value before comparing it with the
        week-51 median (default 1.275).
    scale : float, optional
        Factor applied to the prediction when the comparison holds
        (default 1.2).

    Returns
    -------
    float
        The (possibly adjusted) weekly sales value.
    """
    if data is None:
        # Backward-compatible default: read the notebook-level predictions frame.
        data = test_strip
    same_series_wk51 = data[
        (data['Store'] == row.Store) & (data['Dept'] == row.Dept) & (data['Week'] == 51)
    ]
    if same_series_wk51.empty:
        return row.Weekly_Sales
    if row.Weekly_Sales * ratio < same_series_wk51.Weekly_Sales.median():
        return row.Weekly_Sales * scale
    return row.Weekly_Sales

In [None]:
def _adjust_if_week(row, target_week, adjust):
    """Return adjust(row) for rows in target_week, else the row's current Weekly_Sales."""
    if row.Week == target_week:
        return adjust(row)
    return row.Weekly_Sales


# Order matters: the week-52 pass reads the values the week-51 pass has just written.
test_strip['Weekly_Sales'] = test_strip.apply(_adjust_if_week, axis=1, args=(51, week_51_adj))
test_strip['Weekly_Sales'] = test_strip.apply(_adjust_if_week, axis=1, args=(52, week_52_adj))

In [None]:
# Copy the adjusted predictions into the submission template (aligned on the
# row index) and write the CSV without the index column.
submission_path = 'submission.csv'
sample_submission = sample_submission.assign(Weekly_Sales=test_strip['Weekly_Sales'])
sample_submission.to_csv(submission_path, index=False)