We are provided with historical sales data for 1,115 Rossmann stores. The task is to forecast the "Sales" column for the test set. Some stores in the dataset were temporarily closed for refurbishment.

Files
- train.csv - historical data including Sales
- test.csv - historical data excluding Sales
- sample_submission.csv - a sample submission file in the correct format
- store.csv - supplemental information about the stores

- Id - an Id that represents a (Store, Date) pair within the test set
- Store - a unique Id for each store
- Sales - the turnover for any given day (this is what you are predicting)
- Customers - the number of customers on a given day
- Open - an indicator for whether the store was open: 0 = closed, 1 = open
- StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
- SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
- StoreType - differentiates between 4 different store models: a, b, c, d
- Assortment - describes an assortment level: a = basic, b = extra, c = extended
- CompetitionDistance - distance in meters to the nearest competitor store
- CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
- Promo - indicates whether a store is running a promo on that day
- Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
- Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
- PromoInterval - describes the consecutive intervals at which Promo2 is started, naming the months in which the promotion starts anew. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import model_selection

import datetime

import warnings
warnings.filterwarnings('ignore')

In [None]:
df_train = pd.read_csv('../input/rossmann-store-sales/train.csv')

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.isnull().sum()

In [None]:
df_train['DayOfWeek'].value_counts()

In [None]:
len(df_train['Store'].unique())

In [None]:
print('Open:\n', df_train['Open'].value_counts(), '\n\n')
print('Promo\n', df_train['Promo'].value_counts(), '\n\n')
print('State Holiday\n', df_train['StateHoliday'].value_counts(), '\n\n')
print('School Holiday\n', df_train['SchoolHoliday'].value_counts())

In [None]:
df_train[df_train['StateHoliday'] == 0]['StateHoliday'].value_counts()

In [None]:
df_train[df_train['StateHoliday'] == '0']['StateHoliday'].value_counts()

In [None]:
# The two value_counts checks above look for both the integer 0 and the
# string '0' in this column; fold the string form into the integer so that
# "no holiday" has a single representation.
df_train['StateHoliday'] = df_train['StateHoliday'].map(
    lambda flag: flag if flag != '0' else 0
)
df_train['StateHoliday'].value_counts()

In [None]:
df_train.info()

In [None]:
df_train.info()

In [None]:
df_store = pd.read_csv('../input/rossmann-store-sales/store.csv')
df_store.head()

In [None]:
df_store.shape

In [None]:
df_store.isnull().sum()

In [None]:
df_store.info()

In [None]:
df_store['StoreType'].value_counts()

In [None]:
df_store['Assortment'].value_counts()

In [None]:
# Encode the store-model letter as an integer: a=1, b=2, c=3, anything else=4.
store_type_codes = {'a': 1, 'b': 2, 'c': 3}
df_store['StoreType'] = df_store['StoreType'].apply(
    lambda letter: store_type_codes.get(letter, 4)
)
df_store['StoreType'].value_counts()

In [None]:
# Encode the assortment letter as an integer: a=1, b=2, anything else=3.
assortment_codes = {'a': 1, 'b': 2}
df_store['Assortment'] = df_store['Assortment'].apply(
    lambda level: assortment_codes.get(level, 3)
)
df_store['Assortment'].value_counts()

In [None]:
max(df_store['CompetitionDistance'])

In [None]:
# Missing competitor distances are replaced with the largest observed distance.
# Series.max() skips NaN; the builtin max() compares element by element and
# would return NaN if the first value in the column happened to be missing.
largest_distance = df_store['CompetitionDistance'].max()
df_store['CompetitionDistance'] = df_store['CompetitionDistance'].fillna(largest_distance)
df_store.info()

In [None]:
def mapping(features):
    """Copy per-store columns from ``df_store`` onto ``df_train``.

    For each name in ``features`` a Store -> value lookup is built from
    ``df_store`` and applied to ``df_train['Store']``. The result is written
    to ``df_train`` in place under the same column name; nothing is returned.
    """
    # drop=False keeps 'Store' available as a regular column as well.
    stores_by_id = df_store.set_index('Store', drop=False)
    for column in features:
        lookup = stores_by_id[column].to_dict()
        df_train[column] = df_train['Store'].map(lookup)

In [None]:
mapping(['StoreType', 'Assortment', 'CompetitionDistance'])

In [None]:
df_train[df_train['Store'] == 1].head(10)

In [None]:
df_train[df_train['Sales'] == 0]

In [None]:
df_train[df_train['Open'] == 0]

In [None]:
df_train[df_train['Open'] == 0]['Sales'].value_counts()

In [None]:
# Keep only the rows where the store was open. The explicit copy makes
# df_train an independent frame, so the in-place drops and column assignments
# in later cells do not operate on a slice of the original data.
df_train = df_train[df_train['Open'] == 1].copy()
df_train.shape

In [None]:
df_train['Open'].value_counts()

In [None]:
df_train.drop('Open', inplace = True, axis = 1)
df_train.shape

In [None]:
print('Promo\n', df_train['Promo'].value_counts(), '\n\n')
print('State Holiday\n', df_train['StateHoliday'].value_counts(), '\n\n')
print('School Holiday\n', df_train['SchoolHoliday'].value_counts())

In [None]:
# Encode the holiday letters as integers (a=1, b=2, c=3); any other value
# is passed through unchanged.
holiday_codes = {'a': 1, 'b': 2, 'c': 3}
df_train['StateHoliday'] = df_train['StateHoliday'].apply(
    lambda flag: holiday_codes.get(flag, flag)
)
df_train['StateHoliday'].value_counts()

In [None]:
df_train.info()

In [None]:
# Day of the year parsed from the 'YYYY-MM-DD' date strings in one vectorised
# pass instead of a Python-level strptime call per row. The cast keeps the
# column as int64 regardless of the pandas version.
df_train['DayOfYear'] = (
    pd.to_datetime(df_train['Date'], format='%Y-%m-%d').dt.dayofyear.astype('int64')
)
df_train.head(10)

In [None]:
df_train['Date'] = pd.to_datetime(df_train['Date'], format = '%Y-%m-%d')

In [None]:
df_train['Year'] = df_train['Date'].map(lambda x: x.year)
df_train.head()

In [None]:
# Remove the Date and Customers columns in a single call.
df_train.drop(columns=['Date', 'Customers'], inplace=True)
df_train.shape

In [None]:
df_train.head()

In [None]:
sns.distplot(df_train['Sales'])
plt.show()

In [None]:
sns.barplot(x = df_train['StoreType'], y = df_train['Sales'])
plt.show()

In [None]:
sns.barplot(x = df_train['Assortment'], y = df_train['Sales'])
plt.show()

In [None]:
sns.barplot(x = df_train['Promo'], y = df_train['Sales'])
plt.show()

In [None]:
sns.barplot(x = df_train['StateHoliday'], y = df_train['Sales'])
plt.show()

In [None]:
sns.barplot(x = df_train['SchoolHoliday'], y = df_train['Sales'])
plt.show()

In [None]:
df_train.sample()

In [None]:
X = df_train.drop('Sales', axis = 1)
y = df_train['Sales']

In [None]:
X.shape, y.shape

In [None]:
# Two-stage random split: hold out 10% of the rows as the test set, then take
# 10% of the remaining rows as the validation set. Both stages use the same
# settings.
split_options = {'test_size': 0.1, 'random_state': 53}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

## Scaling

In [None]:
scaler = preprocessing.StandardScaler()

In [None]:
X_train_scalled = scaler.fit_transform(X_train)

X_val_scalled = scaler.transform(X_val)

X_test_scalled = scaler.transform(X_test)

# Linear Regression

In [None]:
linreg = LinearRegression()

In [None]:
linreg.fit(X_train_scalled, y_train)

In [None]:
y_val_pred = linreg.predict(X_val_scalled)

In [None]:
y_train_pred = linreg.predict(X_train_scalled)

In [None]:
data = pd.DataFrame({'Actual':y_val, 'Predicted':y_val_pred})
data

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
sns.scatterplot(x = y_train, y = y_train_pred)
plt.show()

In [None]:
sns.scatterplot(x = y_val, y = y_val_pred)
plt.show()

In [None]:
# Error metrics for the linear-regression predictions on the validation split.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = pd.DataFrame()

In [None]:
def evaluation_df(method, mae, mse, rmse, evaluation):
    """Append one model's error metrics to the running comparison table.

    Parameters
    ----------
    method : str
        Label for the model; stored in the 'Method' column.
    mae, mse, rmse : float
        Metric values stored in the 'MAE', 'MSE' and 'RMSE' columns.
    evaluation : pandas.DataFrame or None
        Table of earlier results. May be empty (or None) for the first call.

    Returns
    -------
    pandas.DataFrame
        A new table with columns Method, MAE, MSE, RMSE and one row per
        recorded model, indexed 0..n-1. The input table is not modified.
    """
    columns = ['Method', 'MAE', 'MSE', 'RMSE']
    new_row = pd.DataFrame({'Method': [method], 'MAE': [mae], 'MSE': [mse], 'RMSE': [rmse]})

    # With nothing recorded yet there is nothing to concatenate with; this
    # also avoids concatenating against an empty frame.
    if evaluation is None or evaluation.empty:
        return new_row[columns]

    # ignore_index gives each model its own row label instead of repeating 0.
    return pd.concat([evaluation, new_row], ignore_index=True)[columns]

In [None]:
evaluation = evaluation_df('Linear Regression', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
fig = plt.figure()
sns.distplot((y_train - y_train_pred), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)
plt.xlabel('Errors', fontsize = 18)

In [None]:
fig = plt.figure()
sns.distplot((y_val - y_val_pred), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)
plt.xlabel('Errors', fontsize = 18)

# XGBoost

In [None]:
!pip install xgboost

In [None]:
%%time
xgbreg = xgb.XGBRegressor()
xgbreg.fit(X_train_scalled, y_train)

In [None]:
xgbreg.score(X_train_scalled, y_train)

In [None]:
y_train_pred = xgbreg.predict(X_train_scalled)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
y_val_pred = xgbreg.predict(X_val_scalled)

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
# Error metrics for the default XGBoost model on the validation split.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = evaluation_df('Extreme Gradient Boosting', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
# parameters = {'learning_rate': [0.35, 0.375, 0.4, 0.425, 0.45],
#               'max_depth': [3, 4, 5, 6, 7],
#               'min_child_weight': [15, 18, 20, 22, 25],
#               'n_estimators': [80, 90, 100, 110, 120]}

In [None]:
# reg_xgb = model_selection.GridSearchCV(estimator=xgbreg,
#                                        param_grid=parameters,
#                                        n_jobs=-1,
#                                        cv=3,
#                                        refit=True)

# reg_xgb.fit(X_train_scalled, y_train)


# OUTPUT

# GridSearchCV(cv=3,
#              estimator=XGBRegressor(base_score=0.5, booster='gbtree',
#                                     colsample_bylevel=1, colsample_bynode=1,
#                                     colsample_bytree=1, gamma=0, gpu_id=-1,
#                                     importance_type='gain',
#                                     interaction_constraints='',
#                                     learning_rate=0.300000012, max_delta_step=0,
#                                     max_depth=6, min_child_weight=1,
#                                     missing=nan, monotone_constraints='()',
#                                     n_estimators=100, n_jobs=16,
#                                     num_parallel_tree=1, random_state=0,
#                                     reg_alpha=0, reg_lambda=1,
#                                     scale_pos_weight=1, subsample=1,
#                                     tree_method='exact', validate_parameters=1,
#                                     verbosity=None),
#              n_jobs=-1,
#              param_grid={'learning_rate': [0.35, 0.375, 0.4, 0.425, 0.45],
#                          'max_depth': [3, 4, 5, 6, 7],
#                          'min_child_weight': [15, 18, 20, 22, 25],
#                          'n_estimators': [80, 90, 100, 110, 120]})

In [None]:
# print(reg_xgb.best_params_)
# print(reg_xgb.best_score_)
# print(reg_xgb.best_estimator_)


# OUTPUT



# {'learning_rate': 0.45, 'max_depth': 7, 'min_child_weight': 15, 'n_estimators': 120}
# 0.9136767096687833
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
#              importance_type='gain', interaction_constraints='',
#              learning_rate=0.45, max_delta_step=0, max_depth=7,
#              min_child_weight=15, missing=nan, monotone_constraints='()',
#              n_estimators=120, n_jobs=16, num_parallel_tree=1, random_state=0,
#              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
#              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# print('r2_score on test data =', metrics.r2_score(y_true=y_val,
#                                                   y_pred=reg_xgb.predict(X_val_scalled),
#                                                   multioutput='variance_weighted'))



# OUTPUT


# r2_score on test data = 0.9116415466815296

In [None]:
%%time
xgbreg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.45, max_delta_step=0, max_depth=7,
             min_child_weight=15, monotone_constraints='()',
             n_estimators=120, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)


xgbreg.fit(X_train_scalled, y_train)

In [None]:
xgbreg.score(X_train_scalled, y_train)

In [None]:
y_train_pred = xgbreg.predict(X_train_scalled)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
y_val_pred = xgbreg.predict(X_val_scalled)

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
# Error metrics for the first tuned XGBoost model on the validation split.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = evaluation_df('Extreme Gradient Boosting Tuning 1', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
# parameters = {'learning_rate': [0.1, 0.13, 0.17, 0.2, 0.23, 0.27, 0.3, 0.33],
#               'n_estimators': [200, 300, 400, 600, 900, 1200, 1800, 2200, 3000]}

In [None]:
# reg_xgb = model_selection.GridSearchCV(estimator=xgbreg,
#                                        param_grid=parameters,
#                                        n_jobs=-1,
#                                        cv=3,
#                                        refit=True)

# reg_xgb.fit(X_train_scalled, y_train)


# OUTPUT


# GridSearchCV(cv=3,
#              estimator=XGBRegressor(base_score=0.5, booster='gbtree',
#                                     colsample_bylevel=1, colsample_bynode=1,
#                                     colsample_bytree=1, gamma=0, gpu_id=-1,
#                                     importance_type='gain',
#                                     interaction_constraints='',
#                                     learning_rate=0.300000012, max_delta_step=0,
#                                     max_depth=6, min_child_weight=1,
#                                     missing=nan, monotone_constraints='()',
#                                     n_estimators=100, n_jobs=16,
#                                     num_parallel_tree=1, random_state=0,
#                                     reg_alpha=0, reg_lambda=1,
#                                     scale_pos_weight=1, subsample=1,
#                                     tree_method='exact', validate_parameters=1,
#                                     verbosity=None),
#              n_jobs=-1,
#              param_grid={'learning_rate': [0.1, 0.13, 0.17, 0.2, 0.23, 0.27,
#                                            0.3, 0.33],
#                          'n_estimators': [200, 300, 400, 600, 900, 1200, 1800,
#                                           2200, 3000]})

In [None]:
# print(reg_xgb.best_params_)
# print(reg_xgb.best_score_)
# print(reg_xgb.best_estimator_)


# OUTPUT


# {'learning_rate': 0.3, 'n_estimators': 3000}
# 0.9485298491109558
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
#              importance_type='gain', interaction_constraints='',
#              learning_rate=0.3, max_delta_step=0, max_depth=6,
#              min_child_weight=1, missing=nan, monotone_constraints='()',
#              n_estimators=3000, n_jobs=16, num_parallel_tree=1, random_state=0,
#              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
#              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# print('r2_score on test data =', metrics.r2_score(y_true=y_val,
#                                                   y_pred=reg_xgb.predict(X_val_scalled),
#                                                   multioutput='variance_weighted'))


# OUTPUT


# r2_score on test data = 0.9512953565016975

In [None]:
%%time
xgbreg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=6,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=3000, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

xgbreg.fit(X_train_scalled, y_train)

In [None]:
y_train_pred = xgbreg.predict(X_train_scalled)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
y_val_pred = xgbreg.predict(X_val_scalled)

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
# Error metrics for the second tuned XGBoost model on the validation split.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = evaluation_df('Extreme Gradient Boosting with Tuning 2', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
df_train.head()

In [None]:
df_store.head()

In [None]:
df_store.isnull().sum() * 100 / df_store.shape[0]

In [None]:
mapping(['Promo2'])

In [None]:
df_train.head()

In [None]:
df_train['CompetitionDistanceLog10'] = np.log10(df_train['CompetitionDistance'])
df_train.head()

In [None]:
df_train.drop('CompetitionDistance', inplace = True, axis = 1)
df_train.head()

## Splitting into X, y and train, test

In [None]:
X = df_train.drop(['Sales', 'Store'], axis = 1)
y = df_train['Sales']

In [None]:
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state = 53)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 53)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

In [None]:
%%time
xgbreg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=6,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=3000, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

xgbreg.fit(X_train, y_train)

In [None]:
y_train_pred = xgbreg.predict(X_train)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
y_val_pred = xgbreg.predict(X_val)

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
# Error metrics on the validation split for the model trained on the revised
# feature set.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = evaluation_df('Extreme Gradient Boosting with Change in Data', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
X = df_train.drop(['Sales', 'Year'], axis = 1)
y = df_train['Sales']

In [None]:
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state = 53)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 53)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

In [None]:
%%time
xgbreg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=5,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=4500, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

xgbreg.fit(X_train, y_train)

In [None]:
y_train_pred = xgbreg.predict(X_train)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
y_val_pred = xgbreg.predict(X_val)

In [None]:
r2_score(y_val, y_val_pred)

In [None]:
y_test_pred = xgbreg.predict(X_test)

In [None]:
r2_score(y_test, y_test_pred)

In [None]:
# Error metrics on the validation split for the final XGBoost configuration.
mae = metrics.mean_absolute_error(y_val, y_val_pred)

mse = metrics.mean_squared_error(y_val, y_val_pred)

# RMSE is the square root of the mean *squared* error, so it is derived from
# mse rather than from the absolute error.
rmse = np.sqrt(mse)

print("Mean Absolute Error")
print(mae)
print()

print("Mean Squared Error")
print(mse)
print()

print("Root Mean Squared Error")
print(rmse)

In [None]:
evaluation = evaluation_df('Extreme Gradient Boosting with Change in Data and tuning', mae, mse, rmse, evaluation)

In [None]:
evaluation

In [None]:
X_train.head()

In [None]:
df_test = pd.read_csv('../input/rossmann-store-sales/test.csv')
df_test.head()

In [None]:
df_test.shape

In [None]:
df_test.info()

In [None]:
df_test.isnull().sum()

In [None]:
df_test[df_test['Open'].isnull()]

In [None]:
df_test[df_test['Open'].isnull()]['Date'].value_counts()

In [None]:
df_test[df_test['Store'] == 622]['Open'].value_counts()

#### Since these null value records have no state or school holidays, I'm considering that the store was open on those days

In [None]:
df_test['Open'] = df_test['Open'].fillna(1)
df_test.isnull().sum()

In [None]:
def mapping(features):
    """Copy per-store columns from ``df_store`` onto ``df_test``.

    For each name in ``features`` a Store -> value lookup is built from
    ``df_store`` and applied to ``df_test['Store']``. The result is written
    to ``df_test`` in place under the same column name; nothing is returned.

    NOTE: this replaces the earlier ``mapping`` defined in this notebook,
    which wrote to ``df_train``; after this cell runs, calls target ``df_test``.
    """
    # drop=False keeps 'Store' available as a regular column as well.
    stores_by_id = df_store.set_index('Store', drop=False)
    for column in features:
        lookup = stores_by_id[column].to_dict()
        df_test[column] = df_test['Store'].map(lookup)

In [None]:
mapping(['StoreType', 'Assortment', 'Promo2', 'CompetitionDistance'])

In [None]:
df_test.head()

In [None]:
df_test['CompetitionDistanceLog10'] = np.log10(df_test['CompetitionDistance'])
df_test.head()

In [None]:
df_test.drop('CompetitionDistance', inplace = True, axis = 1)
df_test.head()

In [None]:
# Day of the year parsed from the 'YYYY-MM-DD' date strings in one vectorised
# pass instead of a Python-level strptime call per row. The cast keeps the
# column as int64 regardless of the pandas version.
df_test['DayOfYear'] = (
    pd.to_datetime(df_test['Date'], format='%Y-%m-%d').dt.dayofyear.astype('int64')
)
df_test.head(10)

In [None]:
df_test['Date'] = pd.to_datetime(df_test['Date'], format = '%Y-%m-%d')

In [None]:
df_test['Year'] = df_test['Date'].map(lambda x: x.year)

In [None]:
df_test.drop('Date', inplace = True, axis = 1)

In [None]:
df_test.head()

In [None]:
df_test['StateHoliday'].value_counts()

In [None]:
# Encode the holiday letters as integers (a=1, b=2, c=3); any other value
# is passed through unchanged.
holiday_codes = {'a': 1, 'b': 2, 'c': 3}
df_test['StateHoliday'] = df_test['StateHoliday'].apply(
    lambda flag: holiday_codes.get(flag, flag)
)
df_test['StateHoliday'].value_counts()

In [None]:
df_test['StateHoliday'][0] == '0'

In [None]:
df_test['StateHoliday'] = df_test['StateHoliday'].map(lambda x: 0 if x == '0' else x)
type(df_test['StateHoliday'][0])

In [None]:
df_test.info()

In [None]:
df_train.head(2)

In [None]:
df_test.head(2)

In [None]:
# Split the test rows by opening status. Each subset is an explicit copy so
# that the column assignments and in-place drops applied to them in later
# cells work on independent frames rather than on slices of df_test.
df_test_open = df_test[df_test['Open'] == 1].copy()
df_test_closed = df_test[df_test['Open'] == 0].copy()

In [None]:
df_test_closed['Sales'] = 0

In [None]:
df_test_open.drop('Open', inplace = True, axis = 1)
df_test_closed.drop('Open', inplace = True, axis = 1)

In [None]:
df_test_closed.head()

In [None]:
df_test_open.shape, df_test_closed.shape

In [None]:
df_test_open.drop('Year', inplace = True, axis = 1)
df_test_closed.drop('Year', inplace = True, axis = 1)

In [None]:
df_test_open.head()

In [None]:
X_train.head()

In [None]:
X = df_test_open.drop(['Id'], axis = 1)

In [None]:
X.shape

In [None]:
cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'DayOfYear', 'Promo2', 'CompetitionDistanceLog10']
X = X[cols]

In [None]:
X_pred = xgbreg.predict(X)

In [None]:
X_pred.shape

In [None]:
df_test_open['Sales'] = X_pred

In [None]:
df_test_open.head()

In [None]:
df_test_final = pd.concat([df_test_open, df_test_closed])

In [None]:
df_test_final.shape

In [None]:
df_test_final.head()

In [None]:
sample_sub = pd.read_csv('../input/rossmann-store-sales/sample_submission.csv')
sample_sub.head()

In [None]:
# Build an Id -> Sales lookup from the combined test frame and use it to
# fill the Sales column of the submission template.
temp_dict = df_test_final.set_index('Id')['Sales'].to_dict()
sample_sub['Sales'] = sample_sub['Id'].map(temp_dict)

In [None]:
sample_sub.head(20)

In [None]:
df_test_final.head(20)

In [None]:
sample_sub.shape

In [None]:
sample_sub.to_csv('sample_submission.csv', header = ['Id', 'Sales'], index = False)