In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
train = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip")
test = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip")
stores = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv")
Features = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip")

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Evaluation
from sklearn.metrics import mean_squared_error

In [5]:
# Convert Date columns to datetime
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
Features['Date'] = pd.to_datetime(Features['Date'])

In [6]:
# Merge train with stores on 'Store'
train_data = pd.merge(train, stores, on='Store', how='left')

# Merge with features on 'Store', 'Date', and 'IsHoliday'
train_data = pd.merge(train_data, Features, on=['Store', 'Date', 'IsHoliday'], how='left')


In [7]:
# Merge test with stores on 'Store'
test_data = pd.merge(test, stores, on='Store', how='left')

# Merge with features on 'Store', 'Date', and 'IsHoliday'
test_data = pd.merge(test_data, Features, on=['Store', 'Date', 'IsHoliday'], how='left')


In [8]:
train_data.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,,,,,211.350143,8.106


In [9]:
# First, convert Date to datetime if not already
train_data['Date'] = pd.to_datetime(train_data['Date'])

# Known US holiday week anchors
def get_holiday_name(date):
    year = date.year
    if date == pd.Timestamp(f'{year}-11-23') or date.weekofyear in [47, 48]:
        return 'Thanksgiving'
    elif date.month == 12 and date.day in range(20, 27):
        return 'Christmas_Week'
    elif date.month == 11 and date.day in range(24, 30):
        return 'Black_Friday'
    elif date.month == 2 and date.day in range(10, 20):
        return 'Super_Bowl'
    elif date.month == 9 and date.day < 10:
        return 'Labor_Day'
    elif date.month == 7 and date.day in range(1, 8):
        return 'Independence_Day'
    else:
        return 'None'


In [10]:
train_data['Holiday_Name'] = train_data['Date'].apply(get_holiday_name)
test_data['Holiday_Name'] = test_data['Date'].apply(get_holiday_name)

In [11]:
# If not already
train_data['Date'] = pd.to_datetime(train_data['Date'])

# Feature: how early in the month the date is
train_data['Month_Start_Weight'] = 31 - train_data['Date'].dt.day


In [12]:
# # Group by week (optional if your data is already weekly)
# weekly_weight = (
#     train_data.groupby(['Store', 'Dept', 'Date'])['Month_Start_Weight']
#     .sum()
#     .reset_index()
# )

# # Merge back into your main DataFrame if you have a weekly target
# train_data = pd.merge(train_data, weekly_weight, on=['Store', 'Dept', 'Date'], how='left')


In [13]:
train_data['Month_Start_Weight'] = 31 - train_data['Date'].dt.day


In [14]:
# Ensure Date is datetime
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Apply same transformation
test_data['Month_Start_Weight'] = 31 - test_data['Date'].dt.day


In [15]:
# Define numerical and categorical features
numeric_features = ['Store', 'Size', 'Dept', 'Year', 'Month', 'Week','Month_Start_Weight']
categorical_features = ['IsHoliday', 'Type','Temp_Bin', 'Holiday_Name']

# Numerical pipeline: Impute and scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: One-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Full preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [16]:
# # List of holidays in the dataset
# holidays = pd.to_datetime([
#     '2010-02-12',  # Super Bowl
#     '2010-09-10',  # Labor Day
#     '2010-11-26',  # Thanksgiving
#     '2010-12-31',  # Christmas
#     '2011-02-11',
#     '2011-09-09',
#     '2011-11-25',
#     '2011-12-30',
#     '2012-02-10',
#     '2012-09-07',
#     '2012-11-23',
#     '2012-12-28'
# ])


In [17]:
# def days_to_nearest_holiday(date):
#     return (holidays - date).days.min()

In [18]:
# Convert 'Date' to datetime and extract features
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data['Year'] = test_data['Date'].dt.year
test_data['Month'] = test_data['Date'].dt.month
test_data['Week'] = test_data['Date'].dt.isocalendar().week

In [19]:
# Extract date features
train_data['Date'] = pd.to_datetime(train_data['Date'])
train_data['Year'] = train_data['Date'].dt.year
train_data['Month'] = train_data['Date'].dt.month
train_data['Week'] = train_data['Date'].dt.isocalendar().week


In [20]:
train_data.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,...,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Holiday_Name,Month_Start_Weight,Year,Month,Week
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,...,,,,211.096358,8.106,,26,2010,2,5
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,...,,,,211.24217,8.106,Super_Bowl,19,2010,2,6
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,...,,,,211.289143,8.106,Super_Bowl,12,2010,2,7
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,...,,,,211.319643,8.106,,5,2010,2,8
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,...,,,,211.350143,8.106,,26,2010,3,9


In [21]:
# # Apply to training data
# train_data['Days_To_Holiday'] = train_data['Date'].apply(days_to_nearest_holiday)
# train_data['Is_Before_Holiday'] = (train_data['Days_To_Holiday'] <= 7).astype(int)
# train_data['Is_After_Holiday'] = (train_data['Days_To_Holiday'] == 0).astype(int)

# # Apply to test data
# test_data['Days_To_Holiday'] = test_data['Date'].apply(days_to_nearest_holiday)
# test_data['Is_Before_Holiday'] = (test_data['Days_To_Holiday'] <= 7).astype(int)
# test_data['Is_After_Holiday'] = (test_data['Days_To_Holiday'] == 0).astype(int)


In [44]:
train_data['MarkDownsSum'] = (
    train_data[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']]
    .fillna(0)
    .sum(axis=1)
)

test_data['MarkDownsSum'] = (
    test_data[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']]
    .fillna(0)
    .sum(axis=1)
)


In [23]:
# Add Day of Week and IsWeekend to training data
train_data['DayOfWeek'] = train_data['Date'].dt.dayofweek  # Monday = 0, Sunday = 6
train_data['IsWeekend'] = train_data['DayOfWeek'].isin([5, 6]).astype(int)

# Add Day of Week and IsWeekend to test data
test_data['DayOfWeek'] = test_data['Date'].dt.dayofweek
test_data['IsWeekend'] = test_data['DayOfWeek'].isin([5, 6]).astype(int)

In [24]:
# Make sure 'Date' is datetime and sorted
train_data = train_data.sort_values(['Store', 'Dept', 'Date'])

# Compute week-to-week temperature difference
train_data['Temperature_Diff'] = train_data.groupby(['Store'])['Temperature'].diff().fillna(0)


# Make sure 'Date' is datetime and sorted
test_data = test_data.sort_values(['Store', 'Dept', 'Date'])

# Compute week-to-week temperature difference
test_data['Temperature_Diff'] = test_data.groupby(['Store'])['Temperature'].diff().fillna(0)



In [25]:
# Compute week-to-week fuel price change
train_data['Fuel_Price_Diff'] = train_data.groupby(['Store'])['Fuel_Price'].diff().fillna(0)

# Compute week-to-week fuel price change
test_data['Fuel_Price_Diff'] = test_data.groupby(['Store'])['Fuel_Price'].diff().fillna(0)


In [26]:
# Create temperature bins for training data
train_data['Temp_Bin'] = pd.cut(train_data['Temperature'], bins=[-np.inf, 40, 70, np.inf], labels=['Cold', 'Moderate', 'Hot'])
# Create temperature bins for test data
test_data['Temp_Bin'] = pd.cut(test_data['Temperature'], bins=[-np.inf, 40, 70, np.inf], labels=['Cold', 'Moderate', 'Hot'])


In [27]:
train_data['CPI_Diff'] = train_data.groupby('Store')['CPI'].diff().fillna(0)

In [28]:
test_data['CPI_Diff'] = test_data.groupby('Store')['CPI'].diff().fillna(0)

In [45]:
features = ['Store', 'Dept', 'IsHoliday', 'Type', 'Size', 'Year', 'Month', 'Week',
            'Temperature_Diff', 'Fuel_Price_Diff', 'CPI_Diff', 'MarkDownsSum',
            'CPI','DayOfWeek','IsWeekend','Holiday_Name','Month_Start_Weight']

In [40]:
train_data.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,...,Year,Month,Week,MarkdownsSum,DayOfWeek,IsWeekend,Temperature_Diff,Fuel_Price_Diff,Temp_Bin,CPI_Diff
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,...,2010,2,5,,4,0,0.0,0.0,Moderate,0.0
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,...,2010,2,6,,4,0,-3.8,-0.024,Cold,0.145812
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,...,2010,2,7,,4,0,1.42,-0.034,Cold,0.046973
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,...,2010,2,8,,4,0,6.7,0.047,Moderate,0.0305
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,...,2010,3,9,,4,0,-0.13,0.064,Moderate,0.0305


In [31]:
print(train_data.columns[-5:])  # Just to confirm

Index(['IsWeekend', 'Temperature_Diff', 'Fuel_Price_Diff', 'Temp_Bin',
       'CPI_Diff'],
      dtype='object')


In [46]:
X = train_data[features]
y = train_data['Weekly_Sales']

In [None]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor
# from sklearn.pipeline import Pipeline

# # Create the pipeline
# xgb_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),  # your earlier preprocessing block
#     ('regressor', XGBRegressor(
#         random_state=42,
#         tree_method='hist',  # Faster
#         verbosity=1
#     ))
# ])

# # Define the parameter grid
# param_grid = {
#     'regressor__n_estimators': [100, 300, 500],
#     'regressor__learning_rate': [0.01, 0.05, 0.1],
#     'regressor__max_depth': [4, 6, 8]
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(xgb_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=-1)

# # Run Grid Search
# grid_search.fit(X_train, y_train)

# # Best Parameters
# print("Best Parameters:", grid_search.best_params_)

# # Predict on validation set with best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_val)

# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(y_val, y_pred))
# print(f'Validation RMSE after tuning: {rmse:.2f}')


In [None]:
# print("Best Parameters:", grid_search.best_params_)

In [37]:
import xgboost as xgb

# You can alias if you want
xgbr = xgb.XGBRegressor


In [47]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#from sklearn.pipeline import Pipeline

# Build pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # your earlier preprocessing block
    ('regressor', xgbr(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=8,
        random_state=42,
        tree_method='hist',  # Faster for large datasets
        verbosity=1
    ))
])

# Train the model
xgb_pipeline.fit(X_train, y_train)

# Predict on validation set
y_pred = xgb_pipeline.predict(X_val)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse:.2f}')

Validation RMSE: 3101.89


In [None]:
# Merge test with stores data
# test_data = pd.merge(test, stores, on='Store')

# Convert 'Date' to datetime and extract features
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data['Year'] = test_data['Date'].dt.year
test_data['Month'] = test_data['Date'].dt.month
test_data['Week'] = test_data['Date'].dt.isocalendar().week

# Select features
#X_test = test_data[features]


In [None]:
# Select features
X_test = test_data[features]

In [52]:
model = xgb_pipeline.named_steps['regressor']

In [53]:
importances = model.feature_importances_

# Show feature importance
for col, score in sorted(zip(X.columns, importances), key=lambda x: x[1], reverse=True):
    print(f"{col}: {score:.4f}")


IsHoliday: 0.2067
Fuel_Price_Diff: 0.1811
Dept: 0.1235
CPI_Diff: 0.0862
IsWeekend: 0.0675
Temperature_Diff: 0.0457
Store: 0.0380
Size: 0.0254
Year: 0.0206
CPI: 0.0112
Month: 0.0056
MarkDownsSum: 0.0052
DayOfWeek: 0.0047
Week: 0.0046
Type: 0.0042
Holiday_Name: 0.0026
Month_Start_Weight: 0.0020
Temp_Bin: 0.0000


In [None]:
# Predict on test set
test_predictions = xgb_pipeline.predict(X_test)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'Id': test_data['Store'].astype(str) + '_' + test_data['Dept'].astype(str) + '_' + test_data['Date'].dt.strftime('%Y-%m-%d'),
    'Weekly_Sales': test_predictions
})

# Save submission file
submission.to_csv('submission.csv', index=False)
print('Submission file saved!')


## 