In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/store-sales-time-series-forecasting/train.csv',
        '../input/store-sales-time-series-forecasting/test.csv',
    )
)

In [None]:
# Preview the freshly loaded test frame (the bare last expression is the cell output).
test

In [None]:
# Load the holiday/event calendar and tag every row with a constant flag so that,
# after a left join, matched rows can be told apart from unmatched (NaN) ones.
holiday = pd.read_csv(
    '../input/store-sales-time-series-forecasting/holidays_events.csv'
).assign(is_holiday=1)
holiday

In [None]:
# Parse the 'date' column of all three frames so they compare and join as timestamps.
for frame in (holiday, train, test):
    frame['date'] = pd.to_datetime(frame['date'])

# Keep only holidays inside the span covered by train (start) through test (end);
# Series.between is inclusive on both ends.
first_day = train['date'].min()
last_day = test['date'].max()
relevant_holidays = holiday[holiday['date'].between(first_day, last_day)]
relevant_holidays

In [None]:
# Collapse the holiday table to one row per date before joining. The join key is
# 'date' alone, so if a date carries several events the merge would emit one output
# row per event and silently duplicate the matching train/test rows (and their ids).
# When several locales share a date, keep the widest-reaching one.
# NOTE(review): assumes locale values are 'National' / 'Regional' / 'Local' — confirm
# against the data; any other value is ranked last.
locale_priority = {'National': 0, 'Regional': 1, 'Local': 2}
holiday_by_date = (
    relevant_holidays[['date', 'is_holiday', 'locale']]
    .assign(
        _priority=lambda events: events['locale']
        .map(locale_priority)
        .fillna(len(locale_priority))
    )
    .sort_values(['date', '_priority'])
    .drop_duplicates(subset='date', keep='first')
    .drop(columns='_priority')
)

# validate='many_to_one' makes pandas raise if the right-hand keys are not unique,
# so the row counts of train and test cannot change in this step.
train = train.merge(holiday_by_date, on='date', how='left', validate='many_to_one')
test = test.merge(holiday_by_date, on='date', how='left', validate='many_to_one')

# Dates without a holiday come out of the left join as NaN; give them explicit values.
train['is_holiday'] = train['is_holiday'].fillna(0)
test['is_holiday'] = test['is_holiday'].fillna(0)
train['locale'] = train['locale'].fillna('NoHoliday')
test['locale'] = test['locale'].fillna('NoHoliday')

In [None]:
# Discard training rows whose sales exceed 100000 (treated as outliers).
train = train.loc[train['sales'].le(100000)]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Give each boxplot its own axes. Calling sns.boxplot twice without an `ax` draws
# both onto the same current axes, so the second plot overlays the first.
fig, (ax_flag, ax_locale) = plt.subplots(1, 2, figsize=(14, 6))

sns.boxplot(x='is_holiday', y='sales', data=train, ax=ax_flag)
ax_flag.set_title('Sales Distribution by Holiday Flag')

sns.boxplot(x='locale', y='sales', data=train, ax=ax_locale)
ax_locale.set_title('Sales Distribution by Holiday Locale')

fig.tight_layout()
plt.show()

In [None]:
# Attach store metadata to every row, then discard the columns that are not used
# as features ('state' from stores, and the 'is_holiday' flag from the holiday join).
stores = pd.read_csv('../input/store-sales-time-series-forecasting/stores.csv')
unused_columns = ['state', 'is_holiday']

train = train.merge(stores, on='store_nbr', how='left').drop(columns=unused_columns)
test = test.merge(stores, on='store_nbr', how='left').drop(columns=unused_columns)

In [None]:
# 'cluster' is not used as a feature; remove it from both frames.
train = train.drop(columns='cluster')
test = test.drop(columns='cluster')

In [None]:
import matplotlib.pyplot as plt

# One boxplot of sales per store attribute. The first plot groups by 'city', so its
# title says "City" (the original title said "State", which did not match the x-axis).
for column, title in (
    ('city', 'Sales Distribution by City'),
    ('type', 'Sales Distribution by Store Type'),
):
    plt.figure(figsize=(14, 6))
    sns.boxplot(x=column, y='sales', data=train)
    plt.title(title)
    plt.show()


In [None]:
# Preview the training frame after the holiday and store merges.
train

In [None]:
def _expand_date(frame):
    """Replace the 'date' column with day, month, year and weekday columns."""
    stamp = frame['date'].dt
    return frame.assign(
        day=stamp.day,
        month=stamp.month,
        year=stamp.year,
        weekday=stamp.weekday,
    ).drop(columns=['date'])


# Apply the same calendar decomposition to both frames so their columns stay aligned.
train = _expand_date(train)
test = _expand_date(test)


In [None]:
# Train only on 2015 onwards, then plot the mean sales per remaining year.
train = train.loc[train['year'].ge(2015)]
sns.barplot(data=train, x='year', y='sales')

In [None]:
import category_encoders as ce

# Columns replaced by their target-encoded value.
categorical_features = ['store_nbr', 'family', 'city', 'type', 'month', 'weekday','locale']

# Fit the encoder on the training rows (with 'sales' as the target) and reuse the
# fitted mapping for the test rows.
# NOTE(review): the encoder is fit on all of `train` before the train/validation
# split performed in a later cell, so validation rows contribute to their own
# encodings — validation scores may be optimistic.
encoder = ce.TargetEncoder(cols=categorical_features)
train_encoded = encoder.fit_transform(train[categorical_features], train['sales'])
test_encoded = encoder.transform(test[categorical_features])

# Overwrite the raw categorical columns with their encoded counterparts.
for frame, encoded in ((train, train_encoded), (test, test_encoded)):
    for column in categorical_features:
        frame[column] = encoded[column]

# 'year' and 'day' are not used as model features.
unused_date_parts = ['year', 'day']
train = train.drop(columns=unused_date_parts)
test = test.drop(columns=unused_date_parts)

In [None]:
# Copy the encoded values into train/test again, matching on index and column name.
# NOTE(review): the previous cell already assigned train_encoded / test_encoded into
# these same columns, so this looks redundant — confirm and consider removing.
train.update(train_encoded)  
test.update(test_encoded) 

In [None]:
# Preview the training frame after target encoding.
train

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features; 'id' is a row identifier, not a feature.
target_column = 'sales'
y = train[target_column]
X = train.drop(columns=[target_column, 'id'])

# Random 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

# Fit a depth-limited random forest on the training split.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Floor predictions at zero: sales cannot be negative and the log-based metric
# rejects negative inputs. np.clip does this in one vectorised call.
rf_pred = np.clip(rf.predict(X_val), 0, None)

# Take the square root explicitly instead of passing squared=False: that keyword is
# deprecated/removed in newer scikit-learn releases, and this matches how the
# ensemble cell computes its RMSLE.
rf_rmsle = np.sqrt(mean_squared_log_error(y_val, rf_pred))
print("Random Forest RMSLE:", rf_rmsle)

In [None]:
from xgboost import XGBRegressor

# Fit a gradient-boosted tree model on the training split.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.05, subsample=0.6, max_depth=10, random_state=42)
xgb.fit(X_train, y_train)

# Floor predictions at zero: sales cannot be negative and the log-based metric
# rejects negative inputs. np.clip does this in one vectorised call.
xgb_pred = np.clip(xgb.predict(X_val), 0, None)

# Take the square root explicitly instead of passing squared=False: that keyword is
# deprecated/removed in newer scikit-learn releases, and this matches how the
# ensemble cell computes its RMSLE.
xgb_rmsle = np.sqrt(mean_squared_log_error(y_val, xgb_pred))
print("XGBoost RMSLE:", xgb_rmsle)

In [None]:
# Make sure both validation predictions are arrays so they combine element-wise.
rf_pred, xgb_pred = np.array(rf_pred), np.array(xgb_pred)

# Blend weights: 80% random forest, 20% XGBoost.
w_rf, w_xgb = 0.8, 0.2
y_pred_ensemble_weighted = (w_rf * rf_pred) + (w_xgb * xgb_pred)

# RMSLE of the blended validation prediction.
msle_ensemble = mean_squared_log_error(y_val, y_pred_ensemble_weighted)
rmsle_ensemble = np.sqrt(msle_ensemble)
print(f'Ensembled RMSLE: {rmsle_ensemble:.4f}')


In [None]:
# Preview the fully prepared test frame before predicting on it.
test

In [None]:
# Keep an untouched copy that still carries 'id' for the submission file.
test1 = test.copy()

# Build the feature matrix WITHOUT rebinding `test`: the original cell dropped 'id'
# from `test` itself, so running it a second time failed. Selecting X_train's columns
# also guarantees the features arrive in the order the models were fitted on.
test_features = test1.drop(columns=['id'])[list(X_train.columns)]

# Predict with both models and floor at zero (sales cannot be negative).
rf_test_pred = np.clip(rf.predict(test_features), 0, None)
xgb_test_pred = np.clip(xgb.predict(test_features), 0, None)

# Blend with the same weights used on the validation split.
test_pred_ensemble = w_rf * rf_test_pred + w_xgb * xgb_test_pred

# Write the submission file in the competition's (id, sales) format.
submission = pd.DataFrame({'id': test1['id'], 'sales': test_pred_ensemble})
submission.to_csv('submission.csv', index=False)
print("Submission file generated: submission.csv")
