In [6]:
import sys
import os

sys.path.append(os.path.abspath("../src"))

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from preprocessing import preprocess_pipeline


In [8]:
# Run Preprocessing Pipeline
data_path = '../data/store-sales-time-series-forecasting/'
df = preprocess_pipeline(data_path)
df.head()

# Select features and target
features = ['store_nbr', 'family', 'onpromotion', 'dayofweek', 'month', 'year', 'day', 'is_weekend']
X = df[features]
y = df['sales']

# Train Ridge Regression model
model = Ridge(alpha=1.0)
model.fit(X, y)

# Load and prepare test data
test = pd.read_csv(f'{data_path}/test.csv', parse_dates=['date'])
test['dayofweek'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year
test['day'] = test['date'].dt.day
test['is_weekend'] = test['dayofweek'].isin([5, 6]).astype(int)
test['family'] = test['family'].astype('category').cat.codes
test['store_nbr'] = test['store_nbr'].astype('category')
X_test = test[features]

# Make predictions and reverse log transform
y_pred = model.predict(X_test)
y_pred = np.expm1(y_pred)
y_pred = np.clip(y_pred, 0, None)

# Format submission
submission = pd.DataFrame({
    'id': test['id'],
    'sales': y_pred
})
submission.to_csv('submission.csv', index=False)
submission.head()


  df.fillna(method='bfill', inplace=True)
  y_pred = np.expm1(y_pred)


Unnamed: 0,id,sales
0,3000888,1.4964779999999999e+137
1,3000889,2.628395e+132
2,3000890,1.144484e+161
3,3000891,inf
4,3000892,1.424137e+118


In [9]:
from xgboost import XGBRegressor

# Convert categorical features to numeric codes for XGBoost
X_xgb = X.copy()
X_test_xgb = X_test.copy()

for col in X_xgb.select_dtypes(include='category').columns:
    X_xgb[col] = X_xgb[col].cat.codes

for col in X_test_xgb.select_dtypes(include='category').columns:
    X_test_xgb[col] = X_test_xgb[col].cat.codes

'''# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_xgb, y)'''


from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_log_error, make_scorer

# Define RMSLE scorer
def rmsle(y_true, y_pred):
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# TimeSeriesSplit CV
tscv = TimeSeriesSplit(n_splits=5)

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

cv_scores = cross_val_score(xgb_model, X_xgb, y, cv=tscv, scoring=rmsle_scorer)

print("Cross-validated RMSLE scores:", -cv_scores)
print("Mean RMSLE:", -np.mean(cv_scores))

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [4, 6, 8]
}

grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=tscv,
    scoring=rmsle_scorer,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_xgb, y)

print("Best RMSLE score:", -grid.best_score_)
print("Best parameters:", grid.best_params_)

# Final model
xgb_model = grid.best_estimator_


# Predict with XGBoost
xgb_preds = xgb_model.predict(X_test_xgb)
# xgb_preds = np.expm1(xgb_preds)
xgb_preds = np.clip(xgb_preds, 0, None)

# Save XGBoost submission
submission_xgb = pd.DataFrame({
    'id': test['id'],
    'sales': xgb_preds
})
submission_xgb.to_csv('submission_xgb.csv', index=False)
submission_xgb.head()


Cross-validated RMSLE scores: [1.83748724 2.26070883 1.83960402 1.81417889 1.7781258 ]
Mean RMSLE: 1.9060209547159754
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best RMSLE score: 1.6515058556725566
Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 300}


Unnamed: 0,id,sales
0,3000888,12.20196
1,3000889,8.397295
2,3000890,0.0
3,3000891,2265.356689
4,3000892,18.902685
