In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import randint, uniform
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error, median_absolute_error, explained_variance_score

# Read the four input tables (training set, test set, oil prices, holiday
# events) from CSV files in the current working directory.
train_data, test_data, oil_data, holiday_data = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'oil.csv', 'holidays_events.csv'),
)

# Missing-value strategies reused by the preprocessing pipelines:
# most frequent value for categorical columns, column mean for numerical ones.
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='mean')

# Keep only holiday rows that were NOT transferred, then collect their dates.
# The negated boolean mask replaces the `== False` comparison.
holiday_data = holiday_data[~holiday_data['transferred'].astype(bool)]
holiday_dates = set(holiday_data['date'])

# Attach the oil price to every train/test row by date. A left join keeps all
# sales rows; dates absent from oil_data get NaN in the merged price column.
train_data = pd.merge(train_data, oil_data, on='date', how='left')
test_data = pd.merge(test_data, oil_data, on='date', how='left')

# Binary flag (1/0): is the row's date one of the holiday dates?
# Vectorised `isin` avoids calling a Python lambda once per row, which matters
# on a frame with millions of rows. Both sides are the raw 'date' values as
# read from CSV, so the membership test compares like with like.
train_data['is_holiday'] = train_data['date'].isin(holiday_dates).astype(int)
test_data['is_holiday'] = test_data['date'].isin(holiday_dates).astype(int)

# Forward-fill gaps in the oil price. Assign the result back instead of calling
# `ffill(inplace=True)` on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and does not modify the frame under
# Copy-on-Write. Leading NaNs (no earlier value to carry forward) are left
# as-is and handled later by the numerical imputer in the preprocessing pipeline.
train_data['dcoilwtico'] = train_data['dcoilwtico'].ffill()
test_data['dcoilwtico'] = test_data['dcoilwtico'].ffill()

# Per-type preprocessing.
# Categorical: impute with categorical_imputer, then one-hot encode; categories
# unseen during fit are ignored at transform time (handle_unknown='ignore').
categorical_pipeline = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical: impute with numerical_imputer, then rescale with MinMaxScaler.
numerical_pipeline = Pipeline([
    ('imputer', numerical_imputer),
    ('scaler', MinMaxScaler()),
])

# Route each column group to its pipeline. Columns not listed here are not
# passed to the model (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, ['onpromotion', 'dcoilwtico', 'is_holiday', 'store_nbr']),
    ('cat', categorical_pipeline, ['family']),
])

# Hyperparameter search space. The 'model__' prefix addresses the pipeline step
# named 'model'. scipy's randint(low, high) draws integers from [low, high) and
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_distributions = {
    # wide range since the data set is in the millions of rows
    'model__n_estimators': randint(low=50, high=1000),
    # floats in [0.01, 0.31]
    'model__learning_rate': uniform(loc=0.01, scale=0.3),
    # integers 2..9
    'model__max_depth': randint(low=2, high=10),
    # integers 2..9
    'model__min_samples_split': randint(low=2, high=10),
    # integers 1..9
    'model__min_samples_leaf': randint(low=1, high=10),
    # categorical choice, sampled from the list
    'model__max_features': ['sqrt', 'log2'],
    # fraction of samples used to fit each base learner: floats in [0.5, 1.0]
    'model__subsample': uniform(loc=0.5, scale=0.5),
}

# Split into training and validation sets (80/20) BEFORE fitting the target
# scaler, so the scaler's min/max are learned from training targets only and
# no information from the validation targets leaks into a fitted transformer.
# NOTE(review): this is a random row-wise split; if the rows form a time
# series, a chronological hold-out may give a more honest estimate -- confirm.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop('sales', axis=1),
    train_data[['sales']],
    test_size=0.2,
    random_state=42
)

# Scale the target 'sales' using the range observed in the training split
# (training targets map to 0-1; validation targets may fall slightly outside).
target_scaler = MinMaxScaler()
target_scaler.fit(y_train)

# Full-length scaled target, kept under its original name for compatibility
# with any later code that reads it.
y_train_scaled = target_scaler.transform(train_data[['sales']])

# Scale each split and flatten to 1-D, as expected for model fitting and
# validation.
y_train = target_scaler.transform(y_train).ravel()
y_val = target_scaler.transform(y_val).ravel()


# End-to-end estimator: preprocessing followed by a gradient-boosted regressor.
model = GradientBoostingRegressor(random_state=42)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Randomised hyperparameter search: 10 sampled configurations, each scored by
# 5-fold cross-validation on negative MSE, using all available CPU cores.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# fit() returns the search object itself; with the default refit behaviour the
# best configuration is refitted on the whole training split.
best_model = random_search.fit(X_train, y_train).best_estimator_

print("Best hyperparameters:", random_search.best_params_, sep="\n")

# Validation: predict in the scaled target space, then map predictions and
# ground truth back to the original 'sales' units.
val_predictions_scaled = best_model.predict(X_val)
val_predictions = target_scaler.inverse_transform(val_predictions_scaled.reshape(-1, 1))
y_val_original = target_scaler.inverse_transform(y_val.reshape(-1, 1))

# A boosted-tree regressor is not constrained to the training target range, so
# back-transformed predictions can come out negative. mean_squared_log_error
# raises ValueError on negative inputs, so clip predictions at zero before
# computing any metric (all metrics then describe the same prediction set).
# NOTE(review): assumes 'sales' is non-negative, which MSLE already requires of
# the ground truth -- confirm.
val_predictions = np.clip(val_predictions, 0, None)

rmse = np.sqrt(mean_squared_error(y_val_original, val_predictions))
mae = mean_absolute_error(y_val_original, val_predictions)
r2 = r2_score(y_val_original, val_predictions)
msle = mean_squared_log_error(y_val_original, val_predictions)
median_ae = median_absolute_error(y_val_original, val_predictions)
explained_variance = explained_variance_score(y_val_original, val_predictions)

print(f'Validation RMSE: {rmse}')
print(f'Validation MAE: {mae}')
print(f'Validation R-squared: {r2}')
print(f'Validation MSLE: {msle}')
print(f'Validation Median Absolute Error: {median_ae}')
print(f'Validation Explained Variance Score: {explained_variance}')

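# Final predictions on test data (can be used as needed).
# The model predicts in the scaled target space, so the predictions must be
# mapped back to sales units (and clipped at zero) before they are saved.
# X_test = test_data.drop('id', axis=1)
# test_predictions_scaled = best_model.predict(X_test)
# test_data['sales'] = np.clip(
#     target_scaler.inverse_transform(test_predictions_scaled.reshape(-1, 1)), 0, None
# ).ravel()
# output = test_data[['id', 'sales']]
# output.to_csv('predictions.csv', index=False)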
