In [1]:
# Import necessary libraries and modules
import os
import sys
import warnings

import pandas as pd

# Put the parent of the current working directory on the import path so the
# local `scripts` package resolves (the notebook reads its data from ../data,
# so the parent is presumably the project root).
# The membership check keeps re-runs of this cell from stacking duplicate
# entries on sys.path.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from scripts.logger import Logger
from scripts.data_cleaning import DataCleaner
from scripts.data_processing import DataProcessor
from scripts.model_training import ModelTrainer

# Suppress FutureWarnings
# Installed after the project imports, so warnings raised while importing
# `scripts` are still shown; only later FutureWarnings are silenced.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Initialize logger
# One shared logger for the whole notebook: it is handed to DataCleaner,
# DataProcessor and ModelTrainer in later cells and used directly via logger.log().
# NOTE(review): 'rossmann_analysis.log' is presumably the log file name, resolved
# relative to the notebook's working directory — confirm in scripts/logger.py.
logger = Logger('rossmann_analysis.log')

In [3]:
# Load the raw train, test and store tables from ../data.
logger.log('Loading data...')

# The same parser settings apply to all three CSV files.
read_options = {'low_memory': False, 'index_col': False}

train_data = pd.read_csv('../data/train.csv', **read_options)
test_data = pd.read_csv('../data/test.csv', **read_options)
store_data = pd.read_csv('../data/store.csv', **read_options)

logger.log('Data loaded successfully.')

2024-09-24 20:42:15,908 - INFO - Loading data...
2024-09-24 20:42:16,469 - INFO - Data loaded successfully.


In [4]:
# Initialize DataCleaner
# The cleaner reports its progress through the shared notebook logger.
cleaner = DataCleaner(logger)

# Clean the data
# fill_missing_values takes the raw train/test frames plus the store table and
# returns a (train, test) pair. Per its recorded log output it first merges the
# store data into both frames, then fills CompetitionDistance,
# CompetitionOpenSinceMonth/Year, Promo2SinceWeek/Year and PromoInterval.
# NOTE(review): the result is assigned back to train_data/test_data, so re-running
# this cell alone feeds already-merged frames into the merge again — re-run the
# data-loading cell first. Presumably a second merge would duplicate the store
# columns; confirm against DataCleaner.fill_missing_values.
train_data, test_data = cleaner.fill_missing_values(train_data, test_data, store_data)


2024-09-24 20:42:16,485 - INFO - Merging store data with train and test datasets.


2024-09-24 20:42:16,635 - INFO - Filling missing values in CompetitionDistance.
2024-09-24 20:42:16,665 - INFO - Filling missing values in CompetitionOpenSinceMonth.
2024-09-24 20:42:16,671 - INFO - Filling missing values in CompetitionOpenSinceYear.
2024-09-24 20:42:16,679 - INFO - Filling missing values in Promo2SinceWeek.
2024-09-24 20:42:16,689 - INFO - Filling missing values in Promo2SinceYear.
2024-09-24 20:42:16,697 - INFO - Filling missing values in PromoInterval.
2024-09-24 20:42:16,734 - INFO - Data cleaning complete.


In [5]:
# Initialize DataProcessor for feature engineering and preprocessing
# Built from the cleaned train/test frames and the shared logger. Construction
# produced no log output in the recorded run; the logged work starts when
# processor.preprocess() is called.
processor = DataProcessor(train_data, test_data, logger)

In [6]:
# Preprocess the train data (includes encoding, dropping unnecessary columns, and scaling)
# Per the recorded log this runs feature engineering on the training data, then
# encodes StoreType, Assortment, StateHoliday and PromoInterval, then scales the
# numerical features. Feature engineering dominated the runtime (about four
# minutes in the recorded run).
# NOTE(review): a labelled y_test is returned, so X_test/y_test look like a
# hold-out split of the training data rather than ../data/test.csv — confirm in
# DataProcessor.preprocess.
X_train, X_test, y_train, y_test = processor.preprocess()

2024-09-24 20:42:16,771 - INFO - Starting feature engineering for training data.
2024-09-24 20:46:23,991 - INFO - Encoding StoreType.
2024-09-24 20:46:24,171 - INFO - Encoding Assortment.
2024-09-24 20:46:24,323 - INFO - Encoding StateHoliday.
2024-09-24 20:46:24,456 - INFO - Encoding PromoInterval.
2024-09-24 20:46:24,588 - INFO - Encoding complete.
2024-09-24 20:46:24,804 - INFO - Scaling the numerical features.


In [7]:
# Train the Random Forest model
# ModelTrainer shares the notebook logger. train_model is called with the
# preprocessed training features and targets and took roughly five minutes in
# the recorded run. The same trainer object is used afterwards for evaluate()
# and save_model().
# NOTE(review): no seed or hyperparameters are passed from this cell — confirm
# that ModelTrainer pins random_state so the reported metrics are reproducible.
trainer = ModelTrainer(logger)
trainer.train_model(X_train, y_train)


2024-09-24 20:46:25,369 - INFO - Training the Random Forest model.
2024-09-24 20:51:16,712 - INFO - Model training complete.


In [8]:
# Evaluate the trained model on X_test / y_test.
# evaluate() already logs and prints every metric, so the returned tuple is bound
# to names instead of being left as the cell's last expression: the values stay
# available to later cells without relying on Out[n] / `_`, and the notebook no
# longer echoes them a third time as a raw tuple repr.
# Tuple order, as shown by the recorded output: MSE, MAE, RMSE, R2, MSLE.
mse, mae, rmse, r2, msle = trainer.evaluate(X_test, y_test)


2024-09-24 20:51:16,818 - INFO - Making predictions.


2024-09-24 20:51:51,794 - INFO - Mean Squared Error (MSE): 930022.3322820455
2024-09-24 20:51:51,811 - INFO - Mean Absolute Error (MAE): 587.973852491915
2024-09-24 20:51:51,826 - INFO - Root Mean Squared Error (RMSE): 964.3766547786428
2024-09-24 20:51:51,828 - INFO - R-squared (R2): 0.9371130468314715
2024-09-24 20:51:51,828 - INFO - Mean Squared Logarithmic Error (MSLE): 0.022263582949272998


Mean Squared Error (MSE): 930022.3322820455
Mean Absolute Error (MAE): 587.973852491915
Root Mean Squared Error (RMSE): 964.3766547786428
R-squared (R2): 0.9371130468314715
Mean Squared Logarithmic Error (MSLE): 0.022263582949272998


(np.float64(930022.3322820455),
 np.float64(587.973852491915),
 np.float64(964.3766547786428),
 0.9371130468314715,
 np.float64(0.022263582949272998))

In [10]:
# Save the model
# Make sure the target directory exists before writing: training took several
# minutes, and a fresh checkout may not contain ../models yet.
# NOTE(review): this is harmless if save_model already creates the directory
# itself — confirm in scripts/model_training.py.
model_path = '../models/random_forest_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Writing the model took about 40 seconds in the recorded run.
trainer.save_model(model_path)

2024-09-24 20:57:59,521 - INFO - Saving the model to ../models/random_forest_model.pkl
2024-09-24 20:58:41,551 - INFO - Model saved successfully.
