In [1]:
# Import necessary libraries and modules
import os
import sys
import pandas as pd

# Put the parent of the current working directory at the front of the module
# search path so the `scripts.*` imports below can be resolved. This must run
# before those imports.
# NOTE(review): assumes the kernel's working directory sits one level below
# the directory that contains `scripts` — confirm when running from elsewhere.
sys.path.insert(0, os.path.dirname(os.getcwd()))
from scripts.logger import Logger
from scripts.data_cleaning import DataCleaner
from scripts.data_processing import DataProcessor

# Suppress FutureWarnings
import warnings
# Installs an 'ignore' filter for the FutureWarning category; no module is
# given, so the filter is not limited to any particular library.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Initialize logger
# `logger` is shared with DataCleaner and DataProcessor in later cells.
# NOTE(review): the argument is presumably the log file name — confirm in
# scripts/logger.py.
logger = Logger('rossmann_analysis.log')

In [3]:
# Load the raw train, test and store CSV files from ../data.
logger.log('Loading data...')

# All three files are parsed with the same options; the generator is consumed
# in order, so the files are read as train, test, store.
train_data, test_data, store_data = (
    pd.read_csv(csv_path, low_memory=False, index_col=False)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/store.csv',
    )
)

logger.log('Data loaded successfully.')

2024-09-24 17:56:58,599 - INFO - Loading data...
2024-09-24 17:56:59,093 - INFO - Data loaded successfully.


In [4]:
# Initialize DataCleaner
# The cleaner receives the shared logger created in an earlier cell.
cleaner = DataCleaner(logger)

# Clean the data
# Passes the raw train/test frames plus the store table and rebinds
# `train_data` / `test_data` to the two returned frames.
# NOTE(review): because the inputs are overwritten, re-running this cell
# without reloading the CSVs feeds already-processed frames back into
# fill_missing_values — confirm whether that method tolerates this.
train_data, test_data = cleaner.fill_missing_values(train_data, test_data, store_data)


2024-09-24 17:56:59,110 - INFO - Merging store data with train and test datasets.
2024-09-24 17:56:59,226 - INFO - Filling missing values in CompetitionDistance.
2024-09-24 17:56:59,243 - INFO - Filling missing values in CompetitionOpenSinceMonth.
2024-09-24 17:56:59,243 - INFO - Filling missing values in CompetitionOpenSinceYear.
2024-09-24 17:56:59,259 - INFO - Filling missing values in Promo2SinceWeek.
2024-09-24 17:56:59,259 - INFO - Filling missing values in Promo2SinceYear.
2024-09-24 17:56:59,277 - INFO - Filling missing values in PromoInterval.
2024-09-24 17:56:59,310 - INFO - Data cleaning complete.


In [5]:
# Initialize DataProcessor for feature engineering and preprocessing
# Built from the cleaned train/test frames produced by the cleaning cell and
# the shared logger.
processor = DataProcessor(train_data, test_data, logger)

In [6]:
# Run the preprocessing pipeline and unpack the four results into feature and
# target sets for training and testing. The recorded log output shows feature
# engineering, encoding of StoreType / Assortment / StateHoliday /
# PromoInterval, and scaling of numerical features.
# NOTE(review): whether X_test / y_test come from a hold-out split of the
# training data or from `test_data` is not visible here — confirm in
# DataProcessor.preprocess.
# NOTE: in the recorded run the feature-engineering step took roughly three
# minutes (see the log timestamps below).
X_train, X_test, y_train, y_test = processor.preprocess()

2024-09-24 17:56:59,350 - INFO - Starting feature engineering for training data.
2024-09-24 18:00:11,401 - INFO - Encoding StoreType.
2024-09-24 18:00:11,521 - INFO - Encoding Assortment.
2024-09-24 18:00:11,637 - INFO - Encoding StateHoliday.
2024-09-24 18:00:11,739 - INFO - Encoding PromoInterval.
2024-09-24 18:00:11,855 - INFO - Encoding complete.
2024-09-24 18:00:12,095 - INFO - Scaling the numerical features.
