In [1]:
import pandas as pd
import numpy as np
import os

print("Libraries imported.")

# Data Acquisition
# Both raw CSVs are addressed relative to the notebook's working directory.
TRAIN_PATH = '../data/raw/train.csv'
STORE_PATH = '../data/raw/store.csv'

try:
    # low_memory=False asks pandas to parse the file without internal chunking,
    # which avoids mixed-dtype inference within a column (see pandas read_csv docs).
    train_df = pd.read_csv(TRAIN_PATH, low_memory=False)
    store_df = pd.read_csv(STORE_PATH)
except FileNotFoundError:
    print("ERROR: Make sure train.csv and store.csv are in the 'data/raw/' directory.")
    # Re-raise so execution stops here. Swallowing the error would leave
    # train_df / store_df undefined and make the next cell fail with an
    # unrelated NameError instead of pointing at the missing file.
    raise
else:
    # Only report success (and shapes) when both files were actually read.
    print("Raw data loaded successfully.")
    print(f"Train data shape: {train_df.shape}")
    print(f"Store data shape: {store_df.shape}")



Libraries imported.
Raw data loaded successfully.
Train data shape: (1017209, 9)
Store data shape: (1115, 10)


In [2]:
# Merging and Initial Cleaning
print("\nMerging train and store dataframes...")
# Left join: every row of train_df is kept and store_df columns are attached via 'Store'.
df = train_df.merge(store_df, how='left', on='Store')

# Parse the Date column into datetime values so the .dt accessors can be used on it.
df['Date'] = pd.to_datetime(df['Date'])

# Keep only rows where the store was open AND sales were positive;
# closed days and zero-sale days are treated as not predictable.
was_open = df['Open'] == 1
had_sales = df['Sales'] > 0
df = df.loc[was_open & had_sales].copy()
print(f"Data shape after filtering closed/zero-sale days: {df.shape}")




Merging train and store dataframes...
Data shape after filtering closed/zero-sale days: (844338, 18)


In [3]:
# Handling Missing Values
print("\nHandling missing values...")

# CompetitionDistance: impute gaps with the column's median.
competition_distance_median = df['CompetitionDistance'].median()
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(competition_distance_median)

# Competition 'open since' / Promo2 'since' columns: a NaN is taken to mean
# "no competition / no promo", so those gaps are filled with 0.
zero_fill_cols = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
]
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

# PromoInterval is textual; the literal string 'None' marks a missing interval.
df['PromoInterval'] = df['PromoInterval'].fillna('None')

print("Missing values handled.")
# Show the columns with the most remaining nulls (top 5 after sorting descending).
null_counts = df.isnull().sum().sort_values(ascending=False)
print(f"Remaining nulls:\n{null_counts.head()}")




Handling missing values...
Missing values handled.
Remaining nulls:
Store        0
DayOfWeek    0
Date         0
Sales        0
Customers    0
dtype: int64


In [4]:
# Feature Engineering
print("\nEngineering new features from existing data...")

# Time-based features derived from the parsed Date column
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
# isocalendar().week is cast to a plain int column.
df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)

# Promotion-based feature ('IsPromoMonth')
# 1 when the store participates in Promo2 (Promo2 == 1) AND the abbreviation of the
# sale month occurs in the store's PromoInterval string; otherwise 0.
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
             7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

# Month abbreviations are kept in a standalone Series rather than a temporary
# DataFrame column, so nothing has to be dropped from df afterwards.
month_abbr = df['Month'].map(month_map)

# A single pass over three aligned columns replaces the row-wise
# df.apply(..., axis=1), which builds a Series object per row and is very slow
# on a frame of this size. The per-row logic is unchanged.
# NOTE: `in` is a substring test on the interval string, so 'Sep' also matches an
# interval token spelled 'Sept'. Keep this in mind before switching to exact
# token matching.
df['IsPromoMonth'] = [
    1 if promo2 == 1 and abbr in interval else 0
    for promo2, abbr, interval in zip(df['Promo2'], month_abbr, df['PromoInterval'])
]

print("Feature engineering complete.")


Engineering new features from existing data...
Feature engineering complete.


In [5]:
# Saving Processed Data
PROCESSED_DATA_PATH = '../data/processed/df_featured.csv'

# Make sure the destination folder exists; exist_ok=True makes this a no-op
# when the directory is already there.
processed_dir = os.path.dirname(PROCESSED_DATA_PATH)
os.makedirs(processed_dir, exist_ok=True)

print(f"\nSaving cleaned and featured data to: {PROCESSED_DATA_PATH}")
# index=False: the DataFrame index is not written to the CSV.
df.to_csv(PROCESSED_DATA_PATH, index=False)

# Closing banner followed by a summary of the final frame.
print("\n--- Notebook 1 Complete ---", "Final DataFrame Info:", sep="\n")
df.info()


Saving cleaned and featured data to: ../data/processed/df_featured.csv

--- Notebook 1 Complete ---
Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 844338 entries, 0 to 1017190
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Store                      844338 non-null  int64         
 1   DayOfWeek                  844338 non-null  int64         
 2   Date                       844338 non-null  datetime64[ns]
 3   Sales                      844338 non-null  int64         
 4   Customers                  844338 non-null  int64         
 5   Open                       844338 non-null  int64         
 6   Promo                      844338 non-null  int64         
 7   StateHoliday               844338 non-null  object        
 8   SchoolHoliday              844338 non-null  int64         
 9   StoreType                  844338 non-null  object        
 1