In [1]:
import pandas as pd
from pathlib import Path

# Always show all columns when inspecting
pd.set_option("display.max_columns", None)

# Raw data folder path.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this folder exists. Left unchanged here; consider making it configurable.
PROJECT_ROOT = Path("C:/Users/User/Documents/Projects/dbt_rossmann_anayltics").resolve()
DATA_PATH = PROJECT_ROOT / "data_raw"

# Logical dataset name -> CSV file name inside DATA_PATH
files = {
    "store": "store.csv",
    "train": "train.csv"
}

# Loaded frames, keyed by the logical dataset name from `files`
dfs = {}

for name, fname in files.items():
    path = DATA_PATH / fname
    # low_memory=False makes pandas infer each column's dtype from the whole
    # column instead of chunk by chunk; chunked inference is what produces
    # mixed-type columns (and the accompanying DtypeWarning) on large files.
    df = pd.read_csv(path, low_memory=False)
    # Strip stray whitespace from the headers at load time, so every later
    # column lookup already sees the clean names.
    df.columns = df.columns.str.strip()
    dfs[name] = df

  df = pd.read_csv(path)


In [2]:
# Working copies of the loaded frames: train.csv becomes `sales` and
# store.csv becomes `store`. Copies keep the originals in `dfs` untouched.
sales, store = (dfs[key].copy() for key in ("train", "store"))


In [3]:
# 1) Parse Date
# Values that do not match YYYY-MM-DD are coerced to NaT instead of raising.
sales["Date"] = pd.to_datetime(
    sales["Date"],
    errors="coerce",
    format="%Y-%m-%d",
)

# Every NaT left in the column counts as an unparseable row.
bad_dates = sales["Date"].isnull().sum()

print(f"[CHECK] Unparseable Date rows: {bad_dates:,}")
print("Date dtype:", sales["Date"].dtype)

[CHECK] Unparseable Date rows: 0
Date dtype: datetime64[ns]


In [4]:

# 2) Normalize StateHoliday (Rossmann gotcha: '0' vs 0)
# Labels every value is expected to carry once the raw codes are replaced.
allowed = {"none", "public", "easter", "christmas"}

# Both the string '0' and the integer 0 mean "no holiday"; a/b/c are the
# holiday codes. replace() leaves any other value as it is, so anything
# unforeseen survives and is caught by the check below.
sales["StateHoliday"] = sales["StateHoliday"].replace(
    {
        "0": "none",
        0: "none",
        "a": "public",
        "b": "easter",
        "c": "christmas",
    }
)

# Distinct non-null values that are not one of the expected labels.
unexpected = set(sales["StateHoliday"].dropna().unique()).difference(allowed)
print(f"[CHECK] Unexpected StateHoliday values: {unexpected if unexpected else 'None ✅'}")


[CHECK] Unexpected StateHoliday values: None ✅


In [5]:
# 3) Cross-check the recorded DayOfWeek against Date, then drop the column.
# derive day-of-week from Date (Mon=1..Sun=7); NaN wherever Date is NaT
dow_from_date = sales["Date"].dt.dayofweek + 1

# Guarded so the cell can be re-run: after the first run the column is gone
# and an unguarded sales["DayOfWeek"] lookup would raise KeyError.
if "DayOfWeek" in sales.columns:
    # Compare numerically instead of casting both sides with astype(int): an
    # integer cast is evaluated over the whole column and raises as soon as
    # either side holds a missing value, which defeats the notna() guards
    # below. Non-numeric DayOfWeek values become NaN here and are therefore
    # reported as mismatches.
    dow_recorded = pd.to_numeric(sales["DayOfWeek"], errors="coerce")

    mismatch = (
        sales["Date"].notna()
        & sales["DayOfWeek"].notna()
        & (dow_recorded != dow_from_date)
    )

    print("DayOfWeek mismatches:", mismatch.sum())

    # preview mismatches if any
    display(
        sales.loc[mismatch, ["Store", "Date", "DayOfWeek"]]
        .assign(dow_from_date=dow_from_date[mismatch])
        .head(10)
    )

    # The weekday can always be re-derived from Date, so the column is dropped.
    sales = sales.drop(columns=["DayOfWeek"])
else:
    print("DayOfWeek already dropped; nothing to check.")


DayOfWeek mismatches: 0


Unnamed: 0,Store,Date,DayOfWeek,dow_from_date


In [6]:

# 4) Optional: Open vs Sales sanity (cheap + meaningful)
# A store flagged as closed (Open == 0) should not report positive sales.
is_closed = sales["Open"].eq(0)
has_sales = sales["Sales"].gt(0)
closed_but_sales = is_closed & has_sales

print(f"[CHECK] Open=0 but Sales>0 rows: {int(closed_but_sales.sum()):,}")

print("[INFO] Done. sales/store loaded + minimally normalized.")

[CHECK] Open=0 but Sales>0 rows: 0
[INFO] Done. sales/store loaded + minimally normalized.


In [7]:
# Preview the first rows. Left as the cell's last expression (not wrapped in
# print) so Jupyter renders the frame with its rich table display.
sales.head()

   Store       Date  Sales  Customers  Open  Promo StateHoliday  SchoolHoliday
0      1 2015-07-31   5263        555     1      1         none              1
1      2 2015-07-31   6064        625     1      1         none              1
2      3 2015-07-31   8314        821     1      1         none              1
3      4 2015-07-31  13995       1498     1      1         none              1
4      5 2015-07-31   4822        559     1      1         none              1


In [8]:
# Print each column of `sales` together with its dtype.
print(sales.dtypes)

Store                     int64
Date             datetime64[ns]
Sales                     int64
Customers                 int64
Open                      int64
Promo                     int64
StateHoliday             object
SchoolHoliday             int64
dtype: object


In [9]:
# Remove leading/trailing whitespace from the column names of both frames.
# Only the header labels change; the cell values are not touched.
for frame in (sales, store):
    frame.columns = frame.columns.str.strip()

# Raw categoricals stay as loaded (StoreType, Assortment, PromoInterval)

