In [30]:
import pandas as pd
from pathlib import Path

# Always show all columns when inspecting
pd.set_option("display.max_columns", None)

# Raw data folder path
PROJECT_ROOT = Path("C:/Users/User/Documents/Projects/dbt_rossmann_anayltics").resolve()
DATA_PATH = PROJECT_ROOT / "data_raw"

files = {
    "store": "store.csv",
    "train": "train.csv"
}

dfs = {}

for name, fname in files.items():
    path = DATA_PATH / fname      
    df = pd.read_csv(path)
    dfs[name] = df
    print(f"{name}: {df.shape[0]:,} rows × {df.shape[1]} columns")


store: 1,115 rows × 10 columns
train: 1,017,209 rows × 9 columns


  df = pd.read_csv(path)


In [31]:
for name, df in dfs.items():
    print("=" * 50)
    print(name.upper())
    print(df.info())

STORE
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
None
TRAIN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null 

In [32]:
import pandas as pd

def quick_edas(df: pd.DataFrame,name: str,max_unique_for_preview: int = 20) -> None:
    print(f"\n================ {name} ================")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} cols\n")

    # Dtypes
    print("Dtypes")
    print(df.dtypes)

    # Missing values
    print("\nMissing values:")
    print(df.isna().sum())

    # Sample rows
    print("\nSample rows:")
    display(df.head())

    # Low-cardinality columns (potential categoricals)
    print("\nColumns with few unique values (potential categories):")
    low_card_cols = []
    for col in df.columns:
        nunique = df[col].nunique(dropna=False)
        if nunique <= max_unique_for_preview:
            low_card_cols.append(col)
            print(f"  {col:25s} → {nunique} unique")

    # Categorical value preview
    if low_card_cols:
        print("\nCategorical value preview:")
        for col in low_card_cols:
            print(f"\n--- {col} ---")
            print(df[col].value_counts(dropna=False))

    # Generic key uniqueness hints
    print("\nKey uniqueness check (single-column candidates):")
    for col in df.columns:
        if df[col].is_unique:
            print(f"  ✔ {col} is unique (candidate primary key)")


In [33]:
quick_edas(dfs["store"], "store")
quick_edas(dfs["train"], "train")


Shape: 1115 rows × 10 cols

Dtypes
Store                          int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

Missing values:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

Sample rows:


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,



Columns with few unique values (potential categories):
  StoreType                 → 4 unique
  Assortment                → 3 unique
  CompetitionOpenSinceMonth → 13 unique
  Promo2                    → 2 unique
  Promo2SinceYear           → 8 unique
  PromoInterval             → 4 unique

Categorical value preview:

--- StoreType ---
StoreType
a    602
d    348
c    148
b     17
Name: count, dtype: int64

--- Assortment ---
Assortment
a    593
c    513
b      9
Name: count, dtype: int64

--- CompetitionOpenSinceMonth ---
CompetitionOpenSinceMonth
NaN     354
9.0     125
4.0      94
11.0     92
3.0      70
7.0      67
12.0     64
10.0     61
6.0      50
5.0      44
2.0      41
8.0      39
1.0      14
Name: count, dtype: int64

--- Promo2 ---
Promo2
1    571
0    544
Name: count, dtype: int64

--- Promo2SinceYear ---
Promo2SinceYear
NaN       544
2011.0    128
2013.0    120
2014.0     95
2012.0     81
2009.0     73
2010.0     64
2015.0     10
Name: count, dtype: int64

--- PromoInterva

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1



Columns with few unique values (potential categories):
  DayOfWeek                 → 7 unique
  Open                      → 2 unique
  Promo                     → 2 unique
  StateHoliday              → 5 unique
  SchoolHoliday             → 2 unique

Categorical value preview:

--- DayOfWeek ---
DayOfWeek
5    145845
4    145845
3    145665
2    145664
1    144730
7    144730
6    144730
Name: count, dtype: int64

--- Open ---
Open
1    844392
0    172817
Name: count, dtype: int64

--- Promo ---
Promo
0    629129
1    388080
Name: count, dtype: int64

--- StateHoliday ---
StateHoliday
0    855087
0    131072
a     20260
b      6690
c      4100
Name: count, dtype: int64

--- SchoolHoliday ---
SchoolHoliday
0    835488
1    181721
Name: count, dtype: int64

Key uniqueness check (single-column candidates):


In [34]:
# inspect StateHoliday
df = dfs["train"]

df["StateHoliday"].map(lambda x: (x, type(x))).value_counts()


StateHoliday
(0, <class 'str'>)    855087
(0, <class 'int'>)    131072
(a, <class 'str'>)     20260
(b, <class 'str'>)      6690
(c, <class 'str'>)      4100
Name: count, dtype: int64