In [69]:
# Instacart Project – Phase 1: Raw Data Exploration

import pandas as pd
import os

# Always show all columns when inspecting
pd.set_option("display.max_columns", None)

# Raw data folder path.
# The INSTACART_DATA_PATH environment variable, when set, overrides the
# original author's local folder so the notebook can run on other machines.
DATA_PATH = os.environ.get(
    "INSTACART_DATA_PATH",
    r'C:\Users\User\Documents\Projects\instacart_project\data_raw',
)

# Show what is in the folder. A bare expression in the middle of a cell is
# evaluated and thrown away by Jupyter, so the listing has to be printed.
print(sorted(os.listdir(DATA_PATH)))

# Logical table name -> CSV file name inside DATA_PATH.
files = {
    "aisles": "aisles.csv",
    "departments": "departments.csv",
    "orders": "orders.csv",
    "order_products_prior": "order_products__prior.csv",
    "order_products_train": "order_products__train.csv",
    "products": "products.csv",
}

# Loaded tables, keyed by the logical names above.
dfs = {}

for name, fname in files.items():
    path = os.path.join(DATA_PATH, fname)
    df = pd.read_csv(path)
    dfs[name] = df
    # One summary line per table: row count with thousands separators, column count.
    print(f"{name}: {df.shape[0]:,} rows × {df.shape[1]} columns")



aisles: 134 rows × 2 columns
departments: 21 rows × 2 columns
orders: 3,421,083 rows × 7 columns
order_products_prior: 32,434,489 rows × 4 columns
order_products_train: 1,384,617 rows × 4 columns
products: 49,688 rows × 4 columns


In [70]:
# Print a structural summary (index, columns, dtypes, memory) of every loaded table.
for name, df in dfs.items():
    print("=" * 50)
    print(name.upper())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line per table.
    df.info()


AISLES
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   aisle_id  134 non-null    int64 
 1   aisle     134 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.2+ KB
None
DEPARTMENTS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   department_id  21 non-null     int64 
 1   department     21 non-null     object
dtypes: int64(1), object(1)
memory usage: 468.0+ bytes
None
ORDERS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int

In [83]:
def quick_eda(df: pd.DataFrame, name: str, max_unique_for_preview: int = 20) -> None:
    """Print a compact overview of one table.

    Emits, in order: a banner containing ``name``, the shape, the per-column
    dtypes, the per-column missing-value counts, the first rows (through
    ``display``), and the columns whose distinct-value count does not exceed
    ``max_unique_for_preview``.

    Parameters
    ----------
    df : pd.DataFrame
        Table to summarise.
    name : str
        Label shown in the banner line.
    max_unique_for_preview : int, default 20
        A column is listed as a potential category when it has at most this
        many distinct values. NaN is counted as a value (``dropna=False``).

    Returns
    -------
    None
        Everything is written to stdout / the notebook display area.
    """
    print(f"\n================ {name} ================")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} cols\n")
    print("Dtypes:")
    print(df.dtypes)
    print("\nMissing values:")
    print(df.isna().sum())

    print("\nSample rows:")
    # `display` is not imported here; it is expected to be available as a
    # global, as it is inside a Jupyter/IPython kernel.
    display(df.head())

    print("\nColumns with few unique values (potential categories):")
    for col in df.columns:
        nunique = df[col].nunique(dropna=False)
        if nunique <= max_unique_for_preview:
            # Column labels are not always strings (e.g. default integer
            # labels); the "s" format spec raises ValueError on non-str
            # objects, so convert the label before padding it.
            print(f"  {str(col):25s} → {nunique} unique")

# Profile each loaded table in a fixed order, using the table's key in `dfs`
# as its display label.
for table_name in (
    "orders",
    "products",
    "aisles",
    "departments",
    "order_products_prior",
    "order_products_train",
):
    quick_eda(dfs[table_name], table_name)


Shape: 3421083 rows × 7 cols

Dtypes:
order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

Missing values:
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

Sample rows:


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0



Columns with few unique values (potential categories):
  eval_set                  → 3 unique
  order_dow                 → 7 unique

Shape: 49688 rows × 4 cols

Dtypes:
product_id        int64
product_name     object
aisle_id          int64
department_id     int64
dtype: object

Missing values:
product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64

Sample rows:


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13



Columns with few unique values (potential categories):

Shape: 134 rows × 2 cols

Dtypes:
aisle_id     int64
aisle       object
dtype: object

Missing values:
aisle_id    0
aisle       0
dtype: int64

Sample rows:


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation



Columns with few unique values (potential categories):

Shape: 21 rows × 2 cols

Dtypes:
department_id     int64
department       object
dtype: object

Missing values:
department_id    0
department       0
dtype: int64

Sample rows:


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol



Columns with few unique values (potential categories):

Shape: 32434489 rows × 4 cols

Dtypes:
order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

Missing values:
order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

Sample rows:


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0



Columns with few unique values (potential categories):
  reordered                 → 2 unique

Shape: 1384617 rows × 4 cols

Dtypes:
order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

Missing values:
order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

Sample rows:


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1



Columns with few unique values (potential categories):
  reordered                 → 2 unique


In [84]:
# Bind the orders table to its own name: this cell and the later ones refer to
# `orders` directly, but no visible cell assigns it, so a fresh kernel would
# raise NameError here.
orders = dfs["orders"]
orders.head(60)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [None]:
# Primary-key sanity check: each lookup table's id column should hold no duplicates.
products = dfs["products"]
aisles = dfs["aisles"]
departments = dfs["departments"]

# (printed label, table, id column) for every lookup table to verify.
key_checks = [
    ("products unique", products, "product_id"),
    ("Aisles unique:", aisles, "aisle_id"),
    ("Departments unique:", departments, "department_id"),
]
for label, table, key_col in key_checks:
    print(label, table[key_col].is_unique)


A closer look at the orders and order-products tables

In [None]:
# Row count for each value of the `reordered` column in the prior order lines,
# sorted by the value itself rather than by frequency.
order_products_prior = dfs["order_products_prior"]
order_products_prior["reordered"].value_counts().sort_index()



In [None]:
# Number of orders for each order_dow value, most frequent first.
orders["order_dow"].value_counts()




In [None]:
# Number of orders for each order_dow value, sorted by the order_dow value itself.
orders["order_dow"].value_counts().sort_index()




In [None]:
# Number of orders for each order_hour_of_day value, sorted by hour.
# head(24) caps the listing at 24 rows (presumably one per hour 0-23 - confirm).
orders["order_hour_of_day"].value_counts().sort_index().head(24)

In [None]:
# Distinct eval_set labels, in order of first appearance.
orders["eval_set"].unique()


In [None]:
# Number of orders carrying each eval_set label, most frequent first.
orders["eval_set"].value_counts()

In [None]:
# First five orders whose eval_set label is "train".
orders.loc[orders["eval_set"].eq("train")].head()


In [None]:
# First five orders whose eval_set label is "test".
orders.loc[orders["eval_set"].eq("test")].head()


In [76]:
# Distinct days_since_prior_order values (NaN included), in order of first appearance.
orders["days_since_prior_order"].unique()


array([nan, 15., 21., 29., 28., 19., 20., 14.,  0., 30., 10.,  3.,  8.,
       13., 27.,  6.,  9., 12.,  7., 17., 11., 22.,  4.,  5.,  2., 23.,
       26., 25., 16.,  1., 18., 24.])

In [None]:
# How many orders have no days_since_prior_order value.
orders["days_since_prior_order"].isna().sum()

np.int64(206209)

In [86]:
# Number of product lines per order_id in the train order-products table,
# largest orders first.
order_products_train = dfs["order_products_train"]
order_products_train["order_id"].value_counts()

order_id
2813632    80
1395075    80
949182     77
2869702    76
341238     76
           ..
1231029     1
1230813     1
2473878     1
3420798     1
1932460     1
Name: count, Length: 131209, dtype: int64

In [85]:
# Number of product lines per order_id in the prior order-products table,
# largest orders first.
order_products_prior = dfs["order_products_prior"]
order_products_prior["order_id"].value_counts()

order_id
1564244    145
790903     137
61355      127
2970392    121
2069920    116
          ... 
1516802      1
1516762      1
2834932      1
2834976      1
2834979      1
Name: count, Length: 3214874, dtype: int64

In [82]:
# Frequency of each days_since_prior_order value, most frequent first.
# value_counts() drops NaN by default, so orders without a value are not listed.
orders["days_since_prior_order"].value_counts()

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

Are the order_id values the same across orders, order_products_prior, and order_products_train?

In [None]:
# Reference set: every order_id that has a row in the orders table.
orders_ids = set(orders['order_id'])

# order_ids referenced by prior order lines but absent from `orders`.
prior_ids = set(order_products_prior['order_id'])
missing_in_orders = prior_ids.difference(orders_ids)
print(len(missing_in_orders))

# The same check for the train order lines.
train_ids = set(order_products_train['order_id'])
missing_in_orders_train = train_ids.difference(orders_ids)
print(len(missing_in_orders_train))


0


In [91]:
# Cross-check with a join: count prior order lines whose order_id has no
# matching row in `orders`.
# In a left merge the key column is always taken from the left frame, so
# `order_id` can never be NaN in the result and testing it for NaN would
# report 0 even if orders were missing. `indicator=True` adds a `_merge`
# column that marks unmatched left rows as "left_only".
test_merge = order_products_prior.merge(
    orders[['order_id']], on='order_id', how='left', indicator=True
)
(test_merge['_merge'] == 'left_only').sum()


np.int64(0)