# Open Food Facts — FOOD CSV quick scan

This notebook reads the **Open Food Facts food export** (tab-separated, `*.csv` or `*.csv.gz`),
computes missing-value stats, and shows a preview of the first 10 rows.

## Objectives
 The objectives of this file is to do a quick scan and cleaning of the dataset, inside a panda dataframe. After that, the goal will be to make the dataset ML ready.

In [7]:
# ---- Config ----
from pathlib import Path

# Point to your local OFF food CSV (TAB-separated). Supports .csv or .csv.gz
OFF_PATH = Path("../data/openfoodfacts/en.openfoodfacts.org.products.csv")

# Optional: only load a subset of columns (speeds up). Example:
USECOLS = []  # e.g.: ["code", "product_name", "brands", "countries", "nutriscore_grade"]

# Optional: limit rows for a quick scan. 0 = read all rows
LIMIT = 0

# How many top missing columns to display
TOPK = 10


In [8]:
# ---- Imports & helpers ----
import gzip
import pandas as pd
import numpy as np

EXTRA_MISS = {"", " ", "[]", "{}", "unknown", "UNKNOWN", "na", "NA", "null", "NULL"}

def open_csv(path: Path):
    if str(path).endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="replace")
    return open(path, "r", encoding="utf-8", errors="replace")


In [9]:
# ---- Load & compute missing stats ----
if not OFF_PATH.exists():
    raise FileNotFoundError(f"Input file not found: {OFF_PATH.resolve()}")

usecols = [c.strip() for c in USECOLS if str(c).strip()] or None
nrows = LIMIT if (isinstance(LIMIT, int) and LIMIT > 0) else None

with open_csv(OFF_PATH) as f:
    df = pd.read_csv(
        f,
        sep="\t",
        dtype="string",
        usecols=usecols,
        nrows=nrows,
        na_values=list(EXTRA_MISS),
        keep_default_na=True,
        low_memory=False,
    )

# Normalize blanks to NA (after stripping)
for c in df.columns:
    s = df[c]
    if pd.api.types.is_string_dtype(s):
        df[c] = s.str.strip().replace("", pd.NA)

total = len(df)
miss_counts = df.isna().sum()
miss_pct = (miss_counts / total * 100.0).round(1)

order = (
    pd.DataFrame({'column': df.columns, 'missing_count': miss_counts, 'missing_pct': miss_pct})
    .sort_values(['missing_pct', 'missing_count', 'column'], ascending=[False, False, True])
    .reset_index(drop=True)
)

print(f"rows: {total}")
print(f"columns: {df.shape[1]}")

print(f"\nTop {TOPK} columns by missing %:")
for _, r in order.head(int(TOPK)).iterrows():
    print(f"- {r['column']}: {int(r['missing_count'])} missing ({r['missing_pct']:.1f}%)")


ParserError: Error tokenizing data. C error: Expected 214 fields in line 1824798, saw 246


## Preview — first 10 rows

In [None]:
# This cell shows the first 10 rows as a pandas DataFrame
df.head(10)

NameError: name 'df' is not defined

### Print columns to see wich one i can ditch

In [None]:
# --- List all column names in the dataset ---
print("Total columns:", len(df.columns))
for c in df.columns:
    print(c)


NameError: name 'df' is not defined