In [2]:
# Step 1: Load & inspect
import pandas as pd
from pathlib import Path

# 1) Location of the input CSV. A relative path is resolved against the
#    current working directory of the running process, so adjust it if the
#    file lives elsewhere.
csv_path = Path("synthetic_visits.csv")   # <-- change path if needed

# 2) Read the file. A missing file is re-raised as the same exception type,
#    but with the absolute path that was actually tried, so a wrong working
#    directory is obvious from the message.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Could not find '{csv_path}' (looked for {csv_path.resolve()}). "
        "Update csv_path or change the working directory."
    ) from err

# 3) normalize column names (helpful if there are spaces/capitals/typos):
#    trim surrounding whitespace, lower-case, and turn spaces into underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# 4) fix a few common typos / variants
#    rename_map collects {misspelled_column: canonical_column}. A canonical
#    name is only used once: it is skipped if the frame already has that
#    column OR if an earlier typo in this loop has already claimed it.
#    Without the second check, a file containing both 'qunatity' and
#    'quanity' would end up with two columns both called 'quantity'.
rename_map = {}
for typo, correct in [
    ('qunatity', 'quantity'),
    ('quanity', 'quantity'),
    ('priceperunit', 'price_per_unit'),
    ('price_perunit', 'price_per_unit'),
    ('totalamount', 'total_amount'),
]:
    target_taken = correct in df.columns or correct in rename_map.values()
    if typo in df.columns and not target_taken:
        rename_map[typo] = correct
if rename_map:
    df.rename(columns=rename_map, inplace=True)

# 5) quick overview: column names, shape, and the first rows
#    (display is not imported in this cell; presumably the notebook's
#    rich-display helper)
print("Columns:", df.columns.tolist())
print("\nRows, cols:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

print("\nData types and non-null counts:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# 6) parse the date column when the file has one
#    Values that cannot be parsed become NaT (errors='coerce'), so the NaT
#    count printed below covers both originally-missing and unparseable dates.
if 'date' in df.columns:
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = parsed_dates
    n_unparsed = df['date'].isna().sum()
    print("\nDate parse failures (na after to_datetime):", n_unparsed)
    # Only report a range when at least one value parsed successfully.
    if not df['date'].isna().all():
        earliest = df['date'].min()
        latest = df['date'].max()
        print("Date range:", earliest, "to", latest)

# 7) summary statistics for whichever of the expected numeric columns exist
expected_numeric = ('quantity', 'price_per_unit', 'total_amount', 'age')
numeric_cols = [name for name in expected_numeric if name in df.columns]
if len(numeric_cols) > 0:
    print("\nNumeric column summary:")
    numeric_summary = df[numeric_cols].describe()
    display(numeric_summary)

# 8) count of missing values in each column, largest first
print("\nMissing values by column:")
missing_per_column = df.isnull().sum()
print(missing_per_column.sort_values(ascending=False))

# 9) duplicates: rows repeated in full, and (when the column exists)
#    transaction ids that occur more than once
n_duplicate_rows = df.duplicated().sum()
print("\nFull-row duplicates:", n_duplicate_rows)
if 'transaction_id' in df.columns:
    repeated_ids = df['transaction_id'].duplicated()
    print("Duplicate transaction_id count:", repeated_ids.sum())

# 10) customer-level counts (only when a customer_id column is present)
if 'customer_id' in df.columns:
    customer_ids = df['customer_id']
    print("\nUnique customers:", customer_ids.nunique())
    print("Total transactions:", len(df))
    # rows per customer, busiest customers first; show the top 5
    rows_per_customer = df.groupby('customer_id').size()
    busiest_first = rows_per_customer.sort_values(ascending=False)
    display(busiest_first.head())

# 11) check basic consistency: quantity * price_per_unit ≈ total_amount
if all(c in df.columns for c in ['quantity', 'price_per_unit', 'total_amount']):
    # Coerce to numbers first (same errors='coerce' policy as the date step).
    # If a column was read as object dtype because of a stray string, plain
    # arithmetic would either repeat the string (int * str) or raise
    # TypeError; coerced values that are not numeric become NaN and are left
    # out of the comparison below.
    qty_num = pd.to_numeric(df['quantity'], errors='coerce')
    price_num = pd.to_numeric(df['price_per_unit'], errors='coerce')
    total_num = pd.to_numeric(df['total_amount'], errors='coerce')

    # Helper column kept on df so the mismatching rows shown below include it.
    df['_computed_total'] = qty_num * price_num

    # A row is a mismatch only when both totals are present and they differ
    # by more than a small absolute floating-point tolerance.
    mismatch_mask = (
        total_num.notna()
        & df['_computed_total'].notna()
        & ((total_num - df['_computed_total']).abs() > 1e-6)
    )
    n_mismatches = mismatch_mask.sum()
    print("\nRows where total_amount != quantity * price_per_unit (count):", n_mismatches)
    if n_mismatches > 0:
        display(df.loc[mismatch_mask].head())

# 12) check negative / zero values that look invalid
for col in ['quantity', 'price_per_unit', 'total_amount', 'age']:
    if col not in df.columns:
        continue
    # Compare on a numeric view of the column: an object-dtype column (e.g.
    # one containing a stray string) would make `<= 0` raise TypeError.
    # Entries that are not numeric become NaN and are simply not flagged here.
    values = pd.to_numeric(df[col], errors='coerce')
    bad = df[values.notna() & (values <= 0)]
    if not bad.empty:
        print(f"\nWarning: {col} has non-positive values (showing up to 5 rows):")
        display(bad.head())

# 13) snapshot the frame so later steps can be compared against / restored from it
#     NOTE: the snapshot is taken at this point in the cell, i.e. after the
#     column renaming, date parsing and the _computed_total helper column above.
df_original = df.copy(deep=True)
print(
    "\nFinished Step 1 checks. If everything looks good we proceed to cleaning + RFM creation."
)


FileNotFoundError: [Errno 2] No such file or directory: 'synthetic_visits.csv'