In [2]:
import pandas as pd

files = {
    "campaigns": "dataset_fashion_store_campaigns.csv",
    "channels": "dataset_fashion_store_channels.csv",
    "customers": "dataset_fashion_store_customers.csv",
    "products": "dataset_fashion_store_products.csv",
    "sales": "dataset_fashion_store_sales.csv",
    "salesitems": "dataset_fashion_store_salesitems.csv",
    "stock": "dataset_fashion_store_stock.csv",
}

dfs = {}
for name, path in files.items():
    df = pd.read_csv(path)
    dfs[name] = df
    print(f"{name:10s} -> rows: {len(df):5d} | cols: {df.shape[1]:2d}")

list(dfs.keys())  # para confirmar que existe

campaigns  -> rows:     7 | cols:  7
channels   -> rows:     2 | cols:  2
customers  -> rows:  1000 | cols:  4
products   -> rows:   500 | cols:  9
sales      -> rows:   905 | cols:  7
salesitems -> rows:  2253 | cols: 13
stock      -> rows:  1000 | cols:  3


['campaigns',
 'channels',
 'customers',
 'products',
 'sales',
 'salesitems',
 'stock']

In [3]:
# salesitems.product_id debe existir en products.product_id
if {"salesitems","products"} <= dfs.keys():
    missing_products = dfs["salesitems"][~dfs["salesitems"]["product_id"].isin(dfs["products"]["product_id"])]
    print("FK check: salesitems.product_id not in products ->", len(missing_products))

# salesitems.sale_id debe existir en sales.sale_id
if {"salesitems","sales"} <= dfs.keys():
    missing_sales = dfs["salesitems"][~dfs["salesitems"]["sale_id"].isin(dfs["sales"]["sale_id"])]
    print("FK check: salesitems.sale_id not in sales ->", len(missing_sales))

# sales.customer_id debe existir en customers.customer_id
if {"sales","customers"} <= dfs.keys():
    missing_customers = dfs["sales"][~dfs["sales"]["customer_id"].isin(dfs["customers"]["customer_id"])]
    print("FK check: sales.customer_id not in customers ->", len(missing_customers))

FK check: salesitems.product_id not in products -> 0
FK check: salesitems.sale_id not in sales -> 0
FK check: sales.customer_id not in customers -> 0


In [4]:
import pandas as pd
from datetime import datetime

# Convierte fechas si existen
if "sales" in dfs and "sale_date" in dfs["sales"].columns:
    dfs["sales"]["sale_date"] = pd.to_datetime(dfs["sales"]["sale_date"], errors="coerce")

if "campaigns" in dfs:
    for col in ["start_date", "end_date"]:
        if col in dfs["campaigns"].columns:
            dfs["campaigns"][col] = pd.to_datetime(dfs["campaigns"][col], errors="coerce")

# Rango temporal de ventas y campañas
def date_range(series):
    return series.min(), series.max()

if "sales" in dfs and "sale_date" in dfs["sales"].columns:
    smin, smax = date_range(dfs["sales"]["sale_date"])
    print(f"Sales date range:     {smin}  →  {smax}")

if "campaigns" in dfs and {"start_date","end_date"}.issubset(dfs["campaigns"].columns):
    cmin, cmax = dfs["campaigns"]["start_date"].min(), dfs["campaigns"]["end_date"].max()
    print(f"Campaigns date range: {cmin}  →  {cmax}")

Sales date range:     2025-04-04 00:00:00  →  2025-06-17 00:00:00
Campaigns date range: 2025-04-01 00:00:00  →  2025-06-17 00:00:00


In [5]:
import numpy as np

def quick_profile(name, df):
    print("="*100)
    print(f"[{name.upper()}] shape={df.shape}")
    print("dtypes:\n", df.dtypes.to_string(), "\n")
    print("nulls:\n", df.isnull().sum().to_string(), "\n")
    print("duplicated rows (all cols):", df.duplicated().sum())
    num = df.select_dtypes(include=[np.number])
    if not num.empty:
        print("\nnumeric describe:\n", num.describe().T, "\n")

for n, d in dfs.items():
    quick_profile(n, d)

[CAMPAIGNS] shape=(7, 7)
dtypes:
 campaign_id                int64
campaign_name             object
start_date        datetime64[ns]
end_date          datetime64[ns]
channel                   object
discount_type             object
discount_value            object 

nulls:
 campaign_id       0
campaign_name     0
start_date        0
end_date          0
channel           0
discount_type     0
discount_value    0 

duplicated rows (all cols): 0

numeric describe:
              count  mean       std  min  25%  50%  75%  max
campaign_id    7.0   4.0  2.160247  1.0  2.5  4.0  5.5  7.0 

[CHANNELS] shape=(2, 2)
dtypes:
 channel        object
description    object 

nulls:
 channel        0
description    0 

duplicated rows (all cols): 0
[CUSTOMERS] shape=(1000, 4)
dtypes:
 customer_id     int64
country        object
age_range      object
signup_date    object 

nulls:
 customer_id    0
country        0
age_range      0
signup_date    0 

duplicated rows (all cols): 0

numeric describe:
    

In [6]:
# salesitems.product_id debe existir en products.product_id
if {"salesitems","products"} <= dfs.keys():
    missing_products = dfs["salesitems"][~dfs["salesitems"]["product_id"].isin(dfs["products"]["product_id"])]
    print("FK check: salesitems.product_id NOT in products ->", len(missing_products))

# salesitems.sale_id debe existir en sales.sale_id
if {"salesitems","sales"} <= dfs.keys():
    missing_sales = dfs["salesitems"][~dfs["salesitems"]["sale_id"].isin(dfs["sales"]["sale_id"])]
    print("FK check: salesitems.sale_id NOT in sales ->", len(missing_sales))

# sales.customer_id debe existir en customers.customer_id
if {"sales","customers"} <= dfs.keys():
    missing_customers = dfs["sales"][~dfs["sales"]["customer_id"].isin(dfs["customers"]["customer_id"])]
    print("FK check: sales.customer_id NOT in customers ->", len(missing_customers))

FK check: salesitems.product_id NOT in products -> 0
FK check: salesitems.sale_id NOT in sales -> 0
FK check: sales.customer_id NOT in customers -> 0


In [7]:
# Cantidades <= 0
if "salesitems" in dfs and "quantity" in dfs["salesitems"].columns:
    bad_qty = dfs["salesitems"][dfs["salesitems"]["quantity"] <= 0]
    print("salesitems: quantity <= 0 ->", len(bad_qty))
    display(bad_qty.head())

# Descuentos negativos o >100
if "salesitems" in dfs:
    si = dfs["salesitems"]
    if "discount_percent" in si.columns:
        bad_disc = si[(si["discount_percent"] < 0) | (si["discount_percent"] > 100)]
        print("salesitems: invalid discount_percent ->", len(bad_disc))
        display(bad_disc.head())

# Precios de producto <= 0
if "products" in dfs and "catalog_price" in dfs["products"].columns:
    bad_price = dfs["products"][dfs["products"]["catalog_price"] <= 0]
    print("products: catalog_price <= 0 ->", len(bad_price))
    display(bad_price.head())

salesitems: quantity <= 0 -> 0


Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns


TypeError: '<' not supported between instances of 'str' and 'int'

In [8]:
def top_uniques(df, cols, top=20):
    for c in cols:
        if c in df.columns:
            print(f"\n--- {c} uniques ({df[c].nunique()}):")
            print(df[c].value_counts(dropna=False).head(top))

top_uniques(dfs["products"], ["category","brand","gender","color","size"])
if "channels" in dfs:
    top_uniques(dfs["channels"], ["channel"])
if "customers" in dfs:
    top_uniques(dfs["customers"], ["country","age_range"])


--- category uniques (5):
category
Dresses      109
T-Shirts     108
Sleepwear    104
Shoes        100
Pants         79
Name: count, dtype: int64

--- brand uniques (1):
brand
Tiva    500
Name: count, dtype: int64

--- gender uniques (1):
gender
Female    500
Name: count, dtype: int64

--- color uniques (5):
color
Black    104
Green    102
Blue     102
Red       98
White     94
Name: count, dtype: int64

--- size uniques (9):
size
XS    107
S      84
XL     81
L      76
M      72
36     25
38     20
40     19
35     16
Name: count, dtype: int64

--- channel uniques (2):
channel
E-commerce    1
App Mobile    1
Name: count, dtype: int64

--- country uniques (6):
country
France         221
Germany        212
Italy          192
Netherlands    162
Spain          143
Portugal        70
Name: count, dtype: int64

--- age_range uniques (5):
age_range
16-25    207
26-35    206
36-45    204
46-55    198
56-65    185
Name: count, dtype: int64


In [9]:
from io import StringIO

report = StringIO()

def write(s=""):
    report.write(str(s) + "\n")

# Resumen por dataset
for name, df in dfs.items():
    write("="*100)
    write(f"[{name.upper()}] shape={df.shape}")
    write("dtypes:\n" + df.dtypes.to_string() + "\n")
    write("nulls:\n" + df.isnull().sum().to_string() + "\n")
    write(f"duplicated rows: {df.duplicated().sum()}\n")
    num = df.select_dtypes(include=[np.number])
    if not num.empty:
        write("numeric describe:\n" + num.describe().T.to_string() + "\n")

# Relaciones
if {"salesitems","products"} <= dfs.keys():
    write(f"FK salesitems.product_id not in products: {len(missing_products)}")
if {"salesitems","sales"} <= dfs.keys():
    write(f"FK salesitems.sale_id not in sales: {len(missing_sales)}")
if {"sales","customers"} <= dfs.keys():
    write(f"FK sales.customer_id not in customers: {len(missing_customers)}")

# Rangos de fechas
if "sales" in dfs and "sale_date" in dfs["sales"].columns:
    write(f"Sales date range: {dfs['sales']['sale_date'].min()} -> {dfs['sales']['sale_date'].max()}")
if "campaigns" in dfs and {"start_date","end_date"}.issubset(dfs["campaigns"].columns):
    write(f"Campaigns date range: {dfs['campaigns']['start_date'].min()} -> {dfs['campaigns']['end_date'].max()}")

# Guardar
with open("profiling_report.txt","w", encoding="utf-8") as f:
    f.write(report.getvalue())

print("Saved profiling_report.txt")

Saved profiling_report.txt


In [10]:
master_preview = (
    dfs["salesitems"]
      .merge(dfs["sales"], on="sale_id", how="left")
      .merge(dfs["products"], on="product_id", how="left")
      .merge(dfs["customers"], on="customer_id", how="left")
)
master_preview.head(10)

Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted_x,item_total,...,category,brand,color,size,catalog_price,cost_price,gender,country_y,age_range,signup_date
0,2270,658,403,1,81.8,81.8,0.0,0.00%,0,81.8,...,Dresses,Tiva,Red,L,81.8,45.12,Female,Portugal,46-55,2025-04-26
1,1170,336,284,1,81.79,81.79,0.0,0.00%,0,81.79,...,Shoes,Tiva,White,35,81.79,35.02,Female,France,16-25,2025-04-26
2,2496,1255,71,1,80.76,80.76,0.0,0.00%,0,80.76,...,Pants,Tiva,Red,XL,80.76,51.01,Female,Germany,36-45,2025-04-14
3,1273,331,98,1,78.52,78.52,0.0,0.00%,0,78.52,...,Shoes,Tiva,Black,38,78.52,41.48,Female,Italy,26-35,2025-01-30
4,1829,1079,98,1,78.52,78.52,0.0,0.00%,0,78.52,...,Shoes,Tiva,Black,38,78.52,41.48,Female,Germany,46-55,2025-03-02
5,3221,478,98,1,78.52,78.52,0.0,0.00%,0,78.52,...,Shoes,Tiva,Black,38,78.52,41.48,Female,France,16-25,2025-01-18
6,2442,1082,257,1,77.9,77.9,0.0,0.00%,0,77.9,...,Shoes,Tiva,Blue,40,77.9,53.04,Female,Portugal,46-55,2025-05-21
7,730,747,413,1,76.15,76.15,0.0,0.00%,0,76.15,...,Shoes,Tiva,Black,40,76.15,52.18,Female,Germany,16-25,2025-04-22
8,2917,258,392,1,75.47,75.47,0.0,0.00%,0,75.47,...,Shoes,Tiva,Black,40,75.47,39.53,Female,France,16-25,2025-04-04
9,3142,1144,392,1,75.47,75.47,0.0,0.00%,0,75.47,...,Shoes,Tiva,Black,40,75.47,39.53,Female,Portugal,16-25,2025-03-08


In [11]:
for name, df in dfs.items():
    print(f"\n{name.upper()} top uniques")
    print(df.nunique())


CAMPAIGNS top uniques
campaign_id       7
campaign_name     7
start_date        7
end_date          7
channel           4
discount_type     2
discount_value    5
dtype: int64

CHANNELS top uniques
channel        2
description    2
dtype: int64

CUSTOMERS top uniques
customer_id    1000
country           6
age_range         5
signup_date     352
dtype: int64

PRODUCTS top uniques
product_id       500
product_name     500
category           5
brand              1
color              5
size               9
catalog_price    475
cost_price       456
gender             1
dtype: int64

SALES top uniques
sale_id         905
channel           2
discounted        2
total_amount    898
sale_date        51
customer_id     580
country           6
dtype: int64

SALESITEMS top uniques
item_id              2253
sale_id               905
product_id            499
quantity                5
original_price        475
unit_price            638
discount_applied      182
discount_percent        3
discounted 

In [12]:
products[products["catalog_price"] <= 0]

NameError: name 'products' is not defined

In [13]:
salesitems[
    (salesitems["discount_percent"] < 0) | 
    (salesitems["discount_percent"] > 100)
]

NameError: name 'salesitems' is not defined

In [1]:
products[products["catalog_price"] <= 0]

NameError: name 'products' is not defined

In [2]:
salesitems[
    (salesitems["discount_percent"] < 0) | 
    (salesitems["discount_percent"] > 100)
]

NameError: name 'salesitems' is not defined

In [3]:
products["catalog_price"].describe()
salesitems["unit_price"].describe()


NameError: name 'products' is not defined

In [4]:
import pandas as pd

products = pd.read_csv("dataset_fashion_store_products.csv")
salesitems = pd.read_csv("dataset_fashion_store_salesitems.csv")

print(products.shape)
print(salesitems.shape)

(500, 9)
(2253, 13)


In [5]:
# 1. Precios <= 0 en catálogo (productos)
products[products["catalog_price"] <= 0]

Unnamed: 0,product_id,product_name,category,brand,color,size,catalog_price,cost_price,gender


In [6]:
# 2. Descuentos negativos o mayores de 100% (salesitems)
salesitems[
    (salesitems["discount_percent"] < 0) |
    (salesitems["discount_percent"] > 100)
]

TypeError: '<' not supported between instances of 'str' and 'int'

In [7]:
# 3. Estadísticas generales
products["catalog_price"].describe()
salesitems["unit_price"].describe()

count    2253.000000
mean       48.141074
std        13.373302
min        11.680000
25%        38.490000
50%        47.730000
75%        57.350000
max        85.900000
Name: unit_price, dtype: float64

In [8]:
salesitems["discount_percent"].dtype

dtype('O')

In [9]:
salesitems["discount_percent"].unique()[:50]  # ver primeros 50 valores únicos

array(['0.00%', '10.00%', '30.00%'], dtype=object)

In [10]:
salesitems[salesitems["discount_percent"].apply(lambda x: isinstance(x, str))].head(20)

Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns
0,2270,658,403,1,81.8,81.8,0.0,0.00%,0,81.8,2025-06-16,App Mobile,App Mobile
1,1170,336,284,1,81.79,81.79,0.0,0.00%,0,81.79,2025-06-17,E-commerce,Website Banner
2,2496,1255,71,1,80.76,80.76,0.0,0.00%,0,80.76,2025-04-16,App Mobile,App Mobile
3,1273,331,98,1,78.52,78.52,0.0,0.00%,0,78.52,2025-05-06,App Mobile,App Mobile
4,1829,1079,98,1,78.52,78.52,0.0,0.00%,0,78.52,2025-06-15,App Mobile,App Mobile
5,3221,478,98,1,78.52,78.52,0.0,0.00%,0,78.52,2025-04-14,App Mobile,App Mobile
6,2442,1082,257,1,77.9,77.9,0.0,0.00%,0,77.9,2025-04-06,App Mobile,App Mobile
7,730,747,413,1,76.15,76.15,0.0,0.00%,0,76.15,2025-04-21,App Mobile,App Mobile
8,2917,258,392,1,75.47,75.47,0.0,0.00%,0,75.47,2025-05-29,E-commerce,Website Banner
9,3142,1144,392,1,75.47,75.47,0.0,0.00%,0,75.47,2025-05-20,E-commerce,Website Banner


In [12]:
salesitems["discount_percent"] = pd.to_numeric(
    salesitems["discount_percent"], 
    errors="coerce"
)

In [13]:
salesitems["discount_percent"] = pd.to_numeric(
    salesitems["discount_percent"], 
    errors="coerce"
)

In [14]:
salesitems[
    (salesitems["discount_percent"] < 0) |
    (salesitems["discount_percent"] > 100)
]

Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns


In [15]:
salesitems["discount_percent"].isna().sum()

np.int64(2253)

In [16]:
[col for col in salesitems.columns if "discount" in col.lower()]

salesitems.head(3)[["original_price","unit_price","discount_applied"]]

Unnamed: 0,original_price,unit_price,discount_applied
0,81.8,81.8,0.0
1,81.79,81.79,0.0
2,80.76,80.76,0.0


In [17]:
import pandas as pd
import numpy as np

for col in ["original_price","unit_price","quantity","discount_applied"]:
    if col in salesitems.columns:
        salesitems[col] = pd.to_numeric(salesitems[col], errors="coerce")

In [18]:
import pandas as pd
import numpy as np

for col in ["original_price","unit_price","quantity","discount_applied"]:
    if col in salesitems.columns:
        salesitems[col] = pd.to_numeric(salesitems[col], errors="coerce")

In [19]:
salesitems["discount_percent_calc"] = np.where(
    salesitems["original_price"] > 0,
    (salesitems["original_price"] - salesitems["unit_price"]) / salesitems["original_price"] * 100,
    np.nan
)
salesitems[["original_price","unit_price","discount_applied","discount_percent_calc"]].head()

Unnamed: 0,original_price,unit_price,discount_applied,discount_percent_calc
0,81.8,81.8,0.0,0.0
1,81.79,81.79,0.0,0.0
2,80.76,80.76,0.0,0.0
3,78.52,78.52,0.0,0.0
4,78.52,78.52,0.0,0.0


In [20]:
# Cantidades imposibles
bad_qty = salesitems[salesitems["quantity"] <= 0]

# Descuentos fuera de rango
bad_disc_pct = salesitems[
    (salesitems["discount_percent_calc"] < 0) |
    (salesitems["discount_percent_calc"] > 100)
]

# Incoherencias entre flag y cálculo
bad_flag_vs_calc = salesitems[
    ((salesitems["discount_applied"] == 1) & (salesitems["discount_percent_calc"].fillna(0) == 0)) |
    ((salesitems["discount_applied"] == 0) & (salesitems["discount_percent_calc"].fillna(0) > 0))
]

len(bad_qty), len(bad_disc_pct), len(bad_flag_vs_calc)

(0, 0, 0)

In [21]:

import numpy as np

def outlier_mask(s, low=0.01, high=0.99):
    ql, qh = s.quantile(low), s.quantile(high)
    return (s < ql) | (s > qh)

price_out = outlier_mask(salesitems["unit_price"])
qty_out   = outlier_mask(salesitems["quantity"])
disc_out  = outlier_mask(salesitems["discount_percent_calc"].dropna())

price_out.sum(), qty_out.sum(), disc_out.sum()

(np.int64(46), np.int64(0), np.int64(23))

In [22]:
salesitems[price_out].sort_values("unit_price").head(10)

Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns,discount_percent_calc
1807,896,376,195,4,16.69,11.68,5.01,,1,46.73,2025-05-19,App Mobile,App Mobile,30.017975
939,715,1042,195,2,16.69,11.68,5.01,,1,23.37,2025-05-13,App Mobile,Social Media,30.017975
454,1913,237,64,1,13.51,13.51,0.0,,0,13.51,2025-05-16,E-commerce,Website Banner,0.0
938,3337,1304,198,2,14.64,14.64,0.0,,0,29.28,2025-04-14,App Mobile,App Mobile,0.0
1806,1537,193,198,4,14.64,14.64,0.0,,0,58.56,2025-05-18,E-commerce,Website Banner,0.0
1346,2414,1243,198,3,14.64,14.64,0.0,,0,43.92,2025-05-19,E-commerce,Website Banner,0.0
1345,466,425,198,3,14.64,14.64,0.0,,0,43.92,2025-04-16,E-commerce,Website Banner,0.0
937,197,1109,198,2,14.64,14.64,0.0,,0,29.28,2025-05-27,E-commerce,Website Banner,0.0
453,813,1116,198,1,14.64,14.64,0.0,,0,14.64,2025-05-21,E-commerce,Website Banner,0.0
452,406,376,114,1,21.53,15.07,6.46,,1,15.07,2025-05-19,App Mobile,Social Media,30.004645


In [23]:
salesitems[disc_out].sort_values("discount_percent_calc").head(10)

Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns,discount_percent_calc
1795,3271,440,454,4,35.16,24.61,10.55,,1,98.45,2025-05-15,App Mobile,App Mobile,30.005688
859,767,552,11,2,51.09,35.76,15.33,,1,71.53,2025-05-15,App Mobile,Social Media,30.005872
1584,1509,1337,315,4,67.02,46.91,20.11,,1,187.66,2025-05-18,App Mobile,Social Media,30.005968
711,384,1226,478,2,66.52,46.56,19.96,,1,93.13,2025-05-18,App Mobile,Social Media,30.006013
868,734,921,268,2,49.59,34.71,14.88,,1,69.43,2025-05-19,App Mobile,Social Media,30.00605
390,358,619,228,1,46.89,32.82,14.07,,1,32.82,2025-05-15,App Mobile,Social Media,30.006398
2083,1160,1059,428,5,61.22,42.85,18.37,,1,214.27,2025-05-18,App Mobile,App Mobile,30.006534
2084,2942,286,352,5,61.12,42.78,18.34,,1,213.92,2025-05-18,App Mobile,App Mobile,30.006545
2092,153,1148,191,5,60.52,42.36,18.16,,1,211.82,2025-05-13,App Mobile,App Mobile,30.006609
1762,2800,297,404,4,45.19,31.63,13.56,,1,126.53,2025-05-17,App Mobile,Social Media,30.006639


In [24]:
# ✅ 1. Importar numpy (por si no está ya)
import numpy as np

# ✅ 2. Definir la función para detectar outliers por percentiles
def outlier_mask(s, low=0.01, high=0.99):
    ql, qh = s.quantile(low), s.quantile(high)
    return (s < ql) | (s > qh)

# ✅ 3. Generar máscaras para cada variable importante
price_out_mask = outlier_mask(salesitems["unit_price"])
qty_out_mask   = outlier_mask(salesitems["quantity"])
disc_out_mask  = outlier_mask(salesitems["discount_percent_calc"].dropna())

# ✅ 4. Crear columnas nuevas para marcar outliers (True/False)
salesitems["is_price_outlier"] = price_out_mask
salesitems["is_quantity_outlier"] = qty_out_mask
salesitems["is_discount_outlier"] = salesitems["discount_percent_calc"].apply(
    lambda x: x < salesitems["discount_percent_calc"].quantile(0.01) or 
              x > salesitems["discount_percent_calc"].quantile(0.99)
    if not np.isnan(x) else False
)

# ✅ 5. Verificar cuántos outliers hay en cada caso
print("Price outliers:", salesitems["is_price_outlier"].sum())
print("Quantity outliers:", salesitems["is_quantity_outlier"].sum())
print("Discount outliers:", salesitems["is_discount_outlier"].sum())

# ✅ 6. Mostrar algunos ejemplos de filas marcadas como outliers
salesitems[salesitems["is_price_outlier"] | salesitems["is_discount_outlier"]].head(10)

Price outliers: 46
Quantity outliers: 0
Discount outliers: 23


Unnamed: 0,item_id,sale_id,product_id,quantity,original_price,unit_price,discount_applied,discount_percent,discounted,item_total,sale_date,channel,channel_campaigns,discount_percent_calc,is_price_outlier,is_quantity_outlier,is_discount_outlier
0,2270,658,403,1,81.8,81.8,0.0,,0,81.8,2025-06-16,App Mobile,App Mobile,0.0,True,False,False
1,1170,336,284,1,81.79,81.79,0.0,,0,81.79,2025-06-17,E-commerce,Website Banner,0.0,True,False,False
2,2496,1255,71,1,80.76,80.76,0.0,,0,80.76,2025-04-16,App Mobile,App Mobile,0.0,True,False,False
390,358,619,228,1,46.89,32.82,14.07,,1,32.82,2025-05-15,App Mobile,Social Media,30.006398,False,False,True
446,1068,428,51,1,25.38,17.77,7.61,,1,17.77,2025-05-14,App Mobile,Social Media,29.98424,True,False,False
447,2405,588,14,1,24.96,17.47,7.49,,1,17.47,2025-05-14,App Mobile,Social Media,30.008013,True,False,True
448,1340,24,205,1,24.05,16.84,7.22,,1,16.84,2025-05-13,App Mobile,Social Media,29.97921,True,False,False
449,2382,172,443,1,23.13,16.19,6.94,,1,16.19,2025-05-13,App Mobile,Social Media,30.004323,True,False,False
450,1102,558,164,1,22.96,16.07,6.89,,1,16.07,2025-05-17,App Mobile,Social Media,30.008711,True,False,True
451,433,428,491,1,22.66,15.86,6.8,,1,15.86,2025-05-14,App Mobile,Social Media,30.008826,True,False,True


In [25]:
salesitems["unit_price"].describe()

count    2253.000000
mean       48.141074
std        13.373302
min        11.680000
25%        38.490000
50%        47.730000
75%        57.350000
max        85.900000
Name: unit_price, dtype: float64

In [26]:
min_price = salesitems["unit_price"].min()
max_price = salesitems["unit_price"].max()

print("Precio mínimo:", min_price)
print("Precio máximo:", max_price)

Precio mínimo: 11.68
Precio máximo: 85.9


In [27]:
import matplotlib.pyplot as plt

salesitems["unit_price"].plot(kind="box", figsize=(5,6), title="Unit Price Boxplot")
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [28]:
num_products = products["product_id"].nunique()
print("Número de productos únicos:", num_products)

Número de productos únicos: 500


In [29]:
num_categories = products["category"].nunique()
print("Número de categorías únicas:", num_categories)

Número de categorías únicas: 5


In [30]:
products["category"].value_counts()

category
Dresses      109
T-Shirts     108
Sleepwear    104
Shoes        100
Pants         79
Name: count, dtype: int64

In [31]:
len(salesitems)

2253

In [32]:
salesitems.dtypes

item_id                    int64
sale_id                    int64
product_id                 int64
quantity                   int64
original_price           float64
unit_price               float64
discount_applied         float64
discount_percent         float64
discounted                 int64
item_total               float64
sale_date                 object
channel                   object
channel_campaigns         object
discount_percent_calc    float64
is_price_outlier            bool
is_quantity_outlier         bool
is_discount_outlier         bool
dtype: object

In [33]:
salesitems.dtypes

item_id                    int64
sale_id                    int64
product_id                 int64
quantity                   int64
original_price           float64
unit_price               float64
discount_applied         float64
discount_percent         float64
discounted                 int64
item_total               float64
sale_date                 object
channel                   object
channel_campaigns         object
discount_percent_calc    float64
is_price_outlier            bool
is_quantity_outlier         bool
is_discount_outlier         bool
dtype: object

In [34]:
salesitems["sale_date"] = pd.to_datetime(
    salesitems["sale_date"], 
    errors="coerce"   # convierte valores no válidos en NaT (Not a Time)
)

In [35]:
print(salesitems["sale_date"].dtypes)
salesitems["sale_date"].head()

datetime64[ns]


0   2025-06-16
1   2025-06-17
2   2025-04-16
3   2025-05-06
4   2025-06-15
Name: sale_date, dtype: datetime64[ns]

In [36]:
salesitems["year"] = salesitems["sale_date"].dt.year
salesitems["month"] = salesitems["sale_date"].dt.month
salesitems["year_month"] = salesitems["sale_date"].dt.to_period("M")

In [37]:
salesitems["sale_date"] = pd.to_datetime(
    salesitems["sale_date"], 
    format="%d/%m/%Y", 
    errors="coerce"
)

In [38]:
salesitems.dtypes

item_id                           int64
sale_id                           int64
product_id                        int64
quantity                          int64
original_price                  float64
unit_price                      float64
discount_applied                float64
discount_percent                float64
discounted                        int64
item_total                      float64
sale_date                datetime64[ns]
channel                          object
channel_campaigns                object
discount_percent_calc           float64
is_price_outlier                   bool
is_quantity_outlier                bool
is_discount_outlier                bool
year                              int32
month                             int32
year_month                    period[M]
dtype: object

In [39]:
salesitems["line_total"] = salesitems["unit_price"] * salesitems["quantity"]


In [40]:
salesitems["year_month"] = salesitems["sale_date"].dt.to_period("M")

In [41]:
monthly_sales = (
    salesitems.groupby("year_month")["line_total"]
    .sum()
    .reset_index()
    .sort_values("year_month")
)
monthly_sales

Unnamed: 0,year_month,line_total
0,2025-04,132588.01
1,2025-05,143471.57
2,2025-06,48177.29


In [42]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,5))
plt.plot(monthly_sales["year_month"].astype(str), monthly_sales["line_total"], marker="o")
plt.title("Monthly Sales Trend")
plt.xlabel("Month")
plt.ylabel("Total Sales (€)")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [43]:
best_month = monthly_sales.loc[monthly_sales["line_total"].idxmax()]
print(f"Best month: {best_month['year_month']} with total sales of {best_month['line_total']:.2f}")

Best month: 2025-05 with total sales of 143471.57


In [44]:
# Ver cuántos países diferentes hay
customers["country"].nunique()

NameError: name 'customers' is not defined

In [45]:
sales["customer_country"].nunique()

NameError: name 'sales' is not defined

In [46]:
customers["country"].value_counts()

NameError: name 'customers' is not defined

In [47]:

import pandas as pd

# Cargar el dataset de customers
customers = pd.read_csv("dataset_fashion_store_customers.csv")

# Verificar que se cargó bien
customers.head()

Unnamed: 0,customer_id;country;age_range;signup_date
0,1;France;56-65;24/4/25
1,2;France;36-45;24/2/25
2,3;Netherlands;46-55;12/4/24
3,4;Italy;36-45;11/3/25
4,5;Spain;26-35;26/4/25


In [48]:
# Ver cuántos países distintos hay
customers["country"].nunique()

# Ver el recuento por país
customers["country"].value_counts()

KeyError: 'country'

In [49]:
import pandas as pd

customers = pd.read_csv(
    "dataset_fashion_store_customers.csv",
    sep=";",                # <— clave
    parse_dates=["signup_date"],  # si la fecha viene como 24/4/25
    dayfirst=True           # día/mes/año
)

customers.head()
customers.columns

  customers = pd.read_csv(


Index(['customer_id', 'country', 'age_range', 'signup_date'], dtype='object')

In [50]:
customers["country"] = (
    customers["country"]
    .astype(str).str.strip().str.title()  # " spain " -> "Spain"
)

In [51]:
num_countries = customers["country"].nunique()
country_counts = customers["country"].value_counts()

print("Unique countries:", num_countries)
country_counts.head(10)

Unique countries: 6


country
France         221
Germany        212
Italy          192
Netherlands    162
Spain          143
Portugal        70
Name: count, dtype: int64

In [52]:
salesitems.dtypes

item_id                           int64
sale_id                           int64
product_id                        int64
quantity                          int64
original_price                  float64
unit_price                      float64
discount_applied                float64
discount_percent                float64
discounted                        int64
item_total                      float64
sale_date                datetime64[ns]
channel                          object
channel_campaigns                object
discount_percent_calc           float64
is_price_outlier                   bool
is_quantity_outlier                bool
is_discount_outlier                bool
year                              int32
month                             int32
year_month                    period[M]
line_total                      float64
dtype: object

In [1]:
import pandas as pd

# Supongamos que tu DataFrame se llama df y la columna de precios es 'unit_price'
p01 = df['unit_price'].quantile(0.01)
p99 = df['unit_price'].quantile(0.99)

print("P01:", p01)
print("P99:", p99)

NameError: name 'df' is not defined

In [2]:
outliers = df[(df['unit_price'] < p01) | (df['unit_price'] > p99)]
print(len(outliers))
outliers.head(10)   # Para ver los primeros 10

NameError: name 'df' is not defined

In [3]:
# Calcular percentiles 1% y 99%
p01 = salesitems['unit_price'].quantile(0.01)
p99 = salesitems['unit_price'].quantile(0.99)

print("P01:", p01)
print("P99:", p99)

NameError: name 'salesitems' is not defined

In [4]:
import pandas as pd

# Carga tu dataset limpio (el que exportaste en la parte de wrangling)
salesitems = pd.read_csv('master_clean.csv')

# Verifica que está cargado
salesitems.head()

Unnamed: 0,item_id,sale_id,product_id,sale_date,year,month,year_month,quantity,unit_price,original_price,...,discount_percent,discount_percent_calc,discount_applied,discounted,channel,channel_campaigns,category,country,age_range,signup_date
0,2967,515,137,2025-05-20,2025,5,2025-05,5,46.52,46.52,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,GERMANY,46-55,
1,919,239,200,2025-04-20,2025,4,2025-04,5,46.41,46.41,...,,0.0,0.0,0,E-commerce,Website Banner,Sleepwear,FRANCE,36-45,
2,1642,883,261,2025-05-21,2025,5,2025-05,5,46.1,46.1,...,,0.0,0.0,0,App Mobile,App Mobile,T-Shirts,PORTUGAL,46-55,
3,1674,665,62,2025-04-13,2025,4,2025-04,5,46.09,46.09,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,SPAIN,16-25,
4,159,1220,155,2025-05-21,2025,5,2025-05,5,46.07,46.07,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,FRANCE,46-55,


In [5]:
p01 = salesitems['unit_price'].quantile(0.01)
p99 = salesitems['unit_price'].quantile(0.99)

print("P01:", p01)
print("P99:", p99)

P01: 22.5428
P99: 77.01039999999995


In [6]:
outliers = salesitems[(salesitems['unit_price'] < p01) | (salesitems['unit_price'] > p99)]
print("Número de outliers:", len(outliers))
outliers.head(10)

Número de outliers: 44


Unnamed: 0,item_id,sale_id,product_id,sale_date,year,month,year_month,quantity,unit_price,original_price,...,discount_percent,discount_percent_calc,discount_applied,discounted,channel,channel_campaigns,category,country,age_range,signup_date
192,1938,297,401,2025-05-17,2025,5,2025-05,5,21.58,30.83,...,,30.003244,9.25,1,App Mobile,App Mobile,Dresses,ITALY,56-65,
193,1104,1270,114,2025-04-24,2025,4,2025-04,5,21.53,21.53,...,,0.0,0.0,0,App Mobile,App Mobile,Shoes,FRANCE,46-55,
194,3296,189,114,2025-04-25,2025,4,2025-04,5,21.53,21.53,...,,0.0,0.0,0,E-commerce,Website Banner,Shoes,NETHERLANDS,56-65,
195,2412,1014,355,2025-05-16,2025,5,2025-05,5,21.38,21.38,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,GERMANY,16-25,
196,516,1300,334,2025-04-15,2025,4,2025-04,5,20.82,20.82,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,GERMANY,36-45,
197,1240,727,334,2025-04-22,2025,4,2025-04,5,20.82,20.82,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,NETHERLANDS,26-35,
198,2394,172,246,2025-05-13,2025,5,2025-05,5,20.29,28.98,...,,29.986197,8.69,1,App Mobile,App Mobile,Dresses,SPAIN,16-25,
199,1273,331,98,2025-05-06,2025,5,2025-05,1,78.52,78.52,...,,0.0,0.0,0,App Mobile,App Mobile,Shoes,ITALY,26-35,
200,1829,1079,98,2025-06-15,2025,6,2025-06,1,78.52,78.52,...,,0.0,0.0,0,App Mobile,App Mobile,Shoes,GERMANY,46-55,
201,3221,478,98,2025-04-14,2025,4,2025-04,1,78.52,78.52,...,,0.0,0.0,0,App Mobile,App Mobile,Shoes,FRANCE,16-25,


In [7]:
outliers = salesitems[(salesitems['unit_price'] < p01) | (salesitems['unit_price'] > p99)]
outliers.sort_values('unit_price')

Unnamed: 0,item_id,sale_id,product_id,sale_date,year,month,year_month,quantity,unit_price,original_price,...,discount_percent,discount_percent_calc,discount_applied,discounted,channel,channel_campaigns,category,country,age_range,signup_date
1108,3306,1173,246,2025-05-16,2025,5,2025-05,2,20.29,28.98,...,,29.986197,8.69,1,App Mobile,Social Media,Dresses,SPAIN,56-65,
198,2394,172,246,2025-05-13,2025,5,2025-05,5,20.29,28.98,...,,29.986197,8.69,1,App Mobile,App Mobile,Dresses,SPAIN,16-25,
640,362,868,334,2025-05-24,2025,5,2025-05,1,20.82,20.82,...,,0.0,0.0,0,App Mobile,App Mobile,Dresses,SPAIN,56-65,
196,516,1300,334,2025-04-15,2025,4,2025-04,5,20.82,20.82,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,GERMANY,36-45,
197,1240,727,334,2025-04-22,2025,4,2025-04,5,20.82,20.82,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,NETHERLANDS,26-35,
1953,3215,660,334,2025-06-14,2025,6,2025-06,4,20.82,20.82,...,,0.0,0.0,0,App Mobile,App Mobile,Dresses,NETHERLANDS,46-55,
1952,1886,668,334,2025-04-20,2025,4,2025-04,4,20.82,20.82,...,,0.0,0.0,0,E-commerce,Website Banner,Dresses,FRANCE,46-55,
1106,652,819,355,2025-04-26,2025,4,2025-04,2,21.38,21.38,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,PORTUGAL,36-45,
1507,2420,491,355,2025-05-23,2025,5,2025-05,3,21.38,21.38,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,NETHERLANDS,56-65,
1951,1232,818,355,2025-06-16,2025,6,2025-06,4,21.38,21.38,...,,0.0,0.0,0,E-commerce,Website Banner,Pants,FRANCE,16-25,


In [8]:
def outlier_mask(s, low=0.01, high=0.99):
    ql, qh = s.quantile(low), s.quantile(high)
    return (s < ql) | (s > qh)

In [9]:
salesitems['unit_price']

0       46.52
1       46.41
2       46.10
3       46.09
4       46.07
        ...  
2188    46.90
2189    46.89
2190    46.89
2191    46.89
2192    46.72
Name: unit_price, Length: 2193, dtype: float64

In [10]:
print("Número de filas:", len(salesitems))
print("P01:", salesitems['unit_price'].quantile(0.01))
print("P99:", salesitems['unit_price'].quantile(0.99))

outliers = salesitems[
    (salesitems['unit_price'] < salesitems['unit_price'].quantile(0.01)) |
    (salesitems['unit_price'] > salesitems['unit_price'].quantile(0.99))
]

print("Outliers:", len(outliers))

Número de filas: 2193
P01: 22.5428
P99: 77.01039999999995
Outliers: 44


In [11]:
# Suponiendo que guardaste el DataFrame original en 'salesitems_original'
p01_orig = salesitems_original['unit_price'].quantile(0.01)
p99_orig = salesitems_original['unit_price'].quantile(0.99)
outliers_orig = salesitems_original[
    (salesitems_original['unit_price'] < p01_orig) |
    (salesitems_original['unit_price'] > p99_orig)
]

p01_new = salesitems['unit_price'].quantile(0.01)
p99_new = salesitems['unit_price'].quantile(0.99)
outliers_new = salesitems[
    (salesitems['unit_price'] < p01_new) |
    (salesitems['unit_price'] > p99_new)
]

# Comparar IDs (por ejemplo item_id o sale_id)
diff = outliers_orig[~outliers_orig['item_id'].isin(outliers_new['item_id'])]
print(diff)

NameError: name 'salesitems_original' is not defined

In [12]:
p01_disc = salesitems['discount_percent_calc'].quantile(0.01)
p99_disc = salesitems['discount_percent_calc'].quantile(0.99)

print("P01 discount:", p01_disc)
print("P99 discount:", p99_disc)

P01 discount: 0.0
P99 discount: 30.00252527591848


In [13]:
discount_outliers = salesitems[
    (salesitems['discount_percent_calc'] < p01_disc) |
    (salesitems['discount_percent_calc'] > p99_disc)
]

print("Número de discount outliers:", len(discount_outliers))
discount_outliers.head(10)  # Ver los primeros 10

Número de discount outliers: 22


Unnamed: 0,item_id,sale_id,product_id,sale_date,year,month,year_month,quantity,unit_price,original_price,...,discount_percent,discount_percent_calc,discount_applied,discounted,channel,channel_campaigns,category,country,age_range,signup_date
50,1771,821,291,2025-05-14,2025,5,2025-05,5,41.69,59.56,...,,30.003358,17.87,1,App Mobile,App Mobile,Shoes,FRANCE,26-35,
60,2595,81,321,2025-05-18,2025,5,2025-05,5,40.66,58.09,...,,30.005164,17.43,1,App Mobile,App Mobile,Pants,GERMANY,16-25,
121,2044,295,175,2025-05-13,2025,5,2025-05,5,34.41,49.16,...,,30.004068,14.75,1,App Mobile,App Mobile,T-Shirts,SPAIN,46-55,
192,1938,297,401,2025-05-17,2025,5,2025-05,5,21.58,30.83,...,,30.003244,9.25,1,App Mobile,App Mobile,Dresses,ITALY,56-65,
441,601,423,19,2025-05-13,2025,5,2025-05,1,45.68,65.26,...,,30.003065,19.58,1,App Mobile,Social Media,Dresses,PORTUGAL,36-45,
442,2347,1296,19,2025-05-18,2025,5,2025-05,1,45.68,65.26,...,,30.003065,19.58,1,App Mobile,Social Media,Dresses,SPAIN,56-65,
489,3108,356,291,2025-05-13,2025,5,2025-05,1,41.69,59.56,...,,30.003358,17.87,1,App Mobile,Social Media,Shoes,GERMANY,16-25,
502,94,558,22,2025-05-17,2025,5,2025-05,1,40.78,58.26,...,,30.003433,17.48,1,App Mobile,Social Media,T-Shirts,ITALY,16-25,
534,1363,199,107,2025-05-19,2025,5,2025-05,1,38.54,55.06,...,,30.003632,16.52,1,App Mobile,Social Media,Shoes,NETHERLANDS,26-35,
886,1194,588,239,2025-05-14,2025,5,2025-05,2,46.89,66.99,...,,30.004478,20.1,1,App Mobile,Social Media,Pants,ITALY,36-45,


In [1]:
import pandas as pd
import numpy as np

# 1) Cargar (ajusta el nombre si lo guardaste distinto)
df = pd.read_csv("master_clean.csv")

# Si sale_date no es datetime:
if not np.issubdtype(pd.Series(df["sale_date"]).dtype, np.datetime64):
    df["sale_date"] = pd.to_datetime(df["sale_date"], errors="coerce")

# 2) Helpers
def exists(col): 
    return col in df.columns

# 3) Básicos
rows = len(df)
min_date = df["sale_date"].min()
max_date = df["sale_date"].max()

num_products   = df["product_id"].nunique() if exists("product_id") else None
num_categories = df["category"].nunique()   if exists("category")   else None
num_countries  = df["country"].nunique()    if exists("country")    else None

# 4) Precios
unit_price_min = df["unit_price"].min()
unit_price_max = df["unit_price"].max()
unit_price_mean = df["unit_price"].mean()

# 5) Descuentos (usa discount_percent_calc si existe, si no discount_percent)
disc_col = "discount_percent_calc" if exists("discount_percent_calc") else ("discount_percent" if exists("discount_percent") else None)
if disc_col:
    disc_min  = df[disc_col].min()
    disc_max  = df[disc_col].max()
    disc_mean = df[disc_col].mean()
    # Outliers por percentiles 1% y 99%
    p01 = df[disc_col].quantile(0.01)
    p99 = df[disc_col].quantile(0.99)
    disc_outliers = df[(df[disc_col] < p01) | (df[disc_col] > p99)]
    disc_outliers_n = len(disc_outliers)
else:
    disc_min = disc_max = disc_mean = p01 = p99 = disc_outliers_n = None

# 6) Cantidades
qty_min = df["quantity"].min()
qty_max = df["quantity"].max()
qty_mean = df["quantity"].mean()

# 7) Revenue (si no está line_total, lo calculamos)
if "line_total" not in df.columns:
    # Si tienes original_price y discount_percent_calc, podrías refinar; aquí usamos unit_price * quantity
    df["line_total"] = df["unit_price"] * df["quantity"]

revenue_sum  = df["line_total"].sum()
revenue_mean = df["line_total"].mean()
revenue_max  = df["line_total"].max()

# 8) Top países (si existe)
top_countries = (df["country"].value_counts().head(5) if exists("country") else None)

# 9) Resultados
print("=== Dataset Overview ===")
print(f"Rows: {rows}")
print(f"Date range: {min_date.date() if pd.notna(min_date) else None} → {max_date.date() if pd.notna(max_date) else None}")
print(f"Unique products: {num_products}")
print(f"Unique categories: {num_categories}")
print(f"Unique countries: {num_countries}\n")

print("=== Unit Price ===")
print(f"min={unit_price_min:.2f} | max={unit_price_max:.2f} | mean={unit_price_mean:.2f}\n")

if disc_col:
    print(f"=== Discount ({disc_col}) ===")
    print(f"min={disc_min:.2f} | max={disc_max:.2f} | mean={disc_mean:.2f}")
    print(f"P01={p01:.2f} | P99={p99:.2f} | outliers={disc_outliers_n}\n")
else:
    print("No discount column found.\n")

print("=== Quantity ===")
print(f"min={qty_min} | max={qty_max} | mean={qty_mean:.2f}\n")

print("=== Revenue ===")
print(f"sum={revenue_sum:,.2f} | avg per line={revenue_mean:,.2f} | max line={revenue_max:,.2f}\n")

if top_countries is not None:
    print("=== Top Countries (by rows) ===")
    print(top_countries)

# 10) (Opcional) Exporta outliers de discount y precio para anexos
# Precio outliers 1–99%
p01_price = df["unit_price"].quantile(0.01)
p99_price = df["unit_price"].quantile(0.99)
price_outliers = df[(df["unit_price"] < p01_price) | (df["unit_price"] > p99_price)]
price_outliers.to_csv("outliers_unit_price.csv", index=False)
if disc_col:
    disc_outliers.to_csv("outliers_discount.csv", index=False)
print("\nExported: outliers_unit_price.csv", ("and outliers_discount.csv" if disc_col else ""))

=== Dataset Overview ===
Rows: 2193
Date range: 2025-04-04 → 2025-06-17
Unique products: 491
Unique categories: 5
Unique countries: 6

=== Unit Price ===
min=20.29 | max=80.00 | mean=48.22

=== Discount (discount_percent_calc) ===
min=0.00 | max=30.01 | mean=2.09
P01=0.00 | P99=30.00 | outliers=22

=== Quantity ===
min=1 | max=5 | mean=2.99

=== Revenue ===
sum=316,382.55 | avg per line=144.27 | max line=400.00

=== Top Countries (by rows) ===
country
GERMANY        522
FRANCE         477
ITALY          400
NETHERLANDS    324
SPAIN          271
Name: count, dtype: int64

Exported: outliers_unit_price.csv and outliers_discount.csv


In [2]:
products = pd.read_csv("dataset_fashion_store_products.csv")
sales = pd.read_csv("dataset_fashion_store_sales.csv")
salesitems = pd.read_csv("dataset_fashion_store_salesitems.csv")
customers = pd.read_csv("dataset_fashion_store_customers.csv")
channels = pd.read_csv("dataset_fashion_store_channels.csv")
campaigns = pd.read_csv("dataset_fashion_store_campaigns.csv")
stock = pd.read_csv("dataset_fashion_store_stock.csv")


In [3]:
# Ver número de productos y categorías
print(products['product_id'].nunique())
print(products['category'].nunique())

# Revisar descuentos en salesitems
print(salesitems['discount_percent'].describe())

# Revisar fechas en sales
sales['sale_date'] = pd.to_datetime(sales['sale_date'], errors='coerce')
print(sales['sale_date'].min(), sales['sale_date'].max())

KeyError: 'product_id'

In [4]:
import pandas as pd

products = pd.read_csv("dataset_fashion_store_products.csv")
print(products.head())   # para ver las primeras filas
print(products.columns)  # para ver los nombres exactos de columnas

  product_id;product_name;category;brand;color;size;catalog_price;cost_price;gender
0  1;Soft Wrap Dress;Dresses;Tiva;Green;S;40.41;2...                               
1  2;Soft Wrap Tee;T-Shirts;Tiva;White;S;78.45;53...                               
2  3;Soft Linen Tee;T-Shirts;Tiva;Green;XL;23.9;1...                               
3  4;Soft Ribbed Tee;T-Shirts;Tiva;White;S;60;34....                               
4  5;Soft Wrap Trousers;Pants;Tiva;Blue;M;36.84;1...                               
Index(['product_id;product_name;category;brand;color;size;catalog_price;cost_price;gender'], dtype='object')


In [5]:
products['Product_ID'].nunique()

KeyError: 'Product_ID'

In [6]:
products['product_id'].nunique()

KeyError: 'product_id'

In [7]:
print(products.columns.tolist())

['product_id;product_name;category;brand;color;size;catalog_price;cost_price;gender']


In [8]:
products['product_id'].nunique()

KeyError: 'product_id'

In [9]:
import pandas as pd

# Vuelve a cargar el CSV con el separador correcto
products = pd.read_csv(
    "dataset_fashion_store_products.csv",
    sep=";",            # <— clave
    encoding="utf-8",   # o "utf-8-sig" si ves caracteres raros
)

# Limpia posibles espacios y estandariza nombres (opcional)
products.columns = products.columns.str.strip()

# Comprueba
print(products.columns.tolist())
print(products.head())

['product_id', 'product_name', 'category', 'brand', 'color', 'size', 'catalog_price', 'cost_price', 'gender']
   product_id        product_name  category brand  color size  catalog_price  \
0           1     Soft Wrap Dress   Dresses  Tiva  Green    S          40.41   
1           2       Soft Wrap Tee  T-Shirts  Tiva  White    S          78.45   
2           3      Soft Linen Tee  T-Shirts  Tiva  Green   XL          23.90   
3           4     Soft Ribbed Tee  T-Shirts  Tiva  White    S          60.00   
4           5  Soft Wrap Trousers     Pants  Tiva   Blue    M          36.84   

   cost_price  gender  
0       20.70  Female  
1       53.76  Female  
2       14.81  Female  
3       34.78  Female  
4       16.46  Female  


In [10]:
products['product_id'].nunique()
products['category'].nunique()
products['category'].value_counts()

category
Dresses      109
T-Shirts     108
Sleepwear    104
Shoes        100
Pants         79
Name: count, dtype: int64

In [11]:
import pandas as pd
import numpy as np

# ---------- 1) Cargar todos los CSV (auto-detecta separador) ----------
def read_auto(path):
    return pd.read_csv(path, sep=None, engine="python")

products   = read_auto("dataset_fashion_store_products.csv")
sales      = read_auto("dataset_fashion_store_sales.csv")
salesitems = read_auto("dataset_fashion_store_salesitems.csv")
customers  = read_auto("dataset_fashion_store_customers.csv")
channels   = read_auto("dataset_fashion_store_channels.csv")
campaigns  = read_auto("dataset_fashion_store_campaigns.csv")
stock      = read_auto("dataset_fashion_store_stock.csv")

# Limpia nombres de columnas (espacios, etc.)
for df in [products, sales, salesitems, customers, channels, campaigns, stock]:
    df.columns = df.columns.str.strip()

# ---------- 2) Normalizar tipos mínimos útiles ----------
# Fechas
for df, col, fmt in [
    (salesitems, "sale_date", None),
    (sales,      "sale_date", None),
    (customers,  "signup_date", None),
]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# Numéricos en salesitems
for col in ["unit_price", "original_price", "discount_applied", "discount_percent", "quantity", "item_total"]:
    if col in salesitems.columns:
        salesitems[col] = pd.to_numeric(salesitems[col], errors="coerce")

# Si no existe el descuento calculado, créalo
if "discount_percent_calc" not in salesitems.columns:
    salesitems["discount_percent_calc"] = np.where(
        salesitems["original_price"].gt(0),
        (salesitems["original_price"] - salesitems["unit_price"]) / salesitems["original_price"] * 100,
        np.nan
    )

# line_total (por si no existe o viene mal)
if "line_total" not in salesitems.columns:
    salesitems["line_total"] = salesitems["unit_price"] * salesitems["quantity"]

# ---------- 3) Métricas por tabla ----------
def exists(df, col): 
    return col in df.columns

print("=== PRODUCTS ===")
if exists(products, "product_id"):
    print("Unique products:", products["product_id"].nunique())
if exists(products, "category"):
    print("Unique categories:", products["category"].nunique())
    print("Top categories:\n", products["category"].value_counts().head(5), "\n")

print("=== CUSTOMERS ===")
if exists(customers, "country"):
    print("Unique countries:", customers["country"].nunique())
    print("Top countries (customers):\n", customers["country"].value_counts().head(5), "\n")

print("=== SALES (headers) ===")
if exists(sales, "sale_date"):
    print("Sales date range:", sales["sale_date"].min(), "→", sales["sale_date"].max(), "\n")

print("=== SALESITEMS — Mathematical Checks ===")
# Rango temporal desde salesitems si existe
if exists(salesitems, "sale_date"):
    print("Items date range:", salesitems["sale_date"].min(), "→", salesitems["sale_date"].max())

# Unit price
if exists(salesitems, "unit_price"):
    print("Unit price  min/mean/max:", 
          round(salesitems["unit_price"].min(),2),
          round(salesitems["unit_price"].mean(),2),
          round(salesitems["unit_price"].max(),2))

# Quantity
if exists(salesitems, "quantity"):
    print("Quantity     min/mean/max:", 
          salesitems["quantity"].min(),
          round(salesitems["quantity"].mean(),2),
          salesitems["quantity"].max())

# Discount (calc si está, si no raw)
disc_col = "discount_percent_calc" if exists(salesitems, "discount_percent_calc") else ("discount_percent" if exists(salesitems, "discount_percent") else None)
if disc_col:
    print(f"{disc_col}  min/mean/max:",
          round(salesitems[disc_col].min(skipna=True),2),
          round(salesitems[disc_col].mean(skipna=True),2),
          round(salesitems[disc_col].max(skipna=True),2))

# Revenue
if exists(salesitems, "line_total"):
    print("Revenue  sum/avg/max:",
          f"{salesitems['line_total'].sum():,.2f}",
          f"{salesitems['line_total'].mean():,.2f}",
          f"{salesitems['line_total'].max():,.2f}")

print()

# ---------- 4) Outliers por percentiles (salesitems) ----------
# Precios
if exists(salesitems, "unit_price"):
    p01_price = salesitems["unit_price"].quantile(0.01)
    p99_price = salesitems["unit_price"].quantile(0.99)
    price_outliers = salesitems[(salesitems["unit_price"] < p01_price) | (salesitems["unit_price"] > p99_price)]
    print(f"Unit price outliers (1–99%): {len(price_outliers)}  | P01={p01_price:.2f}  P99={p99_price:.2f}")
    price_outliers.to_csv("outliers_unit_price.csv", index=False)

# Descuentos
if disc_col:
    p01_disc = salesitems[disc_col].quantile(0.01)
    p99_disc = salesitems[disc_col].quantile(0.99)
    disc_outliers = salesitems[(salesitems[disc_col] < p01_disc) | (salesitems[disc_col] > p99_disc)]
    print(f"{disc_col} outliers (1–99%): {len(disc_outliers)}  | P01={p01_disc:.2f}  P99={p99_disc:.2f}")
    disc_outliers.to_csv("outliers_discount.csv", index=False)

print("\nExported: outliers_unit_price.csv", ("and outliers_discount.csv" if disc_col else ""))

=== PRODUCTS ===
Unique products: 500
Unique categories: 5
Top categories:
 category
Dresses      109
T-Shirts     108
Sleepwear    104
Shoes        100
Pants         79
Name: count, dtype: int64 

=== CUSTOMERS ===
Unique countries: 6
Top countries (customers):
 country
France         221
Germany        212
Italy          192
Netherlands    162
Spain          143
Name: count, dtype: int64 

=== SALES (headers) ===
Sales date range: 2025-04-04 00:00:00 → 2025-06-17 00:00:00 

=== SALESITEMS — Mathematical Checks ===
Items date range: 2025-04-04 00:00:00 → 2025-06-17 00:00:00
Unit price  min/mean/max: 11.68 48.14 85.9
Quantity     min/mean/max: 1 2.98 5
discount_percent_calc  min/mean/max: 0.0 2.4 30.02
Revenue  sum/avg/max: 324,236.87 143.91 403.80

Unit price outliers (1–99%): 46  | P01=20.57  P99=79.84
discount_percent_calc outliers (1–99%): 23  | P01=0.00  P99=30.01

Exported: outliers_unit_price.csv and outliers_discount.csv


  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")
  df[col] = pd.to_datetime(df[col], errors="coerce")


In [12]:
import pandas as pd
import numpy as np

# ---------- 1) Cargar master_clean ----------
df = pd.read_csv("master_clean.csv", sep=None, engine="python")

# Limpia columnas por si tienen espacios
df.columns = df.columns.str.strip()

# ---------- 2) Parsear fechas ----------
if "sale_date" in df.columns:
    df["sale_date"] = pd.to_datetime(df["sale_date"], errors="coerce")

# ---------- 3) Helpers ----------
def exists(col): 
    return col in df.columns

# ---------- 4) Dataset overview ----------
rows = len(df)
min_date = df["sale_date"].min() if exists("sale_date") else None
max_date = df["sale_date"].max() if exists("sale_date") else None

num_products   = df["product_id"].nunique() if exists("product_id") else None
num_categories = df["category"].nunique()   if exists("category") else None
num_countries  = df["country"].nunique()    if exists("country") else None

print("=== DATASET OVERVIEW (master_clean) ===")
print(f"Rows: {rows}")
print(f"Date range: {min_date.date() if pd.notna(min_date) else None} → {max_date.date() if pd.notna(max_date) else None}")
print(f"Unique products: {num_products}")
print(f"Unique categories: {num_categories}")
print(f"Unique countries: {num_countries}\n")

# ---------- 5) Unit price ----------
if exists("unit_price"):
    unit_price_min = df["unit_price"].min()
    unit_price_max = df["unit_price"].max()
    unit_price_mean = df["unit_price"].mean()
    print("=== UNIT PRICE ===")
    print(f"min={unit_price_min:.2f} | max={unit_price_max:.2f} | mean={unit_price_mean:.2f}\n")

# ---------- 6) Discount ----------
disc_col = "discount_percent_calc" if exists("discount_percent_calc") else ("discount_percent" if exists("discount_percent") else None)
if disc_col:
    disc_min  = df[disc_col].min()
    disc_max  = df[disc_col].max()
    disc_mean = df[disc_col].mean()
    p01_disc = df[disc_col].quantile(0.01)
    p99_disc = df[disc_col].quantile(0.99)
    disc_outliers = df[(df[disc_col] < p01_disc) | (df[disc_col] > p99_disc)]
    disc_outliers_n = len(disc_outliers)

    print(f"=== DISCOUNT ({disc_col}) ===")
    print(f"min={disc_min:.2f} | max={disc_max:.2f} | mean={disc_mean:.2f}")
    print(f"P01={p01_disc:.2f} | P99={p99_disc:.2f} | outliers={disc_outliers_n}\n")
else:
    print("No discount column found.\n")

# ---------- 7) Quantity ----------
if exists("quantity"):
    qty_min = df["quantity"].min()
    qty_max = df["quantity"].max()
    qty_mean = df["quantity"].mean()
    print("=== QUANTITY ===")
    print(f"min={qty_min} | max={qty_max} | mean={qty_mean:.2f}\n")

# ---------- 8) Revenue ----------
if exists("line_total"):
    revenue_sum  = df["line_total"].sum()
    revenue_mean = df["line_total"].mean()
    revenue_max  = df["line_total"].max()
    print("=== REVENUE ===")
    print(f"sum={revenue_sum:,.2f} | avg per line={revenue_mean:,.2f} | max line={revenue_max:,.2f}\n")

# ---------- 9) Top Countries ----------
if exists("country"):
    top_countries = df["country"].value_counts().head(5)
    print("=== TOP COUNTRIES (by rows) ===")
    print(top_countries)
    print()

# ---------- 10) Outliers Export ----------
# Unit Price outliers
if exists("unit_price"):
    p01_price = df["unit_price"].quantile(0.01)
    p99_price = df["unit_price"].quantile(0.99)
    price_outliers = df[(df["unit_price"] < p01_price) | (df["unit_price"] > p99_price)]
    price_outliers.to_csv("outliers_unit_price_master.csv", index=False)
    print(f"Exported unit price outliers: {len(price_outliers)} rows")

# Discount outliers
if disc_col:
    disc_outliers.to_csv("outliers_discount_master.csv", index=False)
    print(f"Exported discount outliers: {len(disc_outliers)} rows")

print("\n✅ Done — Data checks + outliers exported from master_clean.csv")

=== DATASET OVERVIEW (master_clean) ===
Rows: 2193
Date range: 2025-04-04 → 2025-06-17
Unique products: 491
Unique categories: 5
Unique countries: 6

=== UNIT PRICE ===
min=20.29 | max=80.00 | mean=48.22

=== DISCOUNT (discount_percent_calc) ===
min=0.00 | max=30.01 | mean=2.09
P01=0.00 | P99=30.00 | outliers=22

=== QUANTITY ===
min=1 | max=5 | mean=2.99

=== REVENUE ===
sum=316,382.55 | avg per line=144.27 | max line=400.00

=== TOP COUNTRIES (by rows) ===
country
GERMANY        522
FRANCE         477
ITALY          400
NETHERLANDS    324
SPAIN          271
Name: count, dtype: int64

Exported unit price outliers: 44 rows
Exported discount outliers: 22 rows

✅ Done — Data checks + outliers exported from master_clean.csv
