## Imports

In [2]:
import numpy as np
import pandas as pd
from helpers import inspect_categories, classify_age

## 1. Carga de datos

In [23]:

# Load the bike sales workbook into a DataFrame (path is relative to the working directory)
df = pd.read_excel("../datasets/bikesales.xlsx")

### 1.1 Normalizamos datos para facilitar exploración


In [4]:
# Move every column to pandas' NA-aware extension dtypes (Int64, string, ...)
df = df.convert_dtypes()

# Derive snake_case column labels one transformation at a time
column_labels = df.columns.str.strip().str.lower()
column_labels = column_labels.str.replace(" ", "_")  # spaces become underscores
column_labels = column_labels.str.replace(r"[^0-9a-zA-Z_]+", "", regex=True)  # drop any other symbol
column_labels = column_labels.str.replace(r"_+$", "", regex=True)  # trim trailing underscores
df.columns = column_labels

## 2. EDA inicial

### 2.1 información general

In [5]:
df.info() # overall structure: column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   sales_order          89 non-null     Int64         
 1   date                 89 non-null     datetime64[ns]
 2   day                  88 non-null     Int64         
 3   month                89 non-null     string        
 4   year                 89 non-null     Int64         
 5   customer_age         89 non-null     Int64         
 6   age_group            88 non-null     string        
 7   customer_gender      89 non-null     string        
 8   country              89 non-null     string        
 9   state                89 non-null     string        
 10  product_category     89 non-null     string        
 11  sub_category         89 non-null     string        
 12  product_description  88 non-null     string        
 13  order_quantity       88 non-null     

In [6]:
# Count the rows whose stored `day` agrees with the day-of-month taken from `date`
df["day"].eq(df["date"].dt.day).sum()

np.int64(88)

In [7]:
df.shape # (rows, columns) of the frame

(89, 19)

In [8]:
df.columns # column labels after the normalization step

Index(['sales_order', 'date', 'day', 'month', 'year', 'customer_age',
       'age_group', 'customer_gender', 'country', 'state', 'product_category',
       'sub_category', 'product_description', 'order_quantity', 'unit_cost',
       'unit_price', 'profit', 'cost', 'revenue'],
      dtype='object')

In [9]:
df.head() # first rows of the frame

Unnamed: 0,sales_order,date,day,month,year,customer_age,age_group,customer_gender,country,state,product_category,sub_category,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue
0,261695,2021-12-01,1,December,2021,39,Adults (35-64),F,United States,California,Bikes,Mountain Bikes,"Mountain-200 Black, 46",4,1252,2295,4172,5008,9180
1,261695,2021-12-01,1,December,2021,44,Adults (35-64),M,United Kingdom,England,Bikes,Mountain Bikes,"Mountain-200 Silver, 42",1,1266,2320,1054,1266,2320
2,261697,2021-12-02,2,December,2021,37,Adults (35-64),M,United States,California,Bikes,Mountain Bikes,"Mountain-400-W Silver, 46",2,420,769,698,840,1538
3,261698,2021-12-02,2,December,2021,31,Young Adults (25-34),F,Australia,New South Wales,Bikes,Mountain Bikes,"Mountain-400-W Silver, 42",1,420,769,349,420,769
4,261699,2021-12-03,3,December,2021,37,Adults (35-64),F,United States,California,Bikes,Mountain Bikes,"Mountain-200 Black, 46",2,0,2295,2086,0,4590


In [10]:
df.sample(5, random_state=0) # random sample of 5 rows; the fixed seed keeps it reproducible

Unnamed: 0,sales_order,date,day,month,year,customer_age,age_group,customer_gender,country,state,product_category,sub_category,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue
2,261697,2021-12-02,2,December,2021,37,Adults (35-64),M,United States,California,Bikes,Mountain Bikes,"Mountain-400-W Silver, 46",2,420,769,698,840,1538
13,261707,2021-12-06,6,December,2021,23,Youth (<25),M,United Kingdom,England,Bikes,Mountain Bikes,"Mountain-400-W Silver, 46",3,420,769,1047,1260,2307
53,261747,2021-12-17,17,December,2021,31,Young Adults (25-34),M,Australia,New South Wales,Bikes,Mountain Bikes,"Mountain-400-W Silver, 42",1,420,769,349,420,769
41,261735,2021-12-13,13,December,2021,32,Young Adults (25-34),F,Australia,Queensland,Bikes,Mountain Bikes,"Mountain-200 Silver, 42",3,1266,2320,3162,3798,6960
66,261760,2021-12-19,19,December,2021,37,Adults (35-64),M,United States,Oregon,Bikes,Mountain Bikes,"Mountain-200 Black, 38",4,1252,2295,4172,5008,9180


In [11]:
df.describe() # summary statistics for the numeric and datetime columns


Unnamed: 0,sales_order,date,day,year,customer_age,order_quantity,unit_cost,unit_price,profit,cost,revenue
count,89.0,89,88.0,89.0,89.0,88.0,89.0,89.0,89.0,89.0,89.0
mean,261738.067416,2021-12-13 10:31:00.674157312,13.534091,2021.0,34.202247,2.125,1052.247191,1940.94382,1848.88764,2204.235955,4043.94382
min,261695.0,2021-12-01 00:00:00,1.0,2021.0,17.0,1.0,0.0,0.0,245.0,0.0,0.0
25%,261716.0,2021-12-08 00:00:00,8.75,2021.0,30.0,1.0,420.0,2295.0,1043.0,1252.0,2295.0
50%,261738.0,2021-12-13 00:00:00,13.5,2021.0,35.0,2.0,1252.0,2295.0,1054.0,1266.0,2320.0
75%,261760.0,2021-12-19 00:00:00,19.0,2021.0,38.0,3.0,1266.0,2320.0,2954.0,3756.0,6750.0
max,261782.0,2021-12-24 00:00:00,24.0,2021.0,63.0,4.0,1912.0,3400.0,5908.0,7592.0,13500.0
std,25.729713,,6.391353,0.0,8.090651,1.239322,440.499001,789.955743,1394.401162,1730.330926,3139.75858


### 2.2 Exploración de nulos y duplicados

In [12]:
df.isna().sum().sort_values(ascending=False) # null count per column, highest first

day                    1
age_group              1
order_quantity         1
product_description    1
sales_order            0
sub_category           0
cost                   0
profit                 0
unit_price             0
unit_cost              0
state                  0
product_category       0
date                   0
country                0
customer_gender        0
customer_age           0
year                   0
month                  0
revenue                0
dtype: int64

In [13]:
df.duplicated().sum() # number of rows that exactly repeat an earlier row

np.int64(0)

In [14]:
# Every row whose order number occurs more than once (keep=False flags all occurrences)
df.loc[df["sales_order"].duplicated(keep=False)]

Unnamed: 0,sales_order,date,day,month,year,customer_age,age_group,customer_gender,country,state,product_category,sub_category,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue
0,261695,2021-12-01,1,December,2021,39,Adults (35-64),F,United States,California,Bikes,Mountain Bikes,"Mountain-200 Black, 46",4,1252,2295,4172,5008,9180
1,261695,2021-12-01,1,December,2021,44,Adults (35-64),M,United Kingdom,England,Bikes,Mountain Bikes,"Mountain-200 Silver, 42",1,1266,2320,1054,1266,2320
6,261701,2021-12-03,3,December,2021,37,Adults (35-64),M,United States,Washington,Bikes,Mountain Bikes,"Mountain-200 Black, 46",1,1252,2295,1043,1252,2295
7,261701,2021-12-03,3,December,2021,37,Adults (35-64),M,United States,Washington,Bikes,Mountain Bikes,"Mountain-200 Black, 46",1,1252,2295,1043,1252,2295


### 2.3 Exploración de categóricos

In [15]:
# Gather the string-typed columns, then list the distinct values found in each one
cat_cols = df.select_dtypes(include="string").columns

for col in cat_cols:
    print(f"\n=== {col} ===")
    distinct_values = sorted(df[col].dropna().unique())
    # Only the first 30 sorted values are printed to keep the output short
    print(distinct_values[:30])


=== month ===
['December', 'Decmber']

=== age_group ===
['Adults (35-64)', 'Young Adults (25-34)', 'Youth (<25)']

=== customer_gender ===
['F', 'M']

=== country ===
[' United States', 'Australia', 'Canada', 'France', 'Germany', 'United  States', 'United Kingdom', 'United States', 'United States ']

=== state ===
['British Columbia', 'California', 'England', 'Hamburg', 'Hessen', 'New South Wales', 'Nord', 'Nordrhein-Westfalen', 'Oregon', 'Queensland', 'Seine (Paris)', 'Seine Saint Denis', 'Seine et Marne', 'Somme', 'South Australia', 'Victoria', 'Washington']

=== product_category ===
['Bikes']

=== sub_category ===
['Mountain Bikes']

=== product_description ===
['Mountain-100 Black, 38', 'Mountain-100 Black, 48', 'Mountain-100 Silver, 44', 'Mountain-200 Black, 38', 'Mountain-200 Black, 42', 'Mountain-200 Black, 46', 'Mountain-200 Silver, 38', 'Mountain-200 Silver, 42', 'Mountain-200 Silver, 46', 'Mountain-400-W Silver, 38', 'Mountain-400-W Silver, 42', 'Mountain-400-W Silver, 46',

In [16]:
inspect_categories(df, 'country') # project helper: reports original vs. clean `country` labels and flags conflicting variants

Hay inconsistencias entre original y clean (variantes sucias).
Grupos conflictivos: 1

Pares original/clean problemáticos:



Unnamed: 0,original,clean
0,United States,united states
2,United States,united states
6,United States,united states


Unnamed: 0,original,clean
3,Australia,australia
14,Canada,canada
31,France,france
10,Germany,germany
4,United States,united states
1,United Kingdom,united kingdom
0,United States,united states
2,United States,united states
6,United States,united states


In [17]:
inspect_categories(df, 'state') # project helper: reports original vs. clean `state` labels and flags conflicting variants

No se detectan inconsistencias aparentes.


Unnamed: 0,original,clean
14,British Columbia,british columbia
0,California,california
1,England,england
24,Hamburg,hamburg
70,Hessen,hessen
3,New South Wales,new south wales
62,Nord,nord
10,Nordrhein-Westfalen,nordrhein-westfalen
21,Oregon,oregon
11,Queensland,queensland


In [18]:
inspect_categories(df, "product_description") # project helper: reports original vs. clean `product_description` labels

No se detectan inconsistencias aparentes.


Unnamed: 0,original,clean
23,"Mountain-100 Black, 38","mountain-100 black, 38"
65,"Mountain-100 Black, 48","mountain-100 black, 48"
29,"Mountain-100 Silver, 44","mountain-100 silver, 44"
5,"Mountain-200 Black, 38","mountain-200 black, 38"
15,"Mountain-200 Black, 42","mountain-200 black, 42"
0,"Mountain-200 Black, 46","mountain-200 black, 46"
11,"Mountain-200 Silver, 38","mountain-200 silver, 38"
1,"Mountain-200 Silver, 42","mountain-200 silver, 42"
51,"Mountain-200 Silver, 46","mountain-200 silver, 46"
17,"Mountain-400-W Silver, 38","mountain-400-w silver, 38"


### 2.4 Coherencia entre columnas

In [19]:
# Check age_group consistency: recompute the bucket from customer_age with the
# project helper and compare it against the stored label
df["age_group_expected"] = df["customer_age"].apply(classify_age).astype("string")
df["age_group_match"] = df["age_group"].eq(df["age_group_expected"])
# dropna=False keeps the <NA> outcomes (rows that could not be compared) in the tally
df["age_group_match"].value_counts(dropna=False)

age_group_match
True    88
<NA>     1
Name: count, dtype: Int64

In [20]:
# Check profit consistency: profit should equal (unit_price - unit_cost) * order_quantity
df["profit_match"] = df["profit"] == (df["unit_price"] - df["unit_cost"]) * df["order_quantity"]

# A notebook cell only renders its last expression, so the summary has to be
# displayed explicitly; otherwise the True/False/<NA> counts are computed and discarded.
display(df["profit_match"].value_counts(dropna=False))

# Rows whose stored profit contradicts the formula. Rows where the comparison is
# <NA> (a missing operand) are not selected here; they only appear in the counts above.
df[df["profit_match"].eq(False)]

Unnamed: 0,sales_order,date,day,month,year,customer_age,age_group,customer_gender,country,state,...,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue,age_group_expected,age_group_match,profit_match
4,261699,2021-12-03,3,December,2021,37,Adults (35-64),F,United States,California,...,"Mountain-200 Black, 46",2,0,2295,2086,0,4590,Adults (35-64),True,False
8,261702,2021-12-04,4,December,2021,31,Young Adults (25-34),F,Australia,New South Wales,...,"Mountain-400-W Silver, 42",4,420,0,1396,1680,0,Young Adults (25-34),True,False


In [21]:
# Check cost consistency: cost should equal unit_cost * order_quantity
df["cost_match"] = df["cost"].eq(df["unit_cost"].mul(df["order_quantity"]))
# dropna=False keeps the <NA> outcomes (rows that could not be compared) in the tally
df["cost_match"].value_counts(dropna=False)

cost_match
True    88
<NA>     1
Name: count, dtype: Int64

In [22]:
# Check revenue consistency: revenue should equal unit_price * order_quantity
df["revenue_match"] = df["revenue"].eq(df["unit_price"].mul(df["order_quantity"]))
# dropna=False keeps the <NA> outcomes (rows that could not be compared) in the tally
df["revenue_match"].value_counts(dropna=False)

revenue_match
True    88
<NA>     1
Name: count, dtype: Int64

## 3. Data quality issue log

#### 3.1 Columnas presentan ruido textual
#### 3.2 Columnas sin valor analítico
- Las columnas **day**, **month** y **year** derivan de date y son redundantes
- **product_category** y **sub_category** no aportan variabilidad analítica
#### 3.3 Columnas que contienen valores nulos
- **day** 
- **age_group** 
- **order_quantity**
- **product_description**
#### 3.4 duplicados
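- El **sales_order** 261701 aparece en dos filas con valores aparentemente idénticos (filas 6 y 7), aunque `df.duplicated()` no las marca como duplicado exacto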
- El **sales_order** 261695 corresponde a dos transacciones distintas (filas 0 y 1)
#### 3.5 Problemas categóricos
- **month** tiene un valor escrito incorrectamente ('Decmber' en lugar de 'December').
- **country** contiene espacios sobrantes (iniciales, finales y dobles) que generan variantes de 'United States'.
#### 3.6 valores imposibles
- En una transacción figuran **cost** y **unit_cost** = 0
- En una transacción figuran **unit_price** y **revenue** = 0
