In [1]:
import pandas as pd
import numpy as np

# Show up to 100 columns when a DataFrame is rendered, so wide frames are not truncated.
pd.options.display.max_columns = 100

In [8]:
# Read the bike sales workbook once; df_raw keeps the data as loaded and
# df_clean is the independent copy that the cleaning steps modify.
BIKESALES_PATH = "./datasets/bikesales.xlsx"
df_raw = pd.read_excel(BIKESALES_PATH)
df_clean = df_raw.copy(deep=True)

## 1. Reglas de limpieza

### 1.1 Normalización de nombres de columnas (Log 3.1)

Acción:
Normalizar nombres de columnas a estilo snake_case.

### 1.2 Columnas sin valor analítico (Log 3.2)

Acción:
Eliminar columnas **day**, **month**, **year**, **product_category** y **sub_category**

### 1.3 Extraer atributos de product_description

Acción: 
La columna **product_description** contiene atributos de talla y color que conviene separar en **frame_size** y **product_color** para labores de business analytics

### 1.4 Valores nulos (Log 3.3)

##### Nulos en age_group

Acción:
Reconstruir a partir de **customer_age**

##### Nulos en order_quantity

Acción: 
Reconstruir a partir de **unit_cost** y **cost**

##### Nulos en product_description

Acción: 
Reconstruir a partir de **unit_cost**

### 1.5 Valores duplicados (Log 3.4)

Acción:
Crear un identificador único para cada fila que comparta un mismo **sales_order**

### 1.6 Problemas categóricos (Log 3.5)

Acción: 
Normalizar valores categóricos aplicando str.strip(), str.lower(), colapso de espacios y eliminación de caracteres no estándar

### 1.7 Valores imposibles (Log 3.6)

#### Registros con cost = 0 y unit_cost = 0

Acción: inferir el valor correcto como:

 **cost** = **revenue** - **profit** 

 **unit_cost** = **cost** / **order_quantity**

#### Registros con revenue = 0 y unit_price = 0

Acción: inferir el valor correcto como:

 **revenue** = **cost** + **profit** 

 **unit_price** = **revenue** / **order_quantity**

## Limpieza de datos

In [10]:
# Normalize the column labels to snake_case (rule 1.1): trim, lowercase,
# turn spaces into underscores, drop every character that is not a letter,
# digit or underscore, and finally trim leading/trailing underscores.
snake_case_labels = df_clean.columns.str.strip().str.lower()
snake_case_labels = snake_case_labels.str.replace(" ","_")
snake_case_labels = snake_case_labels.str.replace(r"[^0-9a-zA-Z_]+","", regex = True)
snake_case_labels = snake_case_labels.str.strip("_")

# Both frames receive the same labels, so df_raw columns can also be
# addressed by their normalized names.
df_clean.columns = snake_case_labels
df_raw.columns = snake_case_labels

In [11]:
# Remove the columns listed in rule 1.2 with a single drop call.
# Reassigning the result (instead of five inplace=True drops) together with
# errors="ignore" makes the cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
COLUMNS_WITHOUT_ANALYTIC_VALUE = [
    "day",
    "month",
    "year",
    "product_category",
    "sub_category",
]
df_clean = df_clean.drop(columns=COLUMNS_WITHOUT_ANALYTIC_VALUE, errors="ignore")
df_clean

Unnamed: 0,sales_order,date,customer_age,age_group,customer_gender,country,state,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue
0,261695,2021-12-01,39,Adults (35-64),F,United States,California,"Mountain-200 Black, 46",4.0,1252,2295,4172,5008,9180
1,261695,2021-12-01,44,Adults (35-64),M,United Kingdom,England,"Mountain-200 Silver, 42",1.0,1266,2320,1054,1266,2320
2,261697,2021-12-02,37,Adults (35-64),M,United States,California,"Mountain-400-W Silver, 46",2.0,420,769,698,840,1538
3,261698,2021-12-02,31,Young Adults (25-34),F,Australia,New South Wales,"Mountain-400-W Silver, 42",1.0,420,769,349,420,769
4,261699,2021-12-03,37,Adults (35-64),F,United States,California,"Mountain-200 Black, 46",2.0,0,2295,2086,0,4590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,261778,2021-12-22,41,Adults (35-64),M,Germany,Hessen,"Mountain-200 Silver, 38",1.0,1266,2320,1054,1266,2320
85,261779,2021-12-23,30,Young Adults (25-34),F,United States,Oregon,"Mountain-200 Silver, 42",1.0,1266,2320,1054,1266,2320
86,261780,2021-12-23,31,Young Adults (25-34),F,Canada,British Columbia,"Mountain-200 Black, 42",1.0,1252,2295,1043,1252,2295
87,261781,2021-12-23,35,Adults (35-64),F,United States,California,"Mountain-500 Black, 42",1.0,295,540,245,295,540


In [13]:
# Split product_description ("<model> <color>, <size>") into three columns.
#
# Every value is derived from df_raw.product_description and never from
# df_clean.product_description, so the cell yields the same result on a fresh
# "Restart & Run All" and on every re-run. Before this change the color regex
# was applied to text that still ended in ", <size>" (so it matched nothing on
# a fresh run), and re-running the cell kept stripping trailing words from the
# already cleaned descriptions (e.g. "Mountain-400-W" -> "Mountain-400-").
raw_description = df_raw["product_description"]

# Description with the trailing ", <size>" removed, e.g. "Mountain-200 Black".
description_without_size = raw_description.str.replace(
    r",\s*\d+\s*$", "", regex=True
)

# expand=False returns a Series for the single capture group instead of a
# one-column DataFrame.
df_clean["frame_size"] = raw_description.str.extract(
    r",\s*(\d+)\s*$", expand=False
)

# The color is the last purely alphabetic word. Requiring whitespace in front
# of it keeps a bare model name such as "Mountain-400-W" from being split.
df_clean["product_color"] = description_without_size.str.extract(
    r"\s+([a-zA-Z]+)\s*$", expand=False
)

# What remains after removing size and color is the model name.
df_clean["product_description"] = description_without_size.str.replace(
    r"\s+[a-zA-Z]+\s*$", "", regex=True
)


In [14]:
# Render the working frame to check the new frame_size and product_color columns.
df_clean

Unnamed: 0,sales_order,date,customer_age,age_group,customer_gender,country,state,product_description,order_quantity,unit_cost,unit_price,profit,cost,revenue,frame_size,product_color
0,261695,2021-12-01,39,Adults (35-64),F,United States,California,Mountain-200,4.0,1252,2295,4172,5008,9180,46,Black
1,261695,2021-12-01,44,Adults (35-64),M,United Kingdom,England,Mountain-200,1.0,1266,2320,1054,1266,2320,42,Silver
2,261697,2021-12-02,37,Adults (35-64),M,United States,California,Mountain-400-W,2.0,420,769,698,840,1538,46,Silver
3,261698,2021-12-02,31,Young Adults (25-34),F,Australia,New South Wales,Mountain-400-W,1.0,420,769,349,420,769,42,Silver
4,261699,2021-12-03,37,Adults (35-64),F,United States,California,Mountain-200,2.0,0,2295,2086,0,4590,46,Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,261778,2021-12-22,41,Adults (35-64),M,Germany,Hessen,Mountain-200,1.0,1266,2320,1054,1266,2320,38,Silver
85,261779,2021-12-23,30,Young Adults (25-34),F,United States,Oregon,Mountain-200,1.0,1266,2320,1054,1266,2320,42,Silver
86,261780,2021-12-23,31,Young Adults (25-34),F,Canada,British Columbia,Mountain-200,1.0,1252,2295,1043,1252,2295,42,Black
87,261781,2021-12-23,35,Adults (35-64),F,United States,California,Mountain-500,1.0,295,540,245,295,540,42,Black
