W2.2_Normalizaci√≥n_de_Datos

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# üåç URL del dataset de casos confirmados de COVID-19 en el mundo
url_covid = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

# üß© Cargando el archivo sin forzar tipos de datos
df_covid = pd.read_csv(url_covid)

# üîé Exploraci√≥n inicial del dataset
print("üîπ Dimensiones:", df_covid.shape)
print("üîπ Tipos de datos:\n", df_covid.dtypes)

# üëÄ Vista r√°pida del contenido
print("\nüîπ Primeras filas:")
display(df_covid.head())

# ‚ö†Ô∏è Revisi√≥n de valores faltantes
print("\nüîπ Nulos por columna:")
display(df_covid.isnull().sum())


üîπ Dimensiones: (289, 1147)
üîπ Tipos de datos:
 Province/State     object
Country/Region     object
Lat               float64
Long              float64
1/22/20             int64
                   ...   
3/5/23              int64
3/6/23              int64
3/7/23              int64
3/8/23              int64
3/9/23              int64
Length: 1147, dtype: object

üîπ Primeras filas:


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288



üîπ Nulos por columna:


Unnamed: 0,0
Province/State,198
Country/Region,0
Lat,2
Long,2
1/22/20,0
...,...
3/5/23,0
3/6/23,0
3/7/23,0
3/8/23,0


In [3]:
# üè∑Ô∏è Renombrando columnas para que tengan nombres m√°s limpios y consistentes
df_covid.rename(columns={
    'Province/State': 'province_state',
    'Country/Region': 'country_region',
    'Lat': 'lat',
    'Long': 'long'
}, inplace=True)

# üëÅÔ∏è Mostrando una vista previa del resultado
display(df_covid.head())


Unnamed: 0,province_state,country_region,lat,long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288


In [4]:
# üóìÔ∏è Detectando las columnas que representan fechas
date_cols = df_covid.columns[4:]

# üîÑ Reorganizando el dataset a un formato m√°s ordenado (de ancho a largo)
df_long = df_covid.melt(
    id_vars=['province_state', 'country_region', 'lat', 'long'],
    value_vars=date_cols,
    var_name='date',
    value_name='confirmed'
)

# ‚è∞ Transformando las fechas al formato de tiempo real
df_long['date'] = pd.to_datetime(df_long['date'], format='%m/%d/%y')

# üåü Confirmando que todo sali√≥ bien y mostrando una vista r√°pida
print("‚úÖ Formato largo generado correctamente:")
display(df_long.head())


‚úÖ Formato largo generado correctamente:


Unnamed: 0,province_state,country_region,lat,long,date,confirmed
0,,Afghanistan,33.93911,67.709953,2020-01-22,0
1,,Albania,41.1533,20.1683,2020-01-22,0
2,,Algeria,28.0339,1.6596,2020-01-22,0
3,,Andorra,42.5063,1.5218,2020-01-22,0
4,,Angola,-11.2027,17.8739,2020-01-22,0


In [5]:
# üß≠ Asegurando que las coordenadas sean valores num√©ricos v√°lidos
df_long['lat'] = df_long['lat'].astype(float)
df_long['long'] = df_long['long'].astype(float)

# üíâ Verificando los datos de casos confirmados
# üîπ Convertimos a n√∫meros enteros
# üîπ Eliminamos valores imposibles (negativos)
df_long['confirmed'] = pd.to_numeric(df_long['confirmed'], errors='coerce')
df_long.loc[df_long['confirmed'] < 0, 'confirmed'] = np.nan

# üåø Rellenamos los vac√≠os con ceros, solo cuando tiene sentido
df_long['confirmed'] = df_long['confirmed'].fillna(0)

# üßæ Revisando los tipos de datos finales
print(df_long.dtypes)


province_state            object
country_region            object
lat                      float64
long                     float64
date              datetime64[ns]
confirmed                float64
dtype: object


In [6]:
# Instalaci√≥n y carga de la librer√≠a pycountry para obtener c√≥digos ISO-3
!pip install pycountry
import pycountry

# Funci√≥n para convertir el nombre del pa√≠s en su c√≥digo ISO de tres letras
def get_iso3(country):
    try:
        return pycountry.countries.lookup(country).alpha_3
    except:
        return None

# Aplicar la funci√≥n a todos los registros del dataset
df_long['iso3'] = df_long['country_region'].apply(get_iso3)

# Mostrar algunos ejemplos de pa√≠ses con su respectivo c√≥digo ISO-3
display(df_long[['country_region', 'iso3']].drop_duplicates().head(10))



Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/6.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m4.8/6.3 MB[0m [31m143.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m6.3/6.3 MB[0m [31m147.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.3/6.3 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
S

Unnamed: 0,country_region,iso3
0,Afghanistan,AFG
1,Albania,ALB
2,Algeria,DZA
3,Andorra,AND
4,Angola,AGO
5,Antarctica,ATA
6,Antigua and Barbuda,ATG
7,Argentina,ARG
8,Armenia,ARM
9,Australia,AUS


In [7]:
# Verificaci√≥n de coordenadas geogr√°ficas dentro del rango permitido
valid_lat = df_long['lat'].between(-90, 90)
valid_long = df_long['long'].between(-180, 180)

# Reporte de valores fuera de rango
print("Latitudes fuera de rango:", (~valid_lat).sum())
print("Longitudes fuera de rango:", (~valid_long).sum())

# Filtrado de registros inv√°lidos
df_long = df_long[valid_lat & valid_long]

# Confirmaci√≥n del resultado final
print("Registros finales:", df_long.shape)


Latitudes fuera de rango: 2286
Longitudes fuera de rango: 2286
Registros finales: (328041, 7)


In [8]:
# Cargar el dataset de Chipotle desde la fuente en l√≠nea
url_chipotle = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

# Lectura del archivo con separador de tabulaciones
df_chip = pd.read_csv(url_chipotle, sep='\t')

# Exploraci√≥n inicial del dataset
print("Dimensiones del dataset:", df_chip.shape)
print("Tipos de datos de cada columna:")
print(df_chip.dtypes)

# Vista previa de las primeras filas
print("\nPrimeras filas del dataset:")
display(df_chip.head())

# Revisi√≥n de valores nulos por columna
print("\nValores nulos por columna:")
display(df_chip.isnull().sum())


Dimensiones del dataset: (4622, 5)
Tipos de datos de cada columna:
order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

Primeras filas del dataset:


Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98



Valores nulos por columna:


Unnamed: 0,0
order_id,0
quantity,0
item_name,0
choice_description,1246
item_price,0


In [None]:
# Limpieza de la columna 'item_price': eliminar el s√≠mbolo de d√≥lar y convertir a tipo num√©rico
df_chip['item_price_num'] = df_chip['item_price'].replace('[\$,]', '', regex=True).astype(float)

# Asegurar que 'quantity' sea un n√∫mero entero positivo
df_chip['quantity'] = df_chip['quantity'].astype(int)
df_chip = df_chip[df_chip['quantity'] > 0]

# Crear la columna 'unit_price' (equivalente al precio por unidad)
df_chip['unit_price'] = df_chip['item_price_num']

# Calcular el total por l√≠nea (precio unitario √ó cantidad)
df_chip['line_total'] = round(df_chip['unit_price'] * df_chip['quantity'], 2)

# Mostrar una vista previa de las columnas relevantes
display(df_chip[['item_name', 'item_price', 'quantity', 'line_total']].head())


In [None]:
# Estandarizar los nombres de los productos
# üîπ Convertir a min√∫sculas
# üîπ Eliminar espacios en blanco al inicio y al final
df_chip['item_name_norm'] = df_chip['item_name'].str.lower().str.strip()

# Mostrar algunos ejemplos de los nombres originales y normalizados
display(df_chip[['item_name', 'item_name_norm']].head())


In [10]:
# ==============================
# CARGA DEL DATASET
# ==============================
import pandas as pd
import numpy as np

url_chipotle = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
df_chip = pd.read_csv(url_chipotle, sep='\t')

# Exploraci√≥n inicial
print("Dimensiones del dataset:", df_chip.shape)
print("Tipos de datos:")
print(df_chip.dtypes)
print("\nPrimeras filas:")
display(df_chip.head())
print("\nValores nulos por columna:")
display(df_chip.isnull().sum())

# ==============================
# LIMPIEZA DE PRECIOS Y CANTIDADES
# ==============================
# Limpiar item_price: eliminar $ y convertir a float
df_chip['item_price_num'] = df_chip['item_price'].replace('[\$,]', '', regex=True).astype(float)

# Asegurar que quantity sea un entero positivo
df_chip['quantity'] = df_chip['quantity'].astype(int)
df_chip = df_chip[df_chip['quantity'] > 0]

# Crear columna unit_price
df_chip['unit_price'] = df_chip['item_price_num']

# Calcular total por l√≠nea
df_chip['line_total'] = round(df_chip['unit_price'] * df_chip['quantity'], 2)

# Mostrar algunos resultados
display(df_chip[['item_name', 'item_price', 'quantity', 'line_total']].head())

# ==============================
# NORMALIZAR NOMBRES DE PRODUCTOS
# ==============================
df_chip['item_name_norm'] = df_chip['item_name'].str.lower().str.strip()
display(df_chip[['item_name', 'item_name_norm']].head())

# ==============================
# VALIDACIONES
# ==============================
# Precios negativos y cantidades inv√°lidas
print("Precios negativos:", (df_chip['unit_price'] < 0).sum())
print("Cantidades inv√°lidas:", (df_chip['quantity'] < 1).sum())

# Chequeo de coherencia de precios por producto
unit_price_stats = df_chip.groupby('item_name_norm')['unit_price'].agg(['mean','std']).reset_index()
display(unit_price_stats.head())

# Detecci√≥n de outliers con z-score
df_chip['z_score'] = (df_chip['unit_price'] - df_chip['unit_price'].mean()) / df_chip['unit_price'].std()
outliers = df_chip[np.abs(df_chip['z_score']) > 3]

print("Outliers detectados:", len(outliers))
display(outliers[['item_name', 'unit_price', 'z_score']].head())


Dimensiones del dataset: (4622, 5)
Tipos de datos:
order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

Primeras filas:


  df_chip['item_price_num'] = df_chip['item_price'].replace('[\$,]', '', regex=True).astype(float)


Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98



Valores nulos por columna:


Unnamed: 0,0
order_id,0
quantity,0
item_name,0
choice_description,1246
item_price,0


Unnamed: 0,item_name,item_price,quantity,line_total
0,Chips and Fresh Tomato Salsa,$2.39,1,2.39
1,Izze,$3.39,1,3.39
2,Nantucket Nectar,$3.39,1,3.39
3,Chips and Tomatillo-Green Chili Salsa,$2.39,1,2.39
4,Chicken Bowl,$16.98,2,33.96


Unnamed: 0,item_name,item_name_norm
0,Chips and Fresh Tomato Salsa,chips and fresh tomato salsa
1,Izze,izze
2,Nantucket Nectar,nantucket nectar
3,Chips and Tomatillo-Green Chili Salsa,chips and tomatillo-green chili salsa
4,Chicken Bowl,chicken bowl


Precios negativos: 0
Cantidades inv√°lidas: 0


Unnamed: 0,item_name_norm,mean,std
0,6 pack soft drink,6.610185,0.883177
1,barbacoa bowl,10.187273,1.260994
2,barbacoa burrito,9.832418,1.139519
3,barbacoa crispy tacos,10.928182,2.800739
4,barbacoa salad bowl,10.64,1.317616


Outliers detectados: 47


Unnamed: 0,item_name,unit_price,z_score
135,Chicken Salad Bowl,22.5,3.541506
213,Chicken Salad Bowl,22.5,3.541506
281,Steak Salad Bowl,23.78,3.842997
353,Steak Burrito,22.16,3.461422
409,Chicken Bowl,32.94,6.000547
