In [147]:
import pandas as pd
import numpy as np
import re
from io import StringIO
import pytz


Imagina que eres un investigador en el mundo de Pokémon y has recibido una base de datos con información desactualizada y desordenada sobre avistamientos de Pokémon. Tu misión es limpiar y actualizar esta base de datos para que pueda ser utilizada en un estudio sobre la población de Pokémon en la región.

Datos Iniciales

Los datos iniciales contienen las siguientes columnas:

    SightingDate: Fecha del avistamiento.
    TrainerID: Identificación del entrenador que reportó el avistamiento.
    PokemonName: Nombre del Pokémon avistado.
    CP: Puntos de combate del Pokémon reportado.
    HP: Puntos de salud del Pokémon reportado.
    Type: Tipo del Pokémon.
    Weather: Clima durante el avistamiento.

# 1 Carga de Datos

In [144]:
path_datos = 'pokemon.csv'
# Lee el csv con su respectivo encoding
df = pd.read_csv(path_datos, encoding='ISO-8859-1')
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather
0,2023-11-08T14:00:00+0000,2023-11-08T14:00Z,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12T09:30:00+0100,2023-07-12T08:30Z,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023/02/23T22:15:00+0900,2023-02-23T13:15Z,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30T06:45:00-0400,2023-04-30T10:45Z,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15T16:00:00+1000,,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


# Limpieza de Datos

## 1 Normalizacion de Zonas Horarias

Normaliza la columna `'SightingTimeUTC'` a la zona horaria UTC y convierte `'SightingDate'` al mismo formato de tiempo.

In [145]:
# Convertir la columna 'SightingTimeUTC' al formato deseado
from datetime import datetime
import re
df['SightingTimeUTC'] = pd.to_datetime(df['SightingTimeUTC'].str.replace("Z", "+00:00"), utc=True)
df['SightingDate'] = df['SightingDate'].apply(lambda x: x.replace('T', ' '))
def reemplazar_zona(cadena):
    return re.sub(r'(\d{2})$', ':00', cadena)
df['SightingDate'] = df['SightingDate'].apply(reemplazar_zona)
def convertir_a_formato_deseado(cadena):
    # Utilizar expresiones regulares para extraer componentes de la fecha y la hora
    match = re.match(r'(\d{4})[-/](\d{2})[-/](\d{2}) (\d{2}:\d{2}:\d{2})([+-]\d{2}:\d{2})', cadena)
    if match:
        year, month, day, time, offset = match.groups()
        # Convertir el offset a formato positivo
        offset = f"+{offset[1:]}" if offset.startswith('-') else offset
        # Formatear en el estilo deseado
        return f"{year}-{month}-{day} {time}{offset}"
    else:
        return None
df['SightingDate'] = df['SightingDate'].apply(convertir_a_formato_deseado)
df['SightingDate'] = pd.to_datetime(df['SightingDate'])
df

  df['SightingDate'] = pd.to_datetime(df['SightingDate'])


Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


Compara si la fecha de la columna `'SightingDate'` coincide con la fecha en `'SightingTimeUTC'` una vez normalizada.

In [146]:
# Pista: Puedes crear una columna nueva para el resultado de la comparación.
df['DateMatch'] = df['SightingDate'] == df['SightingTimeUTC']
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,True
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,True
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,True
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,False
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,False


Ajusta `'SightingTimeUTC'` a la zona horaria local de cada entrenador y crea una columna `'SightingTimeLocal'`.

In [149]:
# Función para ajustar a la zona horaria local
def ajustar_a_zona_horaria_local(row):
    sighting_time_utc = pd.to_datetime(row['SightingTimeUTC'], utc=True, errors='coerce')
    if pd.isna(sighting_time_utc):
        return pd.NaT
    else:
        sighting_date = pd.to_datetime(row['SightingDate'])
        local_timezone = sighting_date.tzinfo
        return sighting_time_utc.astimezone(local_timezone)

# Aplicar la función a la columna 'SightingTimeUTC' para obtener 'SightingTimeLocal'
df['SightingTimeLocal'] = df.apply(ajustar_a_zona_horaria_local, axis=1)
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,True,2023-11-08 14:00:00+00:00
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,True,2023-02-23 22:15:00+09:00
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,False,NaT


Calcula el tiempo transcurrido desde el momento del avistamiento hasta `'ahora'` (tu hora local) y crea una columna `'TimeSinceSighting'`.

In [150]:
df['TimeSinceSighting'] = pd.Timestamp(datetime.now(), tz='UTC')- df['SightingDate'] 
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,False,NaT,90 days 11:08:17.171801


## 2. Limpeiza de IDs

Llena los valores faltantes en `'TrainerID'` con el ID `'UNKNOWN'`.


In [151]:
df['TrainerID'] = df['TrainerID'].fillna('UKNOWN')
df


Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,False,NaT,90 days 11:08:17.171801


## 3. Corrección de Nombres de Pokémon

Asegúrate de que los nombres de Pokémon estén capitalizados correctamente.


In [159]:
df['PokémonName'] = df['PokémonName'].str.capitalize()
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,Clear,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,Rain,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,Sunny,False,NaT,90 days 11:08:17.171801


## 4. Conversión de 'CP' y 'HP' a Numéricos

Convierte `'CP'` y `'HP'` a valores numéricos, manejando los `'MISSING'` y comas como separadores de miles.

In [162]:
df['CP'] = pd.to_numeric(df['CP'].astype(str).str.replace(',', ''), errors='coerce').replace('MISSING', pd.NA)
df['HP'] = pd.to_numeric(df['HP'].astype(str).str.replace(',', ''), errors='coerce').replace('MISSING', pd.NA)

df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,Clear,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,Rain,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,Sunny,False,NaT,90 days 11:08:17.171801


## 5. Estandarización de 'Type'

Divide la columna `'Type'` en `'PrimaryType'` y `'SecondaryType'` cuando hay dos tipos.


In [163]:
df[['PrimaryType', 'SecondaryType']] = df['Type'].str.split('/', expand=True)
df


Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,Clear,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801,Electric,
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,Cloudy,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801,Normal,
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,Rain,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801,Water,
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801,Ghost,
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,Sunny,False,NaT,90 days 11:08:17.171801,Grass,Poison


## 6. Corrección del Clima

Estándariza la columna `'Weather'` para que todos los valores sean mayúsculas.


In [165]:
df['Weather'] = df['Weather'].str.upper()
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,CLEAR,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801,Electric,
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,CLOUDY,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801,Normal,
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,RAIN,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801,Water,
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801,Ghost,
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,SUNNY,False,NaT,90 days 11:08:17.171801,Grass,Poison


# 3 Analisis de Datos

## Agrupaciones

Agrupa el DataFrame por `'Type'` y calcula la suma de `'CP'` para cada grupo.

In [166]:
# Tu codigo aqui
# Convertir 'CP' a valores numéricos
df['CP'] = pd.to_numeric(df['CP'].replace('MISSING', pd.NA), errors='coerce')

# Agrupar por 'Type' y calcular la suma de 'CP' para cada grupo
sum_cp_by_type = df.groupby('Type')['CP'].sum().reset_index()

# Mostrar el resultado
print(sum_cp_by_type)

           Type      CP
0      Electric   500.0
1         Ghost   800.0
2  Grass/Poison   750.0
3        Normal     0.0
4         Water  1000.0


Después de la suma, agrega una columna que calcule la media de `'HP'` por cada `'Type'`, pero solo para aquellos Pokémon cuyo `'CP'` sea mayor que el promedio de `'CP'` de todo el DataFrame.

In [168]:
# Convertir 'CP' y 'HP' a valores numéricos
df['CP'] = pd.to_numeric(df['CP'].replace('MISSING', pd.NA), errors='coerce')
df['HP'] = pd.to_numeric(df['HP'].replace('MISSING', pd.NA), errors='coerce')

# Calcular el promedio de 'CP' de todo el DataFrame
cp_mean = df['CP'].mean()

# Filtrar los Pokémon cuyo 'CP' es mayor que el promedio de 'CP'
filtered_df = df[df['CP'] > cp_mean]

# Agrupar por 'Type' y calcular la media de 'HP' para cada grupo
mean_hp_by_type = filtered_df.groupby('Type')['HP'].mean().reset_index()

# Agregar una columna al DataFrame original con la media de 'HP' para cada 'Type'
df = pd.merge(df, mean_hp_by_type, on='Type', how='left', suffixes=('', '_MeanHP'))
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,DateMatch,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType,HP_MeanHP,HP_MeanHP.1
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,CLEAR,True,2023-11-08 14:00:00+00:00,5 days 03:08:17.171801,Electric,,,
1,2023-07-12 09:30:00+01:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,CLOUDY,True,2023-07-12 09:30:00+01:00,124 days 08:38:17.171801,Normal,,,
2,2023-02-23 22:15:00+09:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,RAIN,True,2023-02-23 22:15:00+09:00,263 days 03:53:17.171801,Water,,10.0,10.0
3,2023-04-30 06:45:00+04:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,False,2023-04-30 14:45:00+04:00,197 days 14:23:17.171801,Ghost,,45.0,45.0
4,2023-08-15 16:00:00+10:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,SUNNY,False,NaT,90 days 11:08:17.171801,Grass,Poison,,
