In [460]:
import pandas as pd
import numpy as np
import re # regular expressions
import pytz
from datetime import datetime
from io import StringIO


Imagina que eres un investigador en el mundo de Pokémon y has recibido una base de datos con información desactualizada y desordenada sobre avistamientos de Pokémon. Tu misión es limpiar y actualizar esta base de datos para que pueda ser utilizada en un estudio sobre la población de Pokémon en la región.

Datos Iniciales

Los datos iniciales contienen las siguientes columnas:

    SightingDate: Fecha del avistamiento.
    TrainerID: Identificación del entrenador que reportó el avistamiento.
    PokemonName: Nombre del Pokémon avistado.
    CP: Puntos de combate del Pokémon reportado.
    HP: Puntos de salud del Pokémon reportado.
    Type: Tipo del Pokémon.
    Weather: Clima durante el avistamiento.

# 1 Carga de Datos

In [461]:
encoding = 'ISO-8859-1'
path_datos = 'pokemon.csv'
# TODO completa el codigo para cargar los datos
with open(path_datos, 'r', encoding='utf-8', errors='replace') as file:
    df = pd.read_csv(file)
df.rename(columns={df.columns[3]: 'PokemonName'}, inplace=True)
df


Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokemonName,CP,HP,Type,Weather
0,2023-11-08T14:00:00+0000,2023-11-08T14:00Z,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12T09:30:00+0100,2023-07-12T08:30Z,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023/02/23T22:15:00+0900,2023-02-23T13:15Z,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30T06:45:00-0400,2023-04-30T10:45Z,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15T16:00:00+1000,,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


# Limpieza de Datos

## 1 Normalizacion de Zonas Horarias

Normaliza la columna `'SightingTimeUTC'` a la zona horaria UTC y convierte `'SightingDate'` al mismo formato de tiempo.


In [462]:
df['SightingTimesLocal'] = df['SightingDate'].copy()
df['SightingTimeUTC'] = pd.to_datetime(df['SightingTimeUTC'], utc=True)

pattern = r'(\d{4})/(\d{2})/(\d{2})'
def replace_date_format(date_str):
    return re.sub(pattern, r'\1-\2-\3', date_str)
df['SightingDate'] = df['SightingDate'].apply(replace_date_format)
df['SightingTimesLocal'] = df['SightingTimesLocal'].apply(replace_date_format)

df['SightingDate'] = pd.to_datetime(df['SightingDate'], utc=True)
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000


Compara si la fecha de la columna `'SightingDate'` coincide con la fecha en `'SightingTimeUTC'` una vez normalizada.


In [463]:
#Pista: Puedes crear una columna nueva para el resultado de la comparación.
df['DateMatch'] = df['SightingDate'] == df['SightingTimeUTC']
#last_column = df.pop('DateMatch')
#df.insert(2, 'DateMatch', last_column)
df.insert(2, 'DateMatch', df.pop('DateMatch'))

df



Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000


Ajusta `'SightingTimeUTC'` a la zona horaria local de cada entrenador y crea una columna `'SightingTimeLocal'`.

In [464]:
#Hice una columna adicional para poder ver eso antes de alterar SightingDate
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000


Calcula el tiempo transcurrido desde el momento del avistamiento hasta `'ahora'` (tu hora local) y crea una columna `'TimeSinceSighting'`.

In [465]:
now = pd.Timestamp(datetime.now(), tz='UTC').tz_convert('America/Mexico_City')
df['TimeSinceSighting'] = now - df['SightingDate']
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000,1 days 09:11:54.033223
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100,120 days 14:41:54.033223
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900,259 days 09:56:54.033223
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000,86 days 17:11:54.033223


## 2. Limpeiza de IDsz

Llena los valores faltantes en `'TrainerID'` con el ID `'UNKNOWN'`.


In [466]:
df['TrainerID'] = df['TrainerID'].fillna('UNKNOWN')
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000,1 days 09:11:54.033223
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100,120 days 14:41:54.033223
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900,259 days 09:56:54.033223
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000,86 days 17:11:54.033223


## 3. Corrección de Nombres de Pokémon

Asegúrate de que los nombres de Pokémon estén capitalizados correctamente.


In [467]:
df['PokemonName'] = df['PokemonName'].str.capitalize()
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500,35,Electric,Clear,2023-11-08T14:00:00+0000,1 days 09:11:54.033223
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12T09:30:00+0100,120 days 14:41:54.033223
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000,10,Water,Rain,2023-02-23T22:15:00+0900,259 days 09:56:54.033223
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000,86 days 17:11:54.033223


## 4. Conversión de 'CP' y 'HP' a Numéricos

Convierte `'CP'` y `'HP'` a valores numéricos, manejando los `'MISSING'` y comas como separadores de miles.

In [468]:
df['CP'] = df['CP'].astype(str)
df['HP'] = df['HP'].astype(str)
# Los volvemos cadenas para poder usar el replace
df['CP'] = pd.to_numeric(df['CP'].str.replace(',', ''), errors='coerce')
df['HP'] = pd.to_numeric(df['HP'].str.replace(',', ''), errors='coerce')
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,Weather,SightingTimesLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500.0,35,Electric,Clear,2023-11-08T14:00:00+0000,1 days 09:11:54.033223
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,,55,Normal,Cloudy,2023-07-12T09:30:00+0100,120 days 14:41:54.033223
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000.0,10,Water,Rain,2023-02-23T22:15:00+0900,259 days 09:56:54.033223
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750.0,50,Grass/Poison,Sunny,2023-08-15T16:00:00+1000,86 days 17:11:54.033223


## 5. Estandarización de 'Type'

Divide la columna `'Type'` en `'PrimaryType'` y `'SecondaryType'` cuando hay dos tipos.


In [469]:
df[['PrimaryType', 'SecondaryType']] = df['Type'].str.split('/', expand=True)
df.insert(8, 'SecondaryType', df.pop('SecondaryType'))
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,SecondaryType,Weather,SightingTimesLocal,TimeSinceSighting,PrimaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500.0,35,Electric,,Clear,2023-11-08T14:00:00+0000,1 days 09:11:54.033223,Electric
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,,55,Normal,,Cloudy,2023-07-12T09:30:00+0100,120 days 14:41:54.033223,Normal
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000.0,10,Water,,Rain,2023-02-23T22:15:00+0900,259 days 09:56:54.033223,Water
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800.0,45,Ghost,,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223,Ghost
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750.0,50,Grass/Poison,Poison,Sunny,2023-08-15T16:00:00+1000,86 days 17:11:54.033223,Grass


## 6. Corrección del Clima

Estándariza la columna `'Weather'` para que todos los valores sean mayúsculas.


In [474]:
df['Weather'] = df['Weather'].str.upper()
df

Unnamed: 0,SightingDate,SightingTimeUTC,DateMatch,TrainerID,PokemonName,CP,HP,Type,SecondaryType,Weather,SightingTimesLocal,TimeSinceSighting,PrimaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,True,TR123,Pikachu,500.0,35,Electric,,CLEAR,2023-11-08T14:00:00+0000,1 days 09:11:54.033223,Electric
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,True,TR456,Eevee,,55,Normal,,CLOUDY,2023-07-12T09:30:00+0100,120 days 14:41:54.033223,Normal
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,True,TR789,Magicarp,1000.0,10,Water,,RAIN,2023-02-23T22:15:00+0900,259 days 09:56:54.033223,Water
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,True,TR101,Gengar,800.0,45,Ghost,,PARTLY_CLOUDY,2023-04-30T06:45:00-0400,193 days 12:26:54.033223,Ghost
4,2023-08-15 06:00:00+00:00,NaT,False,TR102,Bulbasaur,750.0,50,Grass/Poison,Poison,SUNNY,2023-08-15T16:00:00+1000,86 days 17:11:54.033223,Grass


# 3 Analisis de Datos

## Agrupaciones

Agrupa el DataFrame por `'Type'` y calcula la suma de `'CP'` para cada grupo.

In [471]:
suma = df.groupby('Type')['CP'].sum()
print(suma)

Type
Electric         500.0
Ghost            800.0
Grass/Poison     750.0
Normal             0.0
Water           1000.0
Name: CP, dtype: float64


Después de la suma, agrega una columna que calcule la media de `'HP'` por cada `'Type'`, pero solo para aquellos Pokémon cuyo `'CP'` sea mayor que el promedio de `'CP'` de todo el DataFrame.

In [476]:

cp_promedio = df['CP'].mean()
pokemon_filtrados = df[df['CP'] > cp_promedio]
media_hp_por_type = pokemon_filtrados.groupby('Type')['HP'].mean().reset_index()
media_hp_por_type.rename(columns={'HP': 'Media_HP_por_Type'}, inplace=True)
print(media_hp_por_type)

    Type  Media_HP_por_Type
0  Ghost               45.0
1  Water               10.0
