In [2]:
import pandas as pd
import numpy as np
from io import StringIO


Imagina que eres un investigador en el mundo de Pokémon y has recibido una base de datos con información desactualizada y desordenada sobre avistamientos de Pokémon. Tu misión es limpiar y actualizar esta base de datos para que pueda ser utilizada en un estudio sobre la población de Pokémon en la región.

Datos Iniciales

Los datos iniciales contienen las siguientes columnas:

    SightingDate: Fecha del avistamiento.
    TrainerID: Identificación del entrenador que reportó el avistamiento.
    PokemonName: Nombre del Pokémon avistado.
    CP: Puntos de combate del Pokémon reportado.
    HP: Puntos de salud del Pokémon reportado.
    Type: Tipo del Pokémon.
    Weather: Clima durante el avistamiento.

# 1 Carga de Datos

In [10]:
pip install chardet


Collecting chardet
  Obtaining dependency information for chardet from https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl.metadata
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m677.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import chardet

path_datos = 'pokemon.csv'

# Detecta la codificación del archivo CSV
with open(path_datos, 'rb') as f:
    result = chardet.detect(f.read())

print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [90]:
path_datos = 'pokemon.csv'

# Intenta leer el archivo CSV con codificación 'utf-8'

df = pd.read_csv(path_datos, encoding='ISO-8859-1')
df
# Ahora, 'df' debería contener tus datos correctamente


Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather
0,2023-11-08T14:00:00+0000,2023-11-08T14:00Z,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12T09:30:00+0100,2023-07-12T08:30Z,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023/02/23T22:15:00+0900,2023-02-23T13:15Z,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30T06:45:00-0400,2023-04-30T10:45Z,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15T16:00:00+1000,,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


# Limpieza de Datos

## 1 Normalizacion de Zonas Horarias

Normaliza la columna `'SightingTimeUTC'` a la zona horaria UTC y convierte `'SightingDate'` al mismo formato de tiempo.


In [91]:
from dateutil import parser
import pytz

from datetime import datetime

def convert_date_format(date_str):
    try:
        date_obj = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
        formatted_date = pytz.utc.localize(date_obj)
    except ValueError:
        try:
            date_obj = parser.parse(date_str)
            formatted_date = date_obj.astimezone(pytz.utc)
        except:
            return date_str
    return formatted_date


df['SightingDate'] = df['SightingDate'].apply(convert_date_format)
df['SightingDate'] = pd.to_datetime(df['SightingDate'], utc=True)
df['SightingTimeUTC'] = pd.to_datetime(df['SightingTimeUTC'], utc=True)



# Display the DataFrame with normalized UTC dates and times
df



Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


In [69]:
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny


Compara si la fecha de la columna `'SightingDate'` coincide con la fecha en `'SightingTimeUTC'` una vez normalizada.


In [82]:
result = df['SightingDate'].dt.date == df['SightingTimeUTC'].dt.date
result

0     True
1     True
2     True
3     True
4    False
dtype: bool

Ajusta `'SightingTimeUTC'` a la zona horaria local de cada entrenador y crea una columna `'SightingTimeLocal'`.

In [92]:
df['SightingTimeLocal'] = df['SightingDate'].copy()


In [84]:
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,2023-11-08 14:00:00+00:00
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12 08:30:00+00:00
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,2023-02-23 13:15:00+00:00
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15 06:00:00+00:00


Calcula el tiempo transcurrido desde el momento del avistamiento hasta `'ahora'` (tu hora local) y crea una columna `'TimeSinceSighting'`.

In [93]:
now = pd.Timestamp(datetime.now(), tz='UTC')
df['TimeSinceSighting'] = now - df['SightingDate']

In [86]:
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,2023-11-08 14:00:00+00:00,5 days 04:48:14.114074
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12 08:30:00+00:00,124 days 10:18:14.114074
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,2023-02-23 13:15:00+00:00,263 days 05:33:14.114074
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00,197 days 08:03:14.114074
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15 06:00:00+00:00,90 days 12:48:14.114074


## 2. Limpeiza de IDs

Llena los valores faltantes en `'TrainerID'` con el ID `'UNKNOWN'`.


In [94]:
df['TrainerID'] = df['TrainerID'].fillna('UKNOWN')

## 3. Corrección de Nombres de Pokémon

Asegúrate de que los nombres de Pokémon estén capitalizados correctamente.


In [95]:
#Itera sobre la columna 'PokemonName' asegurandote que esté en mayúsculas la primera letra de cada palabra:
df['PokémonName'] = df['PokémonName'].apply(lambda x: x.title())
df



Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal,TimeSinceSighting
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500,35,Electric,Clear,2023-11-08 14:00:00+00:00,5 days 04:48:43.904806
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,MISSING,55,Normal,Cloudy,2023-07-12 08:30:00+00:00,124 days 10:18:43.904806
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000,10,Water,Rain,2023-02-23 13:15:00+00:00,263 days 05:33:43.904806
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00,197 days 08:03:43.904806
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750,50,Grass/Poison,Sunny,2023-08-15 06:00:00+00:00,90 days 12:48:43.904806


## 4. Conversión de 'CP' y 'HP' a Numéricos

Convierte `'CP'` y `'HP'` a valores numéricos, manejando los `'MISSING'` y comas como separadores de miles.

In [96]:
#Convierte el string 'MISSING' a NaN en la columna 'CP','HP':
df['CP'] = df['CP'].replace('MISSING', np.nan)
df['HP'] = df['HP'].replace('MISSING', np.nan)


#Convierte las comas en separadores de miles en la columna 'CP':
df['CP'] = df['CP'].str.replace(',', '')
df['HP'] = df['HP'].str.replace(',', '')

#Convierte en numérico
df['CP'] = pd.to_numeric(df['CP'], errors='coerce')
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')

AttributeError: Can only use .str accessor with string values!

In [107]:
df['CP'] = pd.to_numeric(df['CP'], errors='coerce')
df['HP'] = pd.to_numeric(df['HP'], errors='coerce')

## 5. Estandarización de 'Type'

Divide la columna `'Type'` en `'PrimaryType'` y `'SecondaryType'` cuando hay dos tipos.


In [108]:
# Suponiendo que 'df' es tu DataFrame
df[['PrimaryType', 'SecondaryType']] = df['Type'].str.split('/', expand=True)



In [109]:
# Suponiendo que 'df' es tu DataFrame
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,CLEAR,2023-11-08 14:00:00+00:00,5 days 04:48:43.904806,Electric,
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,CLOUDY,2023-07-12 08:30:00+00:00,124 days 10:18:43.904806,Normal,
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,RAIN,2023-02-23 13:15:00+00:00,263 days 05:33:43.904806,Water,
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00,197 days 08:03:43.904806,Ghost,
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,SUNNY,2023-08-15 06:00:00+00:00,90 days 12:48:43.904806,Grass,Poison


## 6. Corrección del Clima

Estándariza la columna `'Weather'` para que todos los valores sean mayúsculas.


In [110]:
df['Weather'] = df['Weather'].apply(lambda x: x.upper())


In [111]:
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,CLEAR,2023-11-08 14:00:00+00:00,5 days 04:48:43.904806,Electric,
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,CLOUDY,2023-07-12 08:30:00+00:00,124 days 10:18:43.904806,Normal,
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,RAIN,2023-02-23 13:15:00+00:00,263 days 05:33:43.904806,Water,
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00,197 days 08:03:43.904806,Ghost,
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,SUNNY,2023-08-15 06:00:00+00:00,90 days 12:48:43.904806,Grass,Poison


# 3 Analisis de Datos

## Agrupaciones

Agrupa el DataFrame por `'Type'` y calcula la suma de `'CP'` para cada grupo.

In [112]:
result = df.groupby('PrimaryType')['CP'].sum().reset_index()
result


Unnamed: 0,PrimaryType,CP
0,Electric,500.0
1,Ghost,800.0
2,Grass,750.0
3,Normal,0.0
4,Water,1000.0


Después de la suma, agrega una columna que calcule la media de `'HP'` por cada `'Type'`, pero solo para aquellos Pokémon cuyo `'CP'` sea mayor que el promedio de `'CP'` de todo el DataFrame.

In [115]:
#sacamos la media de CP
mean = df['CP'].mean()
#filtramos para solo tener los CP mayor a la media
filt = df[df['CP'] > mean]
#Agrupamos por tipo en el df filtrado
grupo = filt.groupby('Type')
#sacamos el promedio de HP por los que son mayores a la media de CP
meanHP = grupo['HP'].mean()

#creamos nueva columna meanHP, mapeando los elementos de meanHP con los de type
df['meanHP'] = df['Type'].map(meanHP)
df

Unnamed: 0,SightingDate,SightingTimeUTC,TrainerID,PokémonName,CP,HP,Type,Weather,SightingTimeLocal,TimeSinceSighting,PrimaryType,SecondaryType,HP_mean_cp,HP_mean_cp.1,meanHP
0,2023-11-08 14:00:00+00:00,2023-11-08 14:00:00+00:00,TR123,Pikachu,500.0,35,Electric,CLEAR,2023-11-08 14:00:00+00:00,5 days 04:48:43.904806,Electric,,,,
1,2023-07-12 08:30:00+00:00,2023-07-12 08:30:00+00:00,TR456,Eevee,,55,Normal,CLOUDY,2023-07-12 08:30:00+00:00,124 days 10:18:43.904806,Normal,,,,
2,2023-02-23 13:15:00+00:00,2023-02-23 13:15:00+00:00,TR789,Magicarp,1000.0,10,Water,RAIN,2023-02-23 13:15:00+00:00,263 days 05:33:43.904806,Water,,10.0,10.0,10.0
3,2023-04-30 10:45:00+00:00,2023-04-30 10:45:00+00:00,TR101,Gengar,800.0,45,Ghost,PARTLY_CLOUDY,2023-04-30 10:45:00+00:00,197 days 08:03:43.904806,Ghost,,45.0,45.0,45.0
4,2023-08-15 06:00:00+00:00,NaT,TR102,Bulbasaur,750.0,50,Grass/Poison,SUNNY,2023-08-15 06:00:00+00:00,90 days 12:48:43.904806,Grass,Poison,,,
