In [1]:
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Inspect and Clean the Data

In [64]:
# Read the raw Pokémon table and preview its first rows.
path = "../data/pokemon.csv"
pokemon = pd.read_csv(filepath_or_buffer=path)
pokemon.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [8]:
# Dimensions of the frame together with its column labels.
(pokemon.shape, pokemon.columns)

((801, 41),
 Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
        'against_electric', 'against_fairy', 'against_fight', 'against_fire',
        'against_flying', 'against_ghost', 'against_grass', 'against_ground',
        'against_ice', 'against_normal', 'against_poison', 'against_psychic',
        'against_rock', 'against_steel', 'against_water', 'attack',
        'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
        'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
        'japanese_name', 'name', 'percentage_male', 'pokedex_number',
        'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
        'generation', 'is_legendary'],
       dtype='object'))

In [9]:
# First ten names, as a sanity check of the `name` column.
pokemon['name'].head(n=10)

0     Bulbasaur
1       Ivysaur
2      Venusaur
3    Charmander
4    Charmeleon
5     Charizard
6      Squirtle
7     Wartortle
8     Blastoise
9      Caterpie
Name: name, dtype: object

In [5]:
# Per-column count of missing values, keeping only columns that have any.
null_counts = pokemon.isnull().sum()
null_counts[null_counts != 0]

height_m            20
percentage_male     98
type2              384
weight_kg           20
dtype: int64

In [13]:
# Ten Pokémon without a secondary type. A fixed random_state keeps the sample
# identical across Restart & Run All.
pokemon.loc[pokemon['type2'].isnull(), 'name'].sample(10, random_state=0)

421       Shellos
496     Serperior
513      Simisear
745    Wishiwashi
142       Snorlax
601        Tynamo
502      Samurott
448    Hippopotas
494         Snivy
619      Mienshao
Name: name, dtype: object

In [14]:
# Ten Pokémon with no `percentage_male` value. A fixed random_state keeps the
# sample identical across Restart & Run All.
pokemon.loc[pokemon['percentage_male'].isnull(), 'name'].sample(10, random_state=0)

645       Kyurem
716      Yveltal
491      Shaymin
337      Solrock
482       Dialga
795    Xurkitree
492       Arceus
794    Pheromosa
249        Ho-Oh
800     Magearna
Name: name, dtype: object

In [15]:
# Ten Pokémon with no recorded height. A fixed random_state keeps the sample
# identical across Restart & Run All.
pokemon.loc[pokemon['height_m'].isnull(), 'name'].sample(10, random_state=0)

25        Raichu
104      Marowak
18       Rattata
19      Raticate
50       Dugtrio
75         Golem
36        Vulpix
73       Geodude
26     Sandshrew
51        Meowth
Name: name, dtype: object

In [17]:
# Check that the rows lacking a height are exactly the rows lacking a weight.
names_without_height = pokemon.loc[pokemon['height_m'].isnull(), 'name']
names_without_weight = pokemon.loc[pokemon['weight_kg'].isnull(), 'name']
(names_without_height == names_without_weight).all()

np.True_

About the missing values:
1. For col `type2`, NaN refers to a legitimate value of the feature. <br>
2. For cols `height_m` and `weight_kg`, NaN is due to a problem in data collection. The same pokemons have missing values in the two columns. Since they are few (20 out of 801), the columns are not removed and values will be imputed. <br>
3. For col `percentage_male`, NaN refers to the fact that the column is not applicable to the pokemon.

In [18]:
# List the dtype pandas inferred for every column, to spot mis-parsed columns.
pokemon.dtypes

abilities             object
against_bug          float64
against_dark         float64
against_dragon       float64
against_electric     float64
against_fairy        float64
against_fight        float64
against_fire         float64
against_flying       float64
against_ghost        float64
against_grass        float64
against_ground       float64
against_ice          float64
against_normal       float64
against_poison       float64
against_psychic      float64
against_rock         float64
against_steel        float64
against_water        float64
attack                 int64
base_egg_steps         int64
base_happiness         int64
base_total             int64
capture_rate          object
classfication         object
defense                int64
experience_growth      int64
height_m             float64
hp                     int64
japanese_name         object
name                  object
percentage_male      float64
pokedex_number         int64
sp_attack              int64
sp_defense    

col `capture_rate` is object instead of integer, and there is a spelling mistake in col `classfication`. <br>
Keep an untouched copy of the original data set (`original_pokemon`) and continue cleaning the working frame `pokemon`.

In [19]:
# Snapshot of the raw data, taken before any cleaning touches `pokemon`.
original_pokemon = pokemon.copy(deep=True)

In [20]:
# Distinct raw values of `capture_rate`, to see why it was parsed as object.
pd.unique(pokemon['capture_rate'])

array(['45', '255', '120', '127', '90', '190', '75', '235', '150', '25',
       '170', '50', '200', '100', '180', '60', '225', '30', '35', '3',
       '65', '70', '125', '205', '155', '145', '130', '140', '15', '220',
       '160', '80', '55', '30 (Meteorite)255 (Core)'], dtype=object)

In [21]:
# Which Pokémon carries the two-valued capture rate string?
has_double_rate = pokemon['capture_rate'] == '30 (Meteorite)255 (Core)'
pokemon.loc[has_double_rate, 'name']

773    Minior
Name: name, dtype: object

Split Minior into two Pokémon, one per form (Core and Meteorite):

In [22]:
# Speed currently recorded for Minior.
is_minior = pokemon['name'] == 'Minior'
pokemon.loc[is_minior, 'speed']

773    120
Name: speed, dtype: int64

In [23]:
# Append a second copy of the Minior row at the bottom of the frame; it keeps
# the original index label, so that label is duplicated afterwards.
minior_rows = pokemon[pokemon['name'] == 'Minior']
pokemon = pd.concat([pokemon, minior_rows])

In [None]:
# Turn the appended duplicate (last row) into the Meteorite form.
#
# The last row shares its index label with the original Minior row, so it is
# addressed by position. Each write is a single DataFrame.iloc step: chained
# assignment (`pokemon[col].iloc[-1] = ...`) raises SettingWithCopyWarning and
# has no effect under pandas copy-on-write. With no warning left to hide, the
# `%%capture` magic is no longer needed.
METEORITE_SPEED = 60  # looked on smogon

name_pos = pokemon.columns.get_loc('name')
speed_pos = pokemon.columns.get_loc('speed')
base_total_pos = pokemon.columns.get_loc('base_total')

# Speed inherited from the duplicated row, read before it is overwritten.
inherited_speed = pokemon.iloc[-1, speed_pos]

pokemon.iloc[-1, name_pos] = 'Minior-Meteorite'
pokemon.iloc[-1, speed_pos] = METEORITE_SPEED
# Keep base_total consistent with the lowered speed.
pokemon.iloc[-1, base_total_pos] -= inherited_speed - METEORITE_SPEED

# Give each form its own capture rate (still strings; the column is cast later).
pokemon.loc[(pokemon['name'] == 'Minior') & (pokemon['speed'] == 120), 'capture_rate'] = '255'
pokemon.loc[(pokemon['name'] == 'Minior-Meteorite') & (pokemon['speed'] == 60), 'capture_rate'] = '30'

In [26]:
# Compare label-based and position-based lookups around the Minior rows.
rows_to_show = [772, 773, 774]
pokemon.loc[rows_to_show, 'name'], pokemon['name'].iloc[rows_to_show]

(772            Silvally
 773              Minior
 773    Minior-Meteorite
 774              Komala
 Name: name, dtype: object,
 772    Silvally
 773      Minior
 774      Komala
 Name: name, dtype: object)

In [27]:
# Move the appended row next to the original Minior row. Both rows share one
# index label, and the default quicksort is not stable, so a stable sort is
# requested to guarantee that 'Minior' stays ahead of 'Minior-Meteorite'.
pokemon = pokemon.sort_index(kind='mergesort')

In [29]:
# Replace the index (which has a duplicated label) with a fresh 0..n-1 range.
pokemon = pokemon.reset_index(drop=True)

In [30]:
# Label-based and position-based lookups should now agree.
rows_to_show = [772, 773, 774]
pokemon.loc[rows_to_show, 'name'], pokemon['name'].iloc[rows_to_show]

(772            Silvally
 773              Minior
 774    Minior-Meteorite
 Name: name, dtype: object,
 772            Silvally
 773              Minior
 774    Minior-Meteorite
 Name: name, dtype: object)

By doing this, the column `pokedex_number` is not a key anymore:

In [31]:
# Number of rows whose pokedex_number repeats an earlier row's value.
pokemon['pokedex_number'].duplicated().sum()

np.int64(1)

Finally:

In [34]:
# Fix the misspelled column name and cast capture_rate from strings to integers.
pokemon = (
    pokemon
    .rename(columns={'classfication': 'classification'})
    .astype({'capture_rate': int})
)

Delete japanese_name column

In [35]:
# Remove the Japanese name column and confirm that it is gone.
pokemon = pokemon.drop(columns='japanese_name')
'japanese_name' in pokemon.columns

False

We now work with missing values.

First, for col `percentage_male`: we add a new column `percentage_female`. Then we set the NaN to zero:

In [36]:
# Complement of percentage_male; rows where it is NaN stay NaN here.
pokemon['percentage_female'] = pokemon['percentage_male'].rsub(100)
pokemon['percentage_female']

0      11.9
1      11.9
2      11.9
3      11.9
4      11.9
       ... 
797     NaN
798     NaN
799     NaN
800     NaN
801     NaN
Name: percentage_female, Length: 802, dtype: float64

In [37]:
# Missing gender percentages become 0 in both columns.
for gender_col in ('percentage_male', 'percentage_female'):
    pokemon[gender_col] = pokemon[gender_col].fillna(0)

Substitute a placeholder in col `type2`:

In [38]:
# A missing secondary type gets the explicit placeholder string 'none'.
pokemon['type2'] = pokemon['type2'].fillna('none')

For the weight and height columns, we will impute later.

Finally, we inspect and spread the column `abilities`.

In [39]:
# Peek at a few raw `abilities` strings. A fixed random_state keeps the sample
# identical across Restart & Run All.
pokemon['abilities'].sample(5, random_state=0)

791                                  ['Full Metal Body']
295                 ['Thick Fat', 'Guts', 'Sheer Force']
575               ['Frisk', 'Competitive', 'Shadow Tag']
630            ['Gluttony', 'Flash Fire', 'White Smoke']
19     ['Run Away', 'Guts', 'Hustle', 'Gluttony', 'Hu...
Name: abilities, dtype: object

In [44]:
# Entries are comma-separated, so the number of abilities is commas + 1.
comma_counts = pokemon['abilities'].str.count(",")
max_num_ab = comma_counts.max() + 1
max_num_ab

np.int64(6)

In [45]:
# Row holding the longest abilities list (first one in case of ties).
longest_list_label = pokemon['abilities'].str.count(",").idxmax()
pokemon.loc[longest_list_label, ['name', 'abilities']]

name                                                   Rattata
abilities    ['Run Away', 'Guts', 'Hustle', 'Gluttony', 'Hu...
Name: 18, dtype: object

Pokemon have up to 6 abilities. We spread the column `abilities` across 6 columns.

In [48]:
# Parse each stringified list (e.g. "['Overgrow', 'Chlorophyll']") into a real
# Python list. ast.literal_eval accepts only literals, so unlike eval() it
# cannot execute code that happens to be embedded in the CSV.
ability_lists = [ast.literal_eval(s) for s in pokemon['abilities']]

# One entry per ability slot. Rows with fewer abilities than the slot number
# get the string placeholder 'None' in that slot.
d = {
    'ability_' + str(slot): [
        row_abilities[slot] if len(row_abilities) > slot else 'None'
        for row_abilities in ability_lists
    ]
    for slot in range(max_num_ab)
}

In [49]:
# Build the ability-slot table from the dict of equal-length lists in `d`.
abilities = pd.DataFrame(d)
# Swap the 'None' string placeholder for a real missing value.
abilities[abilities == 'None'] = None
abilities

Unnamed: 0,ability_0,ability_1,ability_2,ability_3,ability_4,ability_5
0,Overgrow,Chlorophyll,,,,
1,Overgrow,Chlorophyll,,,,
2,Overgrow,Chlorophyll,,,,
3,Blaze,Solar Power,,,,
4,Blaze,Solar Power,,,,
...,...,...,...,...,...,...
797,Beast Boost,,,,,
798,Beast Boost,,,,,
799,Beast Boost,,,,,
800,Prism Armor,,,,,


In [50]:
# Swap the raw `abilities` column for the per-slot ability columns, which are
# appended on the right.
without_raw_abilities = pokemon.drop(columns='abilities')
pokemon = pd.concat([without_raw_abilities, abilities], axis=1)
pokemon.columns

Index(['against_bug', 'against_dark', 'against_dragon', 'against_electric',
       'against_fairy', 'against_fight', 'against_fire', 'against_flying',
       'against_ghost', 'against_grass', 'against_ground', 'against_ice',
       'against_normal', 'against_poison', 'against_psychic', 'against_rock',
       'against_steel', 'against_water', 'attack', 'base_egg_steps',
       'base_happiness', 'base_total', 'capture_rate', 'classification',
       'defense', 'experience_growth', 'height_m', 'hp', 'name',
       'percentage_male', 'pokedex_number', 'sp_attack', 'sp_defense', 'speed',
       'type1', 'type2', 'weight_kg', 'generation', 'is_legendary',
       'percentage_female', 'ability_0', 'ability_1', 'ability_2', 'ability_3',
       'ability_4', 'ability_5'],
      dtype='object')

By domain knowledge we know that all the `against_` columns are actually redundant, as the type resistances are determined only by `type1` and `type2`. Hence we remove them.

In [51]:
# Keep every column whose name does not start with 'against_'.
is_against_col = pokemon.columns.str.startswith('against_')
pokemon = pokemon.loc[:, ~is_against_col]

We also remove `base_total`, since it is just the sum of the six base stats (verified in the next cell):

In [52]:
# Verify that base_total equals the sum of the six base stats on every row.
stat_cols = ['attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'hp']
(pokemon['base_total'] == pokemon[stat_cols].sum(axis=1)).all()

np.True_

In [53]:
# Drop the derived column and list what is left.
pokemon = pokemon.drop(columns='base_total')
pokemon.columns

Index(['attack', 'base_egg_steps', 'base_happiness', 'capture_rate',
       'classification', 'defense', 'experience_growth', 'height_m', 'hp',
       'name', 'percentage_male', 'pokedex_number', 'sp_attack', 'sp_defense',
       'speed', 'type1', 'type2', 'weight_kg', 'generation', 'is_legendary',
       'percentage_female', 'ability_0', 'ability_1', 'ability_2', 'ability_3',
       'ability_4', 'ability_5'],
      dtype='object')

We change the order of the columns in a more appropriate way.

In [54]:
# Identification and base-stat columns go first; every other column follows in
# its current order.
leading_cols = ['pokedex_number', 'name', 'type1', 'type2', 'generation',
                'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
trailing_cols = [col for col in pokemon.columns if col not in leading_cols]
cols = leading_cols + trailing_cols
pokemon = pokemon.reindex(columns=cols)
pokemon.head(5)

Unnamed: 0,pokedex_number,name,type1,type2,generation,hp,attack,defense,sp_attack,sp_defense,...,percentage_male,weight_kg,is_legendary,percentage_female,ability_0,ability_1,ability_2,ability_3,ability_4,ability_5
0,1,Bulbasaur,grass,poison,1,45,49,49,65,65,...,88.1,6.9,0,11.9,Overgrow,Chlorophyll,,,,
1,2,Ivysaur,grass,poison,1,60,62,63,80,80,...,88.1,13.0,0,11.9,Overgrow,Chlorophyll,,,,
2,3,Venusaur,grass,poison,1,80,100,123,122,120,...,88.1,100.0,0,11.9,Overgrow,Chlorophyll,,,,
3,4,Charmander,fire,none,1,39,52,43,60,50,...,88.1,8.5,0,11.9,Blaze,Solar Power,,,,
4,5,Charmeleon,fire,none,1,58,64,58,80,65,...,88.1,19.0,0,11.9,Blaze,Solar Power,,,,


In [63]:
# Persist the cleaned frame. to_csv keeps its default index=True here, so the
# row index is written as an extra leading column of the file.
pokemon.to_csv("../data/pokemon_cleaned.csv")