# Shannon's EDA
Explore and understand the Pokémon dataset.

In [2]:
# import dependencies
import numpy as np
import pandas as pd

# notebook-wide display config: show floats with two decimal places
pd.options.display.precision = 2

In [3]:
# load the raw Pokémon table from the CSV in the sibling datasets folder
df_raw = pd.read_csv(filepath_or_buffer="../datasets/Pokemon.csv")

# preview the first five rows
df_raw.head(n=5)

Unnamed: 0,Pokedex No.,Name,Type,Other Type,HP,Attack,Defense,Special Attack,Special Defense,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,0
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,0
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,0
3,3,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,0
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,0


There are 1045 rows and 13 columns in this dataset.

In [4]:
# report the table dimensions as a (rows, columns) tuple
n_rows, n_cols = df_raw.shape
print((n_rows, n_cols))

(1045, 13)


Names of the 13 columns:

In [5]:
# list every column label in the raw frame
column_labels = df_raw.columns
print(column_labels)

Index(['Pokedex No.', 'Name', 'Type', 'Other Type', 'HP', 'Attack', 'Defense',
       'Special Attack', 'Special Defense', 'Speed', 'Total', 'Generation',
       'Legendary'],
      dtype='object')


The only column with null values is `Other Type`. Approximately half of the Pokémon (553 of 1,045) have two types (i.e., there are values in both `Type` and `Other Type`).

Columns that may need to be removed because they're not stats include:
* Pokedex No.
* Name
* Generation

Additional information that may be useful in making predictions:
* number of evolutions
* separate variable for the different forms of the same pokemon

In [6]:
# Show each column's dtype and non-null count.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Pokedex No.      1045 non-null   int64 
 1   Name             1045 non-null   object
 2   Type             1045 non-null   object
 3   Other Type       553 non-null    object
 4   HP               1045 non-null   int64 
 5   Attack           1045 non-null   int64 
 6   Defense          1045 non-null   int64 
 7   Special Attack   1045 non-null   int64 
 8   Special Defense  1045 non-null   int64 
 9   Speed            1045 non-null   int64 
 10  Total            1045 non-null   int64 
 11  Generation       1045 non-null   int64 
 12  Legendary        1045 non-null   int64 
dtypes: int64(10), object(3)
memory usage: 106.3+ KB
None


In [7]:
# summary statistics for the numeric columns, with the quartiles spelled out
# (these are the same percentiles describe() uses by default)
df_raw.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Pokedex No.,HP,Attack,Defense,Special Attack,Special Defense,Speed,Total,Generation,Legendary
count,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0
mean,440.77,70.07,80.47,74.66,73.02,72.29,68.81,439.31,4.06,0.12
std,262.52,26.67,32.41,31.24,32.72,28.07,30.21,121.97,2.26,0.33
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,175.0,1.0,0.0
25%,212.0,50.0,55.0,50.0,50.0,50.0,45.0,330.0,2.0,0.0
50%,436.0,68.0,77.0,70.0,65.0,70.0,65.0,458.0,4.0,0.0
75%,670.0,82.0,100.0,90.0,95.0,90.0,90.0,515.0,6.0,0.0
max,898.0,255.0,190.0,250.0,194.0,250.0,200.0,1125.0,8.0,1.0


Unsurprisingly, Legendary Pokémon have higher stats on average. Note, however, that not every Legendary Pokémon has higher stats than every regular Pokémon: the lowest Legendary `Total` is 200, while the highest non-Legendary `Total` is 700.

In [9]:
# descriptive statistics restricted to the rows where Legendary == 1
print("Summary of Legendary Pokemons")
df_raw[df_raw["Legendary"] == 1].describe()

Summary of Legendary Pokemons


Unnamed: 0,Pokedex No.,HP,Attack,Defense,Special Attack,Special Defense,Speed,Total,Generation,Legendary
count,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0
mean,603.08,95.19,111.5,98.52,111.04,99.51,97.72,613.48,5.09,1.0
std,229.79,31.82,31.46,32.48,34.05,32.0,29.36,94.98,2.15,0.0
min,144.0,43.0,29.0,20.0,29.0,20.0,13.0,200.0,1.0,1.0
25%,386.0,80.0,90.0,80.0,87.5,80.0,85.0,580.0,3.0,1.0
50%,646.0,92.0,107.0,95.0,110.0,98.0,97.0,600.0,5.0,1.0
75%,797.5,100.0,130.0,115.0,130.5,115.0,110.5,680.0,7.0,1.0
max,898.0,255.0,190.0,250.0,194.0,250.0,200.0,1125.0,8.0,1.0


In [10]:
# descriptive statistics restricted to the rows where Legendary == 0
print("Summary of non-Legendary Pokemons")
df_raw[df_raw["Legendary"] == 0].describe()

Summary of non-Legendary Pokemons


Unnamed: 0,Pokedex No.,HP,Attack,Defense,Special Attack,Special Defense,Speed,Total,Generation,Legendary
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,418.31,66.59,76.17,71.36,67.76,68.52,64.81,415.22,3.92,0.0
std,258.94,23.9,30.14,29.6,28.85,25.29,28.08,104.48,2.24,0.0
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,175.0,1.0,0.0
25%,193.25,50.0,55.0,50.0,45.0,50.0,44.0,320.0,2.0,0.0
50%,409.5,65.0,73.0,66.0,63.0,65.0,62.0,430.0,4.0,0.0
75%,633.75,78.0,95.0,85.75,85.0,85.0,85.0,495.0,5.0,0.0
max,887.0,255.0,185.0,230.0,175.0,230.0,160.0,700.0,8.0,0.0


12% of the Pokémon in the dataset are Legendary. The dataset is therefore quite imbalanced, which will need to be addressed when creating the training dataset.

In [14]:
# class balance of the Legendary flag: absolute counts and relative shares
c = df_raw["Legendary"].value_counts()
p = df_raw["Legendary"].value_counts(normalize=True)

# place both Series side by side; the mapping keys become the column labels
pd.concat({"counts": c, "proportion": p}, axis=1)

Unnamed: 0,counts,proportion
0,918,0.88
1,127,0.12
