In [None]:
!pip install pyclustertend

# Pokemon Dataset

In [None]:
## Libraries

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px # fancy animated data visualisation

# VAT: visual assessment of cluster tendency (used in the "VAT" section below).
from pyclustertend import vat

# Ask the inline backend to render figures in 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Location of the Pokemon CSV inside the Kaggle input directory.
POKEMON_CSV_PATH = '/kaggle/input/pokemon/pokemon.csv'

# Load the whole file into a DataFrame using pandas' default parsing options.
pokemon_df = pd.read_csv(POKEMON_CSV_PATH)

In [None]:
# Preview the first rows to get a feel for the columns and their values.
pokemon_df.head()

## Memory consumption

In [None]:
# Baseline: per-column dtypes, non-null counts and the DataFrame's memory usage
# as loaded, before any dtype is changed.
pokemon_df.info()

In [None]:
# How many rows carry each distinct is_legendary value.
pokemon_df['is_legendary'].value_counts()

`is_legendary` is a boolean flag, but it is stored as an int64; it can easily be stored as an int8 instead, which also reduces memory consumption.

In [None]:
# Bytes used by the column with its current dtype
# (Series.memory_usage includes the index by default).
pokemon_df['is_legendary'].memory_usage()

In [None]:
# Same measurement on an int8 copy of the column; pokemon_df itself is not modified.
pokemon_df['is_legendary'].astype('int8').memory_usage()

In [None]:
 # Share of the int64 footprint that the int8 representation needs.
 # Computed from the column itself rather than from byte counts copied by hand
 # from the outputs of the two cells above, so it stays correct if the data changes.
 # Both sides are cast explicitly so the ratio does not depend on the column's
 # current dtype when the cell is re-run after the downcast further down.
 legendary = pokemon_df['is_legendary']
 legendary.astype('int8').memory_usage() / legendary.astype('int64').memory_usage()

The int8 representation uses only about 14% of the memory used by the original int64 series.

In [None]:
# Same measurement on a categorical copy of the column, for comparison with int8.
pokemon_df['is_legendary'].astype('category').memory_usage()

The category dtype can be very useful, but not in this case.

In [None]:

# Select the integer-typed columns once, remember their names, and write the
# downcast version back into the same columns of pokemon_df.
# pd.to_numeric(..., downcast='integer') picks the smallest signed integer dtype
# that can hold each column's values.
integer_frame = pokemon_df.select_dtypes(include=['int'])
int_columns = list(integer_frame.columns)
pokemon_df[int_columns] = integer_frame.apply(pd.to_numeric, downcast='integer')

In [None]:
# Re-inspect dtypes and memory usage after the integer downcast, to compare
# with the baseline .info() output above.
pokemon_df.info()

Downcasting the integer columns divides the DataFrame's memory consumption by about 2.

In [None]:
# Read the is_legendary value of the row labelled 0 with the scalar accessor .at.
pokemon_df.at[0,'is_legendary']

In [None]:
# Write a value that does not fit the column if it was downcast to int8 above
# (int8 holds -128..127), then read it back to see what was actually stored.
# NOTE(review): how pandas handles such an assignment appears to depend on the
# pandas version — confirm the behaviour on the version this notebook targets.
pokemon_df.at[0,'is_legendary'] = 300
pokemon_df.at[0,'is_legendary']

In [None]:
# Check whether the dtype of is_legendary changed after the out-of-range write.
pokemon_df.info()

Pandas did not recast the series when we wrote a number that does not fit in the column's dtype (300 is outside the int8 range of -128 to 127). Changing dtypes in pandas therefore carries a risk.

In [None]:
# Undo the experiment above by putting 0 back in row 0, so the later cells do not
# see the tampered value (presumably 0 was the original value — confirm against
# the output of the earlier .at read).
pokemon_df.at[0,'is_legendary'] = 0

## Visualisation

In [None]:
# Target: the legendary flag.
y = pokemon_df['is_legendary']

# Features: every other column that contains no missing value; get_dummies then
# one-hot encodes the non-numeric columns and leaves the numeric ones untouched.
feature_frame = pokemon_df.drop(columns='is_legendary')
complete_columns = feature_frame.dropna(axis=1)
X = pd.get_dummies(complete_columns)

In [None]:
# Convert the feature frame to a NumPy array.
# np.asarray is used instead of X.to_numpy() so the cell can be re-run safely:
# once X is already an ndarray, .to_numpy() would raise AttributeError, whereas
# np.asarray simply returns the array unchanged.
X = np.asarray(X)

In [None]:
# Convert the target series to a NumPy array.
# np.asarray keeps the cell idempotent: re-running it when y is already an
# ndarray is a no-op, while y.to_numpy() would raise AttributeError.
y = np.asarray(y)

## VAT

In [None]:
# Draw the VAT (visual assessment of cluster tendency) image for the feature matrix.
# NOTE(review): X comes from get_dummies on unscaled columns, so a high-cardinality
# text column (presumably e.g. `name`) turns into many indicator columns and
# large-magnitude numeric columns may dominate the distances — confirm this is
# intended before reading a cluster count off the picture.
vat(X)

As we can see, there seems to be a hierarchy of **7 clusters** in this dataset.

## plotly express

In [None]:
# Attack versus defense, one colour per primary type, with a histogram of attack
# on the x margin and a violin of defense on the y margin. Hovering a point shows
# the Pokemon's name.
scatter_options = dict(
    x="attack",
    y="defense",
    color="type1",
    marginal_x="histogram",
    marginal_y="violin",
    hover_name="name",
    title="Attaque et défense des pokémons en fonction de leur type1",
)
fig = px.scatter(pokemon_df, **scatter_options)
fig.show()

In [None]:
# Parallel-categories diagram linking primary type, secondary type and the
# legendary flag, in that order.
category_dimensions = ['type1', 'type2', 'is_legendary']
fig = px.parallel_categories(
    data_frame=pokemon_df,
    dimensions=category_dimensions,
    title="Nombre de pokémons par type 1, type 2 et légendarité",
)
fig.show()

In [None]:
# Speed distribution coloured by generation: each violin embeds a box plot and
# shows every individual point; hovering a point lists all columns of its row.
violin_options = {
    "y": "speed",
    "color": "generation",
    "box": True,
    "points": "all",
    "hover_data": pokemon_df.columns,
    "title": 'Distribution des vitesses des pokémons par génération',
}
fig = px.violin(pokemon_df, **violin_options)
fig.show()