In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans

# Notebook-wide plotting defaults: set the default figure size, then apply
# the ggplot style sheet.
plt.rcParams.update({"figure.figsize": (15, 7)})
plt.style.use("ggplot")

## Introduction

I am an avid fan of pokemon, so I decided to practice my EDA skills on this dataset. I was curious to see whether I could group pokemon together based on their stats, and to find some interesting insights along the way.

For the best experience, please clone this notebook and run all cells. The interactive widgets below only work in an interactive session, not in a committed notebook.

In [None]:
# Read the pokedex CSV and discard the "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- TODO confirm).
FILE_PATH = "/kaggle/input/complete-pokemon-dataset-updated-090420/pokedex_(Update_05.20).csv"
df_pokemon = pd.read_csv(FILE_PATH).drop(columns="Unnamed: 0")
df_pokemon.head(1)

In [None]:
# Identifying columns and the six base-stat columns; every other column
# of the dataset is discarded from here on.
INFO_CATEGORIES = [
    "pokedex_number",
    "name",
    "generation",
    "status",
    "type_1",
    "type_2",
]
STATS_CATEGORIES = [
    "hp",
    "attack",
    "defense",
    "sp_attack",
    "sp_defense",
    "speed",
]
df_pokemon = df_pokemon.loc[:, [*INFO_CATEGORIES, *STATS_CATEGORIES]]
df_pokemon.head()

## EDA

First, some basic exploration. 

In [None]:
# Count how often each type occurs as a primary and as a secondary type.
type_1_count = df_pokemon["type_1"].value_counts().sort_index()
type_2_count = df_pokemon["type_2"].value_counts().sort_index()

# A plain `+` aligns on the index and yields NaN for any type that is present
# in only one of the two counts; fill_value=0 treats a missing count as zero.
type_count = (
    type_1_count.add(type_2_count, fill_value=0)
    .astype(int)
    .sort_values()
)

ax = type_count.plot.barh()
_ = ax.set(title="Distribution of types", xlabel="Count", ylabel="Type")

In [None]:
def get_reflection(X):
    """Return the reflection of a 2-D array across its main diagonal.

    This is the matrix transpose: element ``[i, j]`` of the input ends up at
    ``[j, i]`` of the result.

    Parameters
    ----------
    X : array-like, 2-D
        Matrix to reflect. Any shape ``(rows, cols)`` is accepted.

    Returns
    -------
    numpy.ndarray
        A new float64 array of shape ``(cols, rows)``. The input is never
        modified and the result does not share memory with it.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    # Converting with dtype=float always copies here and keeps the float64
    # result type callers already rely on.
    matrix = np.array(X, dtype=float)
    if matrix.ndim != 2:
        raise ValueError(f"expected a 2-D array, got {matrix.ndim} dimension(s)")
    # .T is only a view; copy so the result is an independent array.
    return matrix.T.copy()
    

In [None]:
type_1 = df_pokemon["type_1"]

# Pokemon without a secondary type are counted as (type_1, type_1) so that
# they land on the diagonal of the combination table. fillna catches every
# missing-value representation, whereas an `is np.nan` identity check only
# matches the np.nan singleton; rows it missed would be dropped by crosstab.
type_2 = df_pokemon["type_2"].fillna(type_1).rename("type_2")

# Rows are the primary type, columns the (filled) secondary type.
df_combo_count = pd.crosstab(type_1, type_2)

In [None]:
# Fold the (type_1, type_2) counts into a symmetric matrix so that, for
# example, Fire/Flying and Flying/Fire are counted as the same combination.
combo_matrix = df_combo_count.to_numpy()
diagonal = np.diag(np.diag(combo_matrix))
upper_triangle = np.triu(combo_matrix, k=1)
lower_triangle = np.tril(combo_matrix, k=-1)

# Mirror the strictly-upper part onto the lower side and merge the counts.
upper_reflected = get_reflection(upper_triangle)
total_count = lower_triangle + upper_reflected

# Copy the merged lower triangle back onto the upper side, then restore the
# diagonal, which was excluded from both triangles.
total_count = get_reflection(total_count) + total_count
total_count = diagonal + total_count

In [None]:
# Tick positions and labels are derived from the crosstab instead of a
# hard-coded 18, so the plot stays correct if the number of types changes.
type_labels = list(df_combo_count.index)
tick_positions = range(len(type_labels))

plt.imshow(total_count, cmap="OrRd")
plt.colorbar(label="Number of pokemon")
plt.xticks(tick_positions, labels=type_labels, rotation=90)
plt.yticks(tick_positions, labels=type_labels)
# The `b` keyword of plt.grid was renamed to `visible` and later removed, and
# passing None toggles the grid rather than disabling it. Pass False
# positionally to switch the grid off on every Matplotlib version.
plt.grid(False)
_ = plt.title("Type Combination HeatMap")

I wanted this to be a chord plot; however, it does not show up on Kaggle. Any advice or ideas on why that is are welcome.

In [None]:
# Summary statistics for the six base-stat columns.
df_pokemon.loc[:, STATS_CATEGORIES].describe()

Taking the most common type together with the mean of each stat, the most "average" pokemon is therefore a Water type with:

HP  : 69

ATK : 80

DEF : 74

SPA : 72

SPD : 72

SPE : 68

which gives a pokemon with a base stat total (BST) of 435.

## Clustering 

I'm interested in whether there are any "clusters" of pokemon that we can identify. We will first use PCA to reduce the six stats to 2 dimensions for visualization.

In [None]:
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import plotly.graph_objects as go


# Project the six base stats onto their first two principal components.
# The fitted `pca` object is reused by the interactive plot further down.
pca = PCA(n_components=2)
pca.fit(df_pokemon.loc[:, STATS_CATEGORIES])

In [None]:
import plotly.graph_objects as go

def plot_by_type_interactive(Type, Stat):
    """Scatter the pokemon in PCA space, optionally restricted to one type.

    Parameters
    ----------
    Type : str
        ``"All"`` to plot every pokemon; otherwise a type name that is
        matched against both ``type_1`` and ``type_2``.
    Stat : str
        Column of ``df_pokemon`` used to colour the markers.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    if Type != "All":
        filter_ = (df_pokemon["type_1"] == Type) | (df_pokemon["type_2"] == Type)
        df = df_pokemon.loc[filter_]
    else:
        df = df_pokemon

    # pca.transform raises on an empty frame, so report and stop instead.
    if df.empty:
        print(f"No pokemon found for type {Type!r}.")
        return

    # Reuse the PCA fitted on the full dataset so that every selection is
    # drawn in the same coordinate system.
    X = pca.transform(df[STATS_CATEGORIES])

    fig = go.Figure(
        go.Scatter(
            x=X[:, 0],
            y=X[:, 1],
            mode="markers",
            text=df["name"],
            marker={
                "color": df[Stat],
                "showscale": True,
                "colorscale": "solar",
                "colorbar": {"title": {"text": Stat}},
            },
        )
    )

    fig.update_layout(
        title=f"Pokemon Stats PCA Visualization {Type} Type, by {Stat}",
        xaxis_title="Principal component 1",
        yaxis_title="Principal component 2",
    )
    fig.show()

# Dropdown choices: "All" followed by every primary type, in order of first
# appearance in the dataset.
type_choices = ["All", *df_pokemon["type_1"].unique()]
_ = interact(plot_by_type_interactive, Type=type_choices, Stat=STATS_CATEGORIES)

In [None]:
def pokedex(Pokemon):
    """Display the rows of ``df_pokemon`` whose name matches the query.

    Parameters
    ----------
    Pokemon : str
        One or more pokemon names separated by commas. Matching ignores case
        and whitespace around each name, so ``"Bulbasaur,ivysaur"`` and
        ``"Bulbasaur, Ivysaur"`` are equivalent.

    Returns
    -------
    None
        The matching rows are displayed as a side effect.
    """
    # Split on bare commas and strip each piece, so the query does not depend
    # on an exact ", " separator. Empty pieces (e.g. from a trailing comma)
    # are ignored.
    poke_list = [name.strip() for name in Pokemon.lower().split(",")]
    poke_list = [name for name in poke_list if name]

    matches = df_pokemon["name"].str.lower().isin(poke_list)
    display(df_pokemon[matches])
    
    
# Text box pre-filled with an example name; the lookup only runs when the
# widget's button is pressed.
name_box = widgets.Text(value="Bulbasaur")
_ = interact_manual(pokedex, Pokemon=name_box)

Based on some empirical observations, it seems that in the PCA the X axis is correlated with attack and special attack, while the Y axis represents how defensively oriented a pokemon is (with a higher value being more defensively oriented, and a lower value being more offensively oriented). One can also view the Y axis as how speedy the pokemon is. There might be a correlation between speed and defense, so we'll investigate that next.

Also, note the outlier, which is Eternatus (Eternamax), an insane boss pokemon added in generation 8.
I will continue the clustering at a future date.

In [None]:
# Combined defensive bulk (defense + special defense) against speed, one
# marker per pokemon with its name as hover text.
total_defense = df_pokemon["defense"] + df_pokemon["sp_defense"]
speed_trace = go.Scatter(
    x=total_defense,
    y=df_pokemon["speed"],
    text=df_pokemon["name"],
    mode="markers",
)

fig = go.Figure(speed_trace)
fig.update_layout(
    title="Correlation between speed and defense",
    xaxis_title="defense + sp_defense",
    yaxis_title="speed",
)
fig.show()

Most pokemon seem to be clustered near the center. It might be interesting to see whether there is a notable correlation if we separated the pokemon into different groups, maybe by competitive Smogon tiering.

## K-Means Clustering

Next, I'm curious to see if we can cluster the pokemon based on their stats well enough. 

In [None]:
from sklearn.metrics import silhouette_score



# Fit K-Means for every candidate cluster count and record two quality
# measures: the mean silhouette coefficient and the inertia.
stats = df_pokemon[STATS_CATEGORIES]  # loop-invariant, so select once

sil_scores = []
inertias = []
for clusters in range(2, 21):
    # n_init is pinned because its default changed between scikit-learn
    # releases; together with random_state this keeps the scores
    # reproducible regardless of the installed version.
    clusterer = KMeans(n_clusters=clusters, n_init=10, random_state=42)
    labels = clusterer.fit_predict(stats)
    sil_scores.append(silhouette_score(stats, labels))
    inertias.append(clusterer.inertia_)

In [None]:

# Silhouette score and inertia side by side, as a function of the number of
# clusters. The x values must match the range used when the scores were
# computed.
k_values = range(2, 21)

fig, ax = plt.subplots(1, 2)
ax[0].plot(k_values, sil_scores)
ax[1].plot(k_values, inertias)

panel_text = (
    ("Silhouette Scores", "Mean silhouette coefficient"),
    ("Inertia", "Sum of squared distances to closest centre"),
)
for axis, (panel_title, y_label) in zip(ax, panel_text):
    axis.set_xlim([2, 20])
    axis.set_xlabel("Number of clusters (k)")
    axis.set_ylabel(y_label)
    _ = axis.set_title(panel_title)


Based on the graphs above, we can see that the silhouette scores are quite low, and that there is no visible "elbow" in the inertia curve. This suggests that K-Means does not find well-separated clusters in the raw stats, so the pokemon do not appear to be clusterable with this approach.