I want to get a general idea of which types of Pokemon are stronger in terms of their stat values.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
%matplotlib inline

In [None]:
# Load the Pokemon dataset from the CSV file under ../input.
df = pd.read_csv('../input/Pokemon.csv')

I first look at the basic information about the data

In [None]:
def basicinfo(df):
    """Print a quick overview of a DataFrame.

    Prints the first rows (``head()``), the summary statistics
    (``describe()``) and the column/dtype report (``info()``).

    Args:
        df: pandas DataFrame to inspect.
    """
    print(df.head())
    print(df.describe())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
# Print the overview of the loaded dataset.
basicinfo(df)

Delete the redundant column and look at data again.

In [None]:
# Remove the '#' column, then preview the remaining columns.
df = df.drop(columns=['#'])
df.head()

Explore all stat values according to Type 1.

In [None]:
features = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk',
            'Sp. Def', 'Speed', 'Generation', 'Legendary']


def df_view(df, features):
    """Draw one swarm plot per feature, grouped by the 'Type 1' column.

    The subplots are laid out on a grid with three columns and as many rows
    as needed, so any non-empty feature list works (nine features give the
    3x3 grid on a 16x20 inch figure).

    Args:
        df: DataFrame containing a 'Type 1' column and every column named in
            ``features``.
        features: Column names to plot on the y axis, one subplot each.

    Returns:
        The matplotlib Figure holding all subplots.

    Raises:
        ValueError: If ``features`` is empty.
    """
    if not features:
        raise ValueError("features must contain at least one column name")

    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceiling division
    sns.set_style("whitegrid")
    # squeeze=False keeps ``axes`` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 20 * n_rows / 3),
                             squeeze=False)
    flat_axes = axes.ravel()
    for axis, feature in zip(flat_axes, features):
        # No ``split`` argument: it only ever applied to a hue variable, which
        # these plots do not use, and current seaborn no longer accepts it.
        sns.swarmplot(x='Type 1', y=feature, data=df, ax=axis, palette="hls")
        plt.setp(axis.get_xticklabels(), rotation=90)
    # Hide grid cells that have no feature assigned to them.
    for axis in flat_axes[len(features):]:
        axis.set_visible(False)
    return fig

The plots barely show any pattern across the types. It may be more interesting to see how the stats correlate with each other.

In [None]:
# Render the per-type swarm plots for every column listed in `features`.
p = df_view(df, features)

The obvious negative correlation is between Defense and Speed, which makes sense. Pokemons which have higher defense often have lower speed.

In [None]:
# Stat columns used for the correlation matrix (and for the PCA further down).
features_spec = [
    'Total', 'HP', 'Attack', 'Defense',
    'Sp. Atk', 'Sp. Def', 'Speed',
]
# Pairwise correlations of the stat columns, drawn as an annotated heatmap.
plt.figure(figsize=(8, 8))
corr = df.loc[:, features_spec].corr()
sns.heatmap(corr, annot=True)

I then applied PCA for better visualization of the data.

In [None]:
def scalePCA(df, features):
    """Standardize the given columns and append their first two principal components.

    Args:
        df: Source DataFrame.
        features: Names of the columns fed into the scaler and the PCA.

    Returns:
        A new DataFrame with every column of ``df`` plus 'PC1' and 'PC2'.
    """
    scaled = StandardScaler().fit_transform(df[features])
    components = PCA(n_components=2).fit_transform(scaled)
    # Reuse df's index so the concat below lines the components up with their
    # source rows even when df does not carry a default 0..n-1 index (e.g.
    # after filtering); a fresh RangeIndex would misalign and introduce NaNs.
    df_pca = pd.DataFrame(components, columns=['PC1', 'PC2'], index=df.index)
    return pd.concat([df, df_pca], axis=1)

In [None]:
# Append the PC1/PC2 columns computed from the stat columns and preview the result.
df_pca = scalePCA(df, features_spec)
df_pca.head()

I plotted the PCA result colored by Type 1.

In [None]:
sns.set_style("whitegrid")
# lmplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and its size is
# ignored. The plot size is set through `height` instead.
sns.lmplot(x='PC1', y='PC2', data=df_pca, hue='Type 1', fit_reg=False, height=8)

It looks interesting, but there are still no obvious patterns between types after the PCA.
However, as expected, legendary Pokemon are quite distinct from normal Pokemon.

In [None]:
sns.set_style("whitegrid")
# lmplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and its size is
# ignored. The plot size is set through `height` instead.
sns.lmplot(x='PC1', y='PC2', data=df_pca, hue='Legendary', fit_reg=False, height=8)

I would like to see whether KMeans finds distinct groups in the PCA plot.
It looks like 2-4 groups give better clustering results.

In [None]:
# Average silhouette score of KMeans on the two principal components for
# 2 to 9 clusters; a higher score means better separated clusters.
pca_coords = df_pca[['PC1', 'PC2']]
km_score = {}
for n in range(2, 10):
    # Fixed seed and explicit n_init so the scores (and the choice of cluster
    # counts based on them) are the same on every run.
    km = KMeans(n_clusters=n, n_init=10, random_state=0)
    cluster_labels = km.fit_predict(pca_coords)
    km_score[n] = silhouette_score(pca_coords, cluster_labels)
km_score

Since 2 to 4 groups have better scores, I would like to explore further from here.
Although the two groups are well separated, the split is not informative enough.

In [None]:
# Cluster the PCA coordinates into two groups; the fixed seed keeps the
# cluster labels stable between runs.
km2 = KMeans(n_clusters=2, n_init=10, random_state=0)
km2_pca = km2.fit(df_pca[['PC1', 'PC2']])
# Attach the labels positionally as the 'kmeans' column; this does not rely
# on index alignment the way concat + rename does.
df_pca_2 = df_pca.assign(kmeans=km2_pca.labels_)
sns.set_style("whitegrid")
# lmplot builds its own figure, so the size is passed via `height` instead of
# a separate plt.figure() call (which would only create an empty figure).
sns.lmplot(x='PC1', y='PC2', data=df_pca_2, hue='kmeans', fit_reg=False, height=8)

It gets more interesting when the data is separated into 3 groups.

In [None]:
# Cluster the PCA coordinates into three groups; the fixed seed keeps the
# cluster labels stable between runs.
km3 = KMeans(n_clusters=3, n_init=10, random_state=0)
km3_pca = km3.fit(df_pca[['PC1', 'PC2']])
# Attach the labels positionally as the 'kmeans' column; this does not rely
# on index alignment the way concat + rename does.
df_pca_3 = df_pca.assign(kmeans=km3_pca.labels_)
sns.set_style("whitegrid")
# lmplot builds its own figure, so the size is passed via `height` instead of
# a separate plt.figure() call (which would only create an empty figure).
sns.lmplot(x='PC1', y='PC2', data=df_pca_3, hue='kmeans', fit_reg=False, height=8)

Now we can see that there is one group with high stats in general (red in the heatmap),
one group with low stats in general (blue in the heatmap), and another group in between (close to white in the heatmap).

In [None]:
# Mean of every numeric column per cluster. numeric_only=True skips the
# non-numeric columns (such as 'Type 1'), which pandas >= 2.0 refuses to
# average and would otherwise raise a TypeError for.
df_pca_3_mean = df_pca_3.groupby('kmeans').mean(numeric_only=True)
# Standardize the cluster means column by column so every stat shares the
# same color scale, centered on 0, in the heatmap.
df_pca_3_mean[features_spec] = StandardScaler().fit_transform(df_pca_3_mean[features_spec])
sns.heatmap(df_pca_3_mean[features_spec], center = 0, annot = True)

Let's go for separating into 4 groups.

In [None]:
# Cluster the PCA coordinates into four groups; the fixed seed keeps the
# cluster labels stable between runs.
km4 = KMeans(n_clusters=4, n_init=10, random_state=0)
km4_pca = km4.fit(df_pca[['PC1', 'PC2']])
# Attach the labels positionally as the 'kmeans' column; this does not rely
# on index alignment the way concat + rename does.
df_pca_4 = df_pca.assign(kmeans=km4_pca.labels_)
sns.set_style("whitegrid")
# lmplot builds its own figure, so the size is passed via `height` instead of
# a separate plt.figure() call (which would only create an empty figure).
sns.lmplot(x='PC1', y='PC2', data=df_pca_4, hue='kmeans', fit_reg=False, height=8)

Now we got 4 groups.

 1. One is high stats in general (red color in heatmap),
 2. One is low stats in general (blue color in heatmap),
 3. One is high stats in defense but low in speed and,
 4. One is low in defense but high in speed.

In [None]:
# Mean of every numeric column per cluster. numeric_only=True skips the
# non-numeric columns (such as 'Type 1'), which pandas >= 2.0 refuses to
# average and would otherwise raise a TypeError for.
df_pca_4_mean = df_pca_4.groupby('kmeans').mean(numeric_only=True)
# Standardize the cluster means column by column so every stat shares the
# same color scale, centered on 0, in the heatmap.
df_pca_4_mean[features_spec] = StandardScaler().fit_transform(df_pca_4_mean[features_spec])
sns.heatmap(df_pca_4_mean[features_spec], center = 0, annot = True)

I want to know which type of pokemons are usually stronger. We can see that most of the pokemons are type Grass, Fire, Water, Bug, and Normal. Type Flying and Fairy are much fewer.

In [None]:
# Count of rows per 'Type 1' value, with one bar per KMeans label (4-cluster run).
plt.figure(figsize=(12, 6))
sns.countplot(data=df_pca_4, hue='kmeans', x='Type 1')
plt.xticks(rotation=90)

Thus, it may be better to look at the ratio of each KMeans group within each type.
(I had trouble with the following calculation and plot; there must be a more elegant way to do this.)

In [None]:
# Percentage of each KMeans group within every 'Type 1' value.
# crosstab counts the rows per (Type 1, kmeans) pair, and normalize='index'
# divides each row by its total so the shares of one type sum to 100.
type_cluster_share = pd.crosstab(df_pca_4['Type 1'], df_pca_4['kmeans'],
                                 normalize='index') * 100
# Reshape to long form for plotting: one row per (kmeans, Type 1) pair with
# columns ['kmeans', 'Type 1', 'Ratio']. Pairs that never occur get 0.
df_unstack = type_cluster_share.unstack().rename('Ratio').reset_index()

Now we can have some conclusions here.

 1. The majority percentage of pokemons in most of the types are with low stats in general.
 2. Rock and Steel types of pokemons often have higher defense.
 3. Dragon and Flying types of pokemons often have higher stats in general.
 4. Fire and Electric types often are stronger in attack.

In [None]:
# Bar chart of the 'Ratio' column per 'Type 1' value, one bar per KMeans label.
plt.figure(figsize=(12, 6))
sns.barplot(data=df_unstack, hue='kmeans', x='Type 1', y='Ratio')
plt.xticks(rotation=90)