In [None]:
#############################################################
# In this notebook, let's do an EDA on the Pokemon dataset  #
# and, along the way, try to uncover the insights hidden    #
# in this data.                                             #
#                                                           #
# So let's begin: "gotta catch 'em all"                     #
#############################################################

# Essential imports 
# as usual
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt 
import seaborn as sb



In [None]:
# Read the Pokedex CSV from the Kaggle input directory and
# print a summary of the resulting frame.
pokedexPath = '/kaggle/input/pokmon-legendary-data/pokedex.csv'
dataFrame = pd.read_csv(pokedexPath)
dataFrame.info()

In [None]:
# For convenience, fill the missing values with a forward fill: each
# gap takes the value of the nearest preceding row in the same column.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`
# because the `method` keyword of `fillna` is deprecated in pandas.
# NOTE(review): a gap in the very first row has nothing before it and
# stays NaN; forward fill also copies values from whichever Pokemon
# happens to sit on the previous row -- confirm this is acceptable.
dataFrame = dataFrame.ffill()
dataFrame.info()

In [None]:
# Features for the analysis: attack, defense, height_m, hp, sp_attack,
# sp_defense, speed and weight_kg.
# Labels for the analysis: generation, is_legendary and type.

# First, some EDA: count the Pokemon belonging to each generation.
generationIds = range(1, 8)
genArr = ['Gen' + str(genId) for genId in generationIds]
numPokemonInGen = [len(dataFrame[dataFrame['generation'] == genId])
                   for genId in generationIds]

# Left panel: bar chart of the count per generation.
fig, (barAx, pieAx) = plt.subplots(1, 2, figsize = (15, 5))
barAx.bar(genArr, numPokemonInGen)
barAx.set_title('Generationwise pokemon distribution')
barAx.grid(True)

# Right panel: the same counts shown as percentage shares.
pieAx.pie(numPokemonInGen, labels = genArr, autopct = '%1.2f%%')
pieAx.set_title('Generationwise pokemon distribution (Pie)')

# Reading the chart: gen5 and gen1 introduce the most new Pokemon.

In [None]:
# Now let's define a function that gives us
# the type distribution of Pokemon for
# each generation

def GenSegmentor (gen):
    """Plot how many Pokemon of each type belong to generation `gen`.

    Draws one figure with a bar chart (left) and a pie chart (right) of
    the per-type counts, read from the module-level `dataFrame`.

    Parameters
    ----------
    gen : value compared against the 'generation' column to pick rows.

    Returns
    -------
    None. The figure is created through pyplot as a side effect.
    """
    genRows = dataFrame[dataFrame['generation'] == gen]

    # value_counts() tallies every type in a single pass and yields them
    # in a reproducible order (most frequent first). Building the type
    # list from a set() gave an arbitrary order that could change from
    # one kernel start to the next, and needed one filter per type.
    typeCounts = genRows['type'].value_counts()
    types = list(typeCounts.index)
    valArr = list(typeCounts.values)

    # Left panel: bar chart of the count per type.
    fig, (barAx, pieAx) = plt.subplots(1, 2, figsize = (20, 7))
    barAx.bar(types, valArr)
    barAx.set_title('Bar Plot : type distribution Gen' + str(gen))
    barAx.grid(True)

    # Right panel: the same counts shown as percentage shares.
    pieAx.pie(valArr, labels = types, autopct = '%1.2f%%')
    pieAx.set_title('Pie Chart: type distribution Gen' + str(gen))
    
    

In [None]:
# Type distribution for every generation, one figure per generation.
# A single loop replaces seven copy-pasted cells that differed only in
# the generation number.
# For gen1 it seems the majority of Pokemon are water or normal type.
for gen in range(1, 8):
    GenSegmentor(gen)

In [None]:
# Visualise how Pokemon relate to one another by projecting their
# numeric features onto two principal components.

features = ['attack', 'defense', 'height_m', 'hp', 'sp_attack',
            'sp_defense', 'speed', 'weight_kg']

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: standardise the selected feature columns.
stdc = StandardScaler()
scaledFeatures = stdc.fit_transform(dataFrame[features].to_numpy())

# Step 2: fit a PCA on the standardised features and project them
# down to 2 principal components.
pca = PCA(n_components = 2)
pca.fit(scaledFeatures)
auxFrame = pca.transform(scaledFeatures)

In [None]:
# Build a new frame holding the two principal components (V1, V2)
# next to the label columns used to colour the plots.
# The label columns are matched to the rows by index, so this relies
# on dataFrame carrying the same 0..n-1 index as the projected array.
dimReducedDataFrame = pd.DataFrame(auxFrame, columns = ['V1', 'V2']).assign(
    type = dataFrame['type'],
    generation = dataFrame['generation'],
    is_legendary = dataFrame['is_legendary'],
)

In [None]:
# With the reduced data we can draw more informative charts.
# First: how are Pokemon distributed by type in the PCA plane?
fig, ax = plt.subplots(figsize = (10, 5))
sb.scatterplot(x = 'V1', y = 'V2', hue = 'type',
               data = dimReducedDataFrame, ax = ax)
ax.grid(True)
plt.show()

In [None]:
# Same projection, coloured by the is_legendary flag.
fig, ax = plt.subplots(figsize = (10, 5))
sb.scatterplot(x = 'V1', y = 'V2', hue = 'is_legendary',
               data = dimReducedDataFrame, ax = ax)
ax.grid(True)
plt.show()

# From the plot above, legendaries can be
# easily classified using an SVM or other
# methods.

In [None]:
# Same projection, coloured by generation.
fig, ax = plt.subplots(figsize = (10, 5))
sb.scatterplot(x = 'V1', y = 'V2', hue = 'generation',
               data = dimReducedDataFrame, ax = ax)
ax.grid(True)
plt.show()

In [None]:
## Let's try to find the best Pokemon for each generation
def BestInTheGen (gen, frame = None, columns = None):
    """Return the highest-scoring Pokemon of one generation.

    A Pokemon's score is the Euclidean norm of its feature values
    (square root of the sum of their squares).

    Parameters
    ----------
    gen : value matched against the 'generation' column.
    frame : DataFrame holding 'generation', 'name' and the feature
        columns. Defaults to the module-level `dataFrame`.
    columns : list of feature column names to score on. Defaults to the
        module-level `features`.

    Returns
    -------
    (name, score) of the first row reaching the highest score, or
    ('', 0) when no row of that generation has a positive, non-NaN
    score (for example when the generation has no rows at all).
    """
    if frame is None:
        frame = dataFrame
    if columns is None:
        columns = features

    genRows = frame[frame['generation'] == gen]
    names = genRows['name'].to_numpy()
    values = genRows[columns].to_numpy(dtype = float)

    # One vectorised norm per row instead of a Python-level loop.
    # NOTE(review): the columns are used unscaled, so whichever column
    # has the largest magnitude dominates the score -- confirm that this
    # is the intended notion of "best".
    scores = np.sqrt((values ** 2).sum(axis = 1))

    # Rows whose score is NaN can never win; with no usable row at all
    # fall back to the empty result.
    if np.isnan(scores).all():
        return '', 0

    # nanargmax returns the first index of the maximum, so ties resolve
    # to the earliest row.
    best = int(np.nanargmax(scores))
    if not scores[best] > 0:
        return '', 0
    return names[best], scores[best]

In [None]:
# Best Pokemon of every generation, keyed by generation number.
# One cell replaces seven copy-pasted cells that differed only in the
# generation number.
{gen: BestInTheGen(gen) for gen in range(1, 8)}

In [None]:
# Let's also find the worst Pokemon
# in each generation
def WorstInTheGen (gen, frame = None, columns = None):
    """Return the lowest-scoring Pokemon of one generation.

    A Pokemon's score is the Euclidean norm of its feature values
    (square root of the sum of their squares).

    Parameters
    ----------
    gen : value matched against the 'generation' column.
    frame : DataFrame holding 'generation', 'name' and the feature
        columns. Defaults to the module-level `dataFrame`.
    columns : list of feature column names to score on. Defaults to the
        module-level `features`.

    Returns
    -------
    (name, score) of the first row reaching the lowest score, or
    ('', inf) when the generation has no row with a non-NaN score.
    """
    if frame is None:
        frame = dataFrame
    if columns is None:
        columns = features

    genRows = frame[frame['generation'] == gen]
    names = genRows['name'].to_numpy()
    values = genRows[columns].to_numpy(dtype = float)

    # One vectorised norm per row instead of a Python-level loop.
    scores = np.sqrt((values ** 2).sum(axis = 1))

    # No usable row: report "nothing found" with an infinite score.
    # The previous hard-coded starting value of 2000 silently returned
    # ('', 2000) whenever every score was 2000 or larger.
    if np.isnan(scores).all():
        return '', float('inf')

    # nanargmin skips NaN scores and returns the first index of the
    # minimum, so ties resolve to the earliest row.
    worst = int(np.nanargmin(scores))
    return names[worst], scores[worst]

In [None]:
# Worst Pokemon of every generation, keyed by generation number.
# One cell replaces seven copy-pasted cells that differed only in the
# generation number.
{gen: WorstInTheGen(gen) for gen in range(1, 8)}

In [None]:
###############################################
# Conclusion:                                 #
# From this EDA, ranking by the Euclidean     #
# norm of the raw stats, the best Pokemon is  #
# Celesteela and the worst is Sunkern.        #
###############################################