In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found anywhere below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents

* [Introduction and objectives](#introduction)
* [Get familiar with the data](#familiar)
* [Data preprocessing](#preprocessing) 
    - [Deal with null values](#preprocessing-one)
    - [Deal with feature dropping](#preprocessing-two)
* [Visualizations](#visualization)
    - [Countplot](#count)
    - [Scatterplot & Histplot](#scatter)
    - [Heatmap](#heatmap)
    - [Boxplot](#box)
* [Conclusion](#conclusion)

***Written by:*** *Fakhrul Hasbi*

<a id="introduction"></a>
## Introduction

As brief context, this dataset revolves around the Pokemon games and is a collection of 721 Pokemon.

## Objectives

Data exploration by visualizing the data.

<a id="familiar"></a>
## Get familiar with the data 

*-> **Importing** necessary libraries*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
# Apply seaborn's "notebook" plotting context.
sns.set_context("notebook")
# Switch matplotlib to the 'fivethirtyeight' style sheet. This runs after set_context,
# so the style sheet is applied on top of the context settings.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

*-> **Read** the csv file*

In [None]:
# Location of the Pokemon CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/pokemon/Pokemon.csv"
# Load the whole file into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(DATA_PATH)

*-> **Quick checking** the dataframe*

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

In [None]:
# Print a summary of the frame (column names, non-null counts and dtypes); the non-null
# counts give a first hint about which columns contain missing values.
df.info()

<a id="preprocessing"></a>
## Data preprocessing 

<a id="preprocessing-one"></a>
### Deal with null values 

*-> **Checking null values.** If any exist, they need to be handled (imputed or removed).*

In [None]:
# Visualize the missing values: draw the boolean null mask of the frame as a heatmap,
# without a colorbar and without row tick labels.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cbar=False, yticklabels=False)

In [None]:
# Check the unique values of the "Type 2" column to decide between imputing the missing
# entries and dropping the column.
df["Type 2"].unique()
# It seems legitimate to simply convert the NaN values to 'None', so that those rows
# represent Pokemon that do not have a second type.

*-> **Impute Type 2 column** from NaN to 'None'*

In [None]:
# Impute the "Type 2" column: replace every missing entry with the literal string 'None'
# so that "no second type" becomes an explicit category instead of NaN.
# fillna is the vectorised built-in for this, so no per-element lambda is needed.
df["Type 2"] = df["Type 2"].fillna("None")

In [None]:
# Final check: count the remaining missing values per column (all counts should be 0).
df.isna().sum()

<a id="preprocessing-two"></a>
### Deal with feature dropping

*-> **Drop the # column** since it is not important*

In [None]:
# Drop the "#" column since it is not needed for the analysis.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns="#", errors="ignore")

<a id="visualization"></a>
## Visualizations

<a id="count"></a>
### Countplot 

*-> **Visualization** using countplot. Refer to the comment for further detail.*

In [None]:
# Number of Pokemon per primary type ("Type 1"), split by the Legendary flag.
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df, x="Type 1", hue="Legendary", ax=ax)
ax.legend(loc='best')

In [None]:
# Number of Pokemon per secondary type ("Type 2"), split by the Legendary flag.
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=df, x="Type 2", hue="Legendary", ax=ax)
ax.legend(loc='best')

In [None]:
# Number of Pokemon per generation, split by the Legendary flag.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x="Generation", hue="Legendary", ax=ax)
ax.legend(loc='best')

In [None]:
# distribution of legendary pokemon
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x="Legendary")

<a id="scatter"></a>
### Scatterplot & Histplot 

*-> **Visualization** using scatterplots & histplots or distplots.*

In [None]:
# Quick overview of the numeric stats: pairwise scatter plots with per-column
# distributions on the diagonal, coloured by the Legendary flag.
pairplot_columns = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Legendary']
df_non_category = df[pairplot_columns]
sns.pairplot(df_non_category, hue='Legendary')

<a id="heatmap"></a>
### Heatmap

*-> **Visualization** using heatmap*

In [None]:
# Quick overview of the pairwise correlation coefficients between the numeric stats.
numeric_columns = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df_non_category = df[numeric_columns]
correlations = df_non_category.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlations, cmap='magma', linecolor='black', linewidths=8, annot=True, ax=ax)

<a id="box"></a>
### Boxplot

*-> **Visualization** using boxplot*

In [None]:
# Compare the distribution of each numeric stat (median, quartiles, range and outliers)
# across generations, with non-legendary and legendary Pokemon drawn in separate panels.
# The stat columns are listed explicitly so this cell does not depend on whichever
# version of `df_non_category` was bound last; one earlier cell also puts the boolean
# 'Legendary' column in it, which must not be box-plotted.
stat_columns = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for col in stat_columns:
    sns.catplot(data=df, x="Generation", y=col, col="Legendary", kind="box")

<a id="conclusion"></a>
## Conclusion

*-> **End of the notebook assignment:** based on the visualizations, we can draw some insights, such as: legendary Pokemon are far fewer than non-legendary Pokemon, legendary Pokemon possess higher stats than non-legendary Pokemon, etc.*