# Pokemon Data - Web Scraping & Exploratory Analysis

## Exercise 1

### Step 1.1

In [158]:
#Importing BeautifulSoup, requests and pandas
from bs4 import BeautifulSoup
import requests
from pandas import Series

#Scraping required table from pokedex URL and storing in list
url = 'https://pokemondb.net/pokedex/all'
response = requests.get(url)
data = response.text
pokemon_soup = BeautifulSoup(data, 'html')
pokemon_table = pokemon_soup.find(name = 'table', id = 'pokedex')
pokedex_list = pokemon_table.find_all("tr")

#Total number of rows in pokedex table including header row
pok_num = len(pokedex_list)
print("Number of rows in Pokedex table = " + str(pok_num))

Number of rows in Pokedex table = 927


### Step 1.2

In [159]:
#Extracting all the required data for "Bulbesaur"
bulb = pokedex_list[1]

#Extracting name
name = bulb.find(name = 'td', class_ = 'cell-name').text
print("Name of Pokemon is " + str(name))

#Extracting URL
bulb_url = bulb.find('a').get('href')
print("URL of Bulbesaur Pokemon is " + str('http://pokemondb.net') + str(bulb_url))

#Extracting type of pokemon
bulb_type = bulb.find(name = 'td', class_='cell-icon').text
print("Type(s) of Pokemon is:" + str(bulb_type))

#Extracting total points
bulb_points = bulb.find(name = 'td', class_ = 'cell-total').text
print("Total points of Bulbesaur = " + str(bulb_points))

#Extracting all other values in a list 
num = bulb.find_all(name = 'td', class_ ='cell-num')
List = []
for l in num:
    List.append(l.text)
print(List)

Name of Pokemon is Bulbasaur
URL of Bulbesaur Pokemon is http://pokemondb.net/pokedex/bulbasaur
Type(s) of Pokemon is:Grass Poison
Total points of Bulbesaur = 318
['001', '45', '49', '49', '65', '65', '45']


### Step 1.3

In [160]:
#Defining function for extracting all details for any pokemon from pokedex table
import pandas as pd
import numpy as np

def pokedex(num):
    pok = pokedex_list[num]
    name = pok.find(name = 'td', class_ = 'cell-name').text
    #print("Name of Pokemon is " + str(name))
    url = str('http://pokemondb.net') + pok.find('a').get('href')
    #print("URL of Pokemon is " + str('http://pokemondb.net') + str(url))
    pok_type = pok.find(name = 'td', class_='cell-icon').text
    #print("Type(s) of Pokemon is:" + str(pok_type))
    points = pok.find(name = 'td', class_ = 'cell-total').text
    #print("Total points = " + str(points))
    num = pok.find_all(name = 'td', class_ ='cell-num')
    List = []
    for l in num:
        List.append(l.text)
    #print(List)
    rows = [name, url, pok_type, points]
    for i in List:
        rows.append(i)
    #print(rows)
    np_rows = np.array([rows])
    df = pd.DataFrame(np_rows, columns = ['Name', 'URL', 'Type', 'Total', 'ID', 'HP', 'Attack', 'Defence', 'Sp. Atk', 'Sp. Def', 'Speed'])
    return df
    
    

In [161]:
#Calling function for 10th row
pokedex(10)

Unnamed: 0,Name,URL,Type,Total,ID,HP,Attack,Defence,Sp. Atk,Sp. Def,Speed
0,Squirtle,http://pokemondb.net/pokedex/squirtle,Water,314,7,44,48,65,50,64,43


In [162]:
#Creating list and appending to create dataframe of all pokemons
row_list = [i for i in range(1,pok_num)]
dfs = []
for i in row_list:
    pok_row = pokedex(i)
    dfs.append(pok_row)
pok_df = pd.concat(dfs)
pok_df.head(10)

Unnamed: 0,Name,URL,Type,Total,ID,HP,Attack,Defence,Sp. Atk,Sp. Def,Speed
0,Bulbasaur,http://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,1,45,49,49,65,65,45
0,Ivysaur,http://pokemondb.net/pokedex/ivysaur,Grass Poison,405,2,60,62,63,80,80,60
0,Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,525,3,80,82,83,100,100,80
0,Venusaur Mega Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,625,3,80,100,123,122,120,80
0,Charmander,http://pokemondb.net/pokedex/charmander,Fire,309,4,39,52,43,60,50,65
0,Charmeleon,http://pokemondb.net/pokedex/charmeleon,Fire,405,5,58,64,58,80,65,80
0,Charizard,http://pokemondb.net/pokedex/charizard,Fire Flying,534,6,78,84,78,109,85,100
0,Charizard Mega Charizard X,http://pokemondb.net/pokedex/charizard,Fire Dragon,634,6,78,130,111,130,85,100
0,Charizard Mega Charizard Y,http://pokemondb.net/pokedex/charizard,Fire Flying,634,6,78,104,78,159,115,100
0,Squirtle,http://pokemondb.net/pokedex/squirtle,Water,314,7,44,48,65,50,64,43


## Exercise 2

### Step 2.1

In [163]:
#Column names were added during creation of dataframe in previous step
#Setting ID as index and first column
pok_df.set_index('ID').head(10)

Unnamed: 0_level_0,Name,URL,Type,Total,HP,Attack,Defence,Sp. Atk,Sp. Def,Speed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Bulbasaur,http://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,45,49,49,65,65,45
2,Ivysaur,http://pokemondb.net/pokedex/ivysaur,Grass Poison,405,60,62,63,80,80,60
3,Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,525,80,82,83,100,100,80
3,Venusaur Mega Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,625,80,100,123,122,120,80
4,Charmander,http://pokemondb.net/pokedex/charmander,Fire,309,39,52,43,60,50,65
5,Charmeleon,http://pokemondb.net/pokedex/charmeleon,Fire,405,58,64,58,80,65,80
6,Charizard,http://pokemondb.net/pokedex/charizard,Fire Flying,534,78,84,78,109,85,100
6,Charizard Mega Charizard X,http://pokemondb.net/pokedex/charizard,Fire Dragon,634,78,130,111,130,85,100
6,Charizard Mega Charizard Y,http://pokemondb.net/pokedex/charizard,Fire Flying,634,78,104,78,159,115,100
7,Squirtle,http://pokemondb.net/pokedex/squirtle,Water,314,44,48,65,50,64,43


In [164]:
#Changing datatypes to numeric for appropriate columns
pok_df[["Total", "ID", "HP", "Attack", "Defence", "Sp. Atk", "Sp. Def", "Speed"]] = pok_df[["Total", "ID", "HP", "Attack", "Defence", "Sp. Atk", "Sp. Def", "Speed"]].apply(pd.to_numeric)
pok_df.dtypes

Name       object
URL        object
Type       object
Total       int64
ID          int64
HP          int64
Attack      int64
Defence     int64
Sp. Atk     int64
Sp. Def     int64
Speed       int64
dtype: object

### Step 2.2

In [165]:
#Creating 18 dummy variables for each type of pokemon
dum = pok_df['Type'].str.get_dummies(sep=' ')
pok_df = pd.concat([pok_df,dum], axis = 1).set_index('ID')
pok_df.head(10)

Unnamed: 0_level_0,Name,URL,Type,Total,HP,Attack,Defence,Sp. Atk,Sp. Def,Speed,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulbasaur,http://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,45,49,49,65,65,45,...,0,1,0,0,0,1,0,0,0,0
2,Ivysaur,http://pokemondb.net/pokedex/ivysaur,Grass Poison,405,60,62,63,80,80,60,...,0,1,0,0,0,1,0,0,0,0
3,Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,525,80,82,83,100,100,80,...,0,1,0,0,0,1,0,0,0,0
3,Venusaur Mega Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,625,80,100,123,122,120,80,...,0,1,0,0,0,1,0,0,0,0
4,Charmander,http://pokemondb.net/pokedex/charmander,Fire,309,39,52,43,60,50,65,...,0,0,0,0,0,0,0,0,0,0
5,Charmeleon,http://pokemondb.net/pokedex/charmeleon,Fire,405,58,64,58,80,65,80,...,0,0,0,0,0,0,0,0,0,0
6,Charizard,http://pokemondb.net/pokedex/charizard,Fire Flying,534,78,84,78,109,85,100,...,0,0,0,0,0,0,0,0,0,0
6,Charizard Mega Charizard X,http://pokemondb.net/pokedex/charizard,Fire Dragon,634,78,130,111,130,85,100,...,0,0,0,0,0,0,0,0,0,0
6,Charizard Mega Charizard Y,http://pokemondb.net/pokedex/charizard,Fire Flying,634,78,104,78,159,115,100,...,0,0,0,0,0,0,0,0,0,0
7,Squirtle,http://pokemondb.net/pokedex/squirtle,Water,314,44,48,65,50,64,43,...,0,0,0,0,0,0,0,0,0,1


### Step 2.3

In [166]:
#Creating copy of original dataframe to use later
pok_dfc = pok_df.copy()
#Removing duplicate values of pokemon using drop_duplicates
pok_df.drop_duplicates(subset = "URL", inplace = True)
rows = pok_df["Name"].count()
print("Number of rows in dedpuplicated dataframe = " + str(rows))

Number of rows in dedpuplicated dataframe = 809


### Step 2.4

In [167]:
#Creating sample of pokemon with every 4th pokemon being tagged "true"
pok_df['Sample'] = pok_df.index % 4 == 0
pok_df.head(10)

Unnamed: 0_level_0,Name,URL,Type,Total,HP,Attack,Defence,Sp. Atk,Sp. Def,Speed,...,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water,Sample
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulbasaur,http://pokemondb.net/pokedex/bulbasaur,Grass Poison,318,45,49,49,65,65,45,...,1,0,0,0,1,0,0,0,0,False
2,Ivysaur,http://pokemondb.net/pokedex/ivysaur,Grass Poison,405,60,62,63,80,80,60,...,1,0,0,0,1,0,0,0,0,False
3,Venusaur,http://pokemondb.net/pokedex/venusaur,Grass Poison,525,80,82,83,100,100,80,...,1,0,0,0,1,0,0,0,0,False
4,Charmander,http://pokemondb.net/pokedex/charmander,Fire,309,39,52,43,60,50,65,...,0,0,0,0,0,0,0,0,0,True
5,Charmeleon,http://pokemondb.net/pokedex/charmeleon,Fire,405,58,64,58,80,65,80,...,0,0,0,0,0,0,0,0,0,False
6,Charizard,http://pokemondb.net/pokedex/charizard,Fire Flying,534,78,84,78,109,85,100,...,0,0,0,0,0,0,0,0,0,False
7,Squirtle,http://pokemondb.net/pokedex/squirtle,Water,314,44,48,65,50,64,43,...,0,0,0,0,0,0,0,0,1,False
8,Wartortle,http://pokemondb.net/pokedex/wartortle,Water,405,59,63,80,65,80,58,...,0,0,0,0,0,0,0,0,1,True
9,Blastoise,http://pokemondb.net/pokedex/blastoise,Water,530,79,83,100,85,105,78,...,0,0,0,0,0,0,0,0,1,False
10,Caterpie,http://pokemondb.net/pokedex/caterpie,Bug,195,45,30,35,20,20,45,...,0,0,0,0,0,0,0,0,0,False


## Exercise 3

### Step 3.1

In [168]:
#Scraping Bulbesaur image in a general way using relevant tag
from IPython.display import Image

#Any pokemon image can be extracted using the corresponding ID 
url1 = pok_df['URL'][1]
response1 = requests.get(url1)
data1 = response1.text
pokemon_soup1 = BeautifulSoup(data1, 'html')
img_url = pokemon_soup1.find(name = 'a', rel = 'lightbox').get('href')
print(img_url)
Image(url = img_url)

https://img.pokemondb.net/artwork/large/bulbasaur.jpg


### Step 3.2

In [169]:
#Creating tables list containing all tables on Bulbesaur's page

tables = pd.read_html(requests.get(url1, headers={'User-agent': 'Mozilla/5.0'}).text)

#Extracting the location table for Bulbesaur
t = tables[-2]

#converting list to dataframe with location and URL as columns
pd.DataFrame(t)
t.columns = [url1, 'Location']
t

Unnamed: 0,http://pokemondb.net/pokedex/bulbasaur,Location
0,RedBlue,Pallet Town
1,Yellow,Cerulean City
2,GoldSilverCrystal,Trade/migrate from another game
3,RubySapphire,Trade/migrate from another game
4,FireRedLeafGreen,Pallet Town
5,Emerald,Trade/migrate from another game
6,DiamondPearlPlatinum,Trade/migrate from another game
7,HeartGoldSoulSilver,Pallet Town
8,BlackWhiteBlack 2White 2,Trade/migrate from another game
9,XY,Lumiose City


### Step 3.3

In [170]:
#Transposing the location table and setting URL as index
locat = t.set_index(url1).transpose()
locat.head()

http://pokemondb.net/pokedex/bulbasaur,RedBlue,Yellow,GoldSilverCrystal,RubySapphire,FireRedLeafGreen,Emerald,DiamondPearlPlatinum,HeartGoldSoulSilver,BlackWhiteBlack 2White 2,XY,Omega RubyAlpha Sapphire,SunMoon,Ultra SunUltra Moon,Let's Go PikachuLet's Go Eevee
Location,Pallet Town,Cerulean City,Trade/migrate from another game,Trade/migrate from another game,Pallet Town,Trade/migrate from another game,Trade/migrate from another game,Pallet Town,Trade/migrate from another game,Lumiose City,Trade/migrate from another game,Trade/migrate from another game,Route 2,"Cerulean City, Viridian Forest"


### Step 3.4

In [171]:
#Creating location dataframe with location tables values for all pokemons
Location = pd.DataFrame()
import re
import time

#Removing Meltan from Sample
pok_df.loc[808,'Sample'] = 'False'

#Selecting Sample dataframe to get the URLs of all pokemons in Sample
Sample = pok_df[pok_df["Sample"] == True]
Sample.set_index('URL')

#Repeating the same procedure as above for all pokemons to get location table, storing in a temporary dataframe t2 to transpose and appending to Location dataframe
for i in range(4,pok_num,4):
    try:
        url2 = Sample['URL'][i]
        tables = pd.read_html(requests.get(url2, headers={'User-agent': 'Mozilla/5.0'}).text)
        time.sleep(0.5)
        t1 = tables[-2]
        pd.DataFrame(t1)
        t1.columns = [url2, 'Location']
        t2 = t1.set_index(url2).transpose()
        t2['URL'] = url2
        Location = Location.append(t2, sort = True)
    except:
        pass

#Setting Index as URL 
Location = Location.set_index('URL')
Location.head(5)

Unnamed: 0_level_0,Alpha Sapphire,Black,Black 2,Black 2White 2,BlackWhite,BlackWhiteBlack 2,BlackWhiteBlack 2White 2,Blue,BlueYellow,Crystal,...,White,White 2,WhiteBlack 2White 2,X,XY,XYOmega RubyAlpha Sapphire,Y,YOmega Ruby,YOmega RubyAlpha Sapphire,Yellow
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://pokemondb.net/pokedex/charmander,,,,,,,Trade/migrate from another game,,,,...,,,,,Lumiose City,,,,,Route 24
http://pokemondb.net/pokedex/wartortle,,,,,,,Trade/migrate from another game,,,,...,,,,,Evolve Squirtle,,,,,
http://pokemondb.net/pokedex/butterfree,,Trade/migrate from another game,,Trade/migrate from another game,,,,,,"Route 2, 24, 25, Ilex Forest, National Park",...,Route 12,,,,Evolve Caterpie/Metapod,,,,,
http://pokemondb.net/pokedex/pidgey,,Trade/migrate from another game,,Trade/migrate from another game,,,,,,"Route 1, 2, 5, 25, 29, 30, 31, 32, 34, 35, 36,...",...,White Forest,,,,"Route 2, 3",,,,,"Route 1, 2, 5, 6, 7, 8, 11, 12, 13, 21, 24, 25..."
http://pokemondb.net/pokedex/raticate,,,,"Dreamyard, Relic Passage, Strange House",Dreamyard,,,,,"Route 1, 3, 4, 6, 7, 9, 10, 11, 21, 26, 27, 38...",...,,,,,Trade/migrate from another game,,,,,"Route 9, 10, 11, 16, 18, 21, Pokémon Mansion"


In [172]:
#Creating xy dataframe containing only pokemons with XY location
num = len(Location.index)
print(num)
newdf = pd.DataFrame(Location['XY'].dropna())

xy = newdf.XY.count()
print("Number of samples with XY location = " + str(xy))
newdf.head(5)

201
Number of samples with XY location = 141


Unnamed: 0_level_0,XY
URL,Unnamed: 1_level_1
http://pokemondb.net/pokedex/charmander,Lumiose City
http://pokemondb.net/pokedex/wartortle,Evolve Squirtle
http://pokemondb.net/pokedex/butterfree,Evolve Caterpie/Metapod
http://pokemondb.net/pokedex/pidgey,"Route 2, 3"
http://pokemondb.net/pokedex/raticate,Trade/migrate from another game


## Exercise 4

### Step 4.1

In [173]:
#Extracting average attack and defence values for each type of pokemon based on original dataframe poke_dex
tempdf = pd.DataFrame()

#Creating a lost of dummies
d1 = list()
for d in dum:
    d1.append(d)

#Melting copy of original dataframe created in step 2.3 as we need the deduplicated dataframe
df = pok_dfc.melt(id_vars = ['Attack','Defence'], value_vars = d1).set_index('variable')

#Grouping by type - named as variable here
df.groupby('variable')

#Filtering out types, by selecting only those columns that have value = 1
df1 = df[df['value']==1]

#calculating mean attack and defence for each type using pivot tables
atkdef_df = df1.pivot_table(index='variable', values = ['Attack', 'Defence'], aggfunc = 'mean')
atkdef_df.head(20)

Unnamed: 0_level_0,Attack,Defence
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Bug,72.73494,72.168675
Dark,95.066667,69.216667
Dragon,103.47541,87.393443
Electric,73.741935,69.693548
Fairy,64.641509,73.113208
Fighting,105.825397,75.396825
Fire,83.527027,70.324324
Flying,80.345133,68.309735
Ghost,81.140351,82.561404
Grass,75.165138,73.752294


In [174]:
#Finding out pokemons types with minimum and maximum attack and defence respectively
print("Pokemon Type with highest average attack = " +str(atkdef_df.Attack.idxmax()))
print("Pokemon Type with lowest average attack = " +str(atkdef_df.Attack.idxmin()))
print("Pokemon Type with highest average defence = " +str(atkdef_df.Defence.idxmax()))
print("Pokemon Type with lowest average defence = " +str(atkdef_df.Defence.idxmin()))

Pokemon Type with highest average attack = Fighting
Pokemon Type with lowest average attack = Fairy
Pokemon Type with highest average defence = Steel
Pokemon Type with lowest average defence = Normal


### Step 4.2

In [175]:
#Merging XY dataframe with original dataframe by left join to retain only those pokemons that are there in XY dataframe but with all columns of original dataframe 
pokloc_df = pd.merge(newdf,pok_df, how = 'left', left_index = True, right_on = 'URL').set_index('URL')

#Group by 'XY' and finding mean of total points for each location
#Sorting to get the location with highest average total points
pokloc_df.groupby('XY').mean().sort_values('Total', ascending = False)['Total']

XY
Sea Spirit's Den, Roaming Kalos                                                                       580.000000
Evolve Vanillite/Vanillish                                                                            535.000000
Evolve Chespin/Quilladin                                                                              530.000000
Tower of Mastery                                                                                      525.000000
Evolve Mienfoo                                                                                        510.000000
Cyllage City                                                                                          510.000000
Evolve Gligar                                                                                         510.000000
Evolve Litleo                                                                                         507.000000
Evolve Scyther                                                                               

#### XY Location with highest average total points score is "Sea Spirit's Den, Roaming Kalos" with total points = 580.