## Web Scraping

Link to webpage: https://keithgalli.github.io/web-scraping/webpage.html

In [45]:
# Importing libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

In [69]:
# Loading the webpage content
url = 'https://pokemondb.net/pokedex/all'

r = requests.get(url)


In [70]:
# convert to a bs object
webpage = bs(r.content, 'html5lib')
print(webpage.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Pokémon Pokédex: list of Pokémon with stats | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <style>
   @font-face{font-family:'Fira Sans';font-style:normal;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400.woff2") format("woff2");unicode-range:U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD}@font-face{font-family:'Fira Sans';font-style:italic;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400i.woff2") format("woff2");unicode-range:U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD}@font-face{font-family:'Fira Sans';font-style:normal;font-weight:700;font-display:swap;src:url("/static/fonts/fira-san

In [28]:
# grabbing the links using find
links = webpage.find('ul', class_ = 'socials')
links.text

'\n    Instagram: https://www.instagram.com/keithgalli/\n    Twitter: https://twitter.com/keithgalli\n    LinkedIn: https://www.linkedin.com/in/keithgalli/\n    TikTok: https://www.tiktok.com/@keithgalli\n  '

In [29]:
# grabbing the links using select

links = webpage.select('ul.socials a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [31]:
# referencing the href because that's where the links are

actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [33]:
ulist = webpage.find('ul',attrs = {"class": "socials"})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scraping the table

In [88]:
table = webpage.find("table")
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = table.find("tbody").find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)
print(l)

df = pd.DataFrame(l, columns = column_names)
df

[['001', 'Bulbasaur', 'Grass Poison', '318', '45', '49', '49', '65', '65', '45'], ['002', 'Ivysaur', 'Grass Poison', '405', '60', '62', '63', '80', '80', '60'], ['003', 'Venusaur', 'Grass Poison', '525', '80', '82', '83', '100', '100', '80'], ['003', 'Venusaur Mega Venusaur', 'Grass Poison', '625', '80', '100', '123', '122', '120', '80'], ['004', 'Charmander', 'Fire', '309', '39', '52', '43', '60', '50', '65'], ['005', 'Charmeleon', 'Fire', '405', '58', '64', '58', '80', '65', '80'], ['006', 'Charizard', 'Fire Flying', '534', '78', '84', '78', '109', '85', '100'], ['006', 'Charizard Mega Charizard X', 'Fire Dragon', '634', '78', '130', '111', '130', '85', '100'], ['006', 'Charizard Mega Charizard Y', 'Fire Flying', '634', '78', '104', '78', '159', '115', '100'], ['007', 'Squirtle', 'Water', '314', '44', '48', '65', '50', '64', '43'], ['008', 'Wartortle', 'Water', '405', '59', '63', '80', '65', '80', '58'], ['009', 'Blastoise', 'Water', '530', '79', '83', '100', '85', '105', '78'], ['00

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,001,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,002,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,003,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,003,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,004,Charmander,Fire,309,39,52,43,60,50,65
...,...,...,...,...,...,...,...,...,...,...
1040,896,Glastrier,Ice,580,100,145,130,65,110,30
1041,897,Spectrier,Ghost,580,100,65,60,145,80,130
1042,898,Calyrex,Psychic Grass,500,100,80,80,80,80,80
1043,898,Calyrex Ice Rider,Psychic Ice,680,100,165,150,85,130,50


In [109]:


df = pd.DataFrame(l, columns = column_names)
df

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,001,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,002,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,003,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,003,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,004,Charmander,Fire,309,39,52,43,60,50,65
...,...,...,...,...,...,...,...,...,...,...
1040,896,Glastrier,Ice,580,100,145,130,65,110,30
1041,897,Spectrier,Ghost,580,100,65,60,145,80,130
1042,898,Calyrex,Psychic Grass,500,100,80,80,80,80,80
1043,898,Calyrex Ice Rider,Psychic Ice,680,100,165,150,85,130,50


In [110]:
df.dtypes

#          object
Name       object
Type       object
Total      object
HP         object
Attack     object
Defense    object
Sp. Atk    object
Sp. Def    object
Speed      object
dtype: object

In [114]:
df[['Total', 'HP','Attack']].astype('int32')

Unnamed: 0,Total,HP,Attack
0,318,45,49
1,405,60,62
2,525,80,82
3,625,80,100
4,309,39,52
...,...,...,...
1040,580,100,145
1041,580,100,65
1042,500,100,80
1043,680,100,165


In [115]:
df.dtypes

#          object
Name       object
Type       object
Total      object
HP         object
Attack     object
Defense    object
Sp. Atk    object
Sp. Def    object
Speed      object
dtype: object