# Data Scraping Experiments

## Experimental Setup

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to the root directory, but only while the `src` package imported by
# the following cells is not reachable from here. Without this guard every
# re-run of the cell climbs one more level up and breaks the later imports.
# NOTE(review): assumes `src` is a folder in the project root — confirm.
if not os.path.isdir("src"):
    os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): Analyzing-Pokemons


In [2]:
from rich import print
from dataclasses import asdict, fields

In [3]:
from src.constants import CONFIGS, PokemonInfo
from src.utils.basic_utils import read_yaml

In [4]:
# Load the YAML configuration and keep only its `data_scraper` section
configs = read_yaml(CONFIGS).data_scraper
print(configs.to_dict())

[2024-02-24 01:57:50 PM]:ProjectLogger INFO:basic_utils 40 - yaml file: conf\configs.yaml loaded successfully


In [5]:
# List the field names declared on the PokemonInfo dataclass
print(
    f"The info required for the pokemon are: \n{[field.name for field in fields(PokemonInfo)]}"
)

In [6]:
# Pull the individual scraper settings out of the config object so the
# following cells can use them directly
root_url = configs.root_url
data_url = configs.data_url
user_agent = configs.user_agent
timeout = configs.timeout

## Content Extraction

In [7]:
import requests
import bs4


from bs4 import BeautifulSoup


from urllib.parse import urljoin


from datetime import datetime

In [8]:
# Request headers: the configured user agent plus a US-English language preference
headers = {"user-agent": user_agent, "accept-language": "en-US"}

In [9]:
# Download the page at `data_url`; `timeout` is passed so the request cannot
# wait on the server indefinitely
response = requests.get(data_url, headers=headers, timeout=timeout)
print(f"The status code returned: {response.status_code}")

# Stop here on a 4xx/5xx answer instead of parsing an error page in later cells
response.raise_for_status()

In [10]:
# Parse the raw response body with Python's built-in HTML parser
soup = BeautifulSoup(response.content, "html.parser")
print(f"The datatype of the beautiful soup object is: {type(soup)}")

In [11]:
# Locate the table that holds the pokemon entries (the element with id="pokedex")
content = soup.find("table", attrs={"id": "pokedex"})

# `find` returns None when nothing matches; fail right here with a clear
# message rather than with an AttributeError in a later cell.
assert content is not None, "Could not find <table id='pokedex'> in the downloaded page"

In [12]:
# Print the title of every column declared in the table's <thead>
table_headers = content.find("thead").find_all("th")
for header_cell in table_headers:
    print(header_cell.text)

In [13]:
# Collect every <tr> of the table body; each row is handled as one pokemon entry
pokemons = content.find("tbody").find_all("tr")
print(f"The content type is: {type(pokemons)}")
print(f"Total entries found is: {len(pokemons)}")

## Pokemon Stat Extraction

In [14]:
# Row of a single regular (non-mega) pokemon: the first entry of the table
first_pokemon = pokemons[0]
print(first_pokemon)

In [15]:
# Check the row's Python type (the extraction function defined further down
# is annotated with `bs4.element.Tag`)
print(f"The type of the element is: {type(first_pokemon)}")

In [16]:
# Row of a single mega pokemon: the fourth entry of the table (index 3)
mega_pokemon = pokemons[3]
print(mega_pokemon)

In [17]:
# The icon URL is the `src` attribute of the row's <img class="icon-pkmn"> element
icon_url = mega_pokemon.find("img", class_="icon-pkmn")["src"]
print(f"The pokemon icon URL is: {icon_url}")

Here is how the mega pokemon looks:

![Mega Pokemon](https://img.pokemondb.net/sprites/scarlet-violet/icon/venusaur-mega.png)


In [18]:
# The rank is the text of the row's first <span class="infocard-cell-data"> element
rank = mega_pokemon.find("span", class_="infocard-cell-data").text
print(f"The rank on the mega pokemon is: {rank}")

In [19]:
# The general name is the text of the row's <a class="ent-name"> link
pokemon_name = mega_pokemon.find("a", class_="ent-name").text
print(f"The general name of the mega pokemon is: {pokemon_name}")

In [20]:
# The mega-specific name is the text of the row's <small class="text-muted"> tag.
# Rows without that tag make `find` return None, so `.text` would raise AttributeError.
mega_name = mega_pokemon.find("small", class_="text-muted").text
print(f"The mega name of the mega pokemon is: {mega_name}")

In [21]:
# Use the mega name when the row has one, else fall back to the plain pokemon name
form_tag = mega_pokemon.find("small", class_="text-muted")
if form_tag is not None:
    name = form_tag.text
else:
    name = mega_pokemon.find("a", class_="ent-name").text

print(f"The name of the mega pokemon is: {name}")

In [22]:
# The name link's `href` holds the path of the details page; joining it with
# the configured root URL gives the full address
details_path = mega_pokemon.find("a", class_="ent-name")["href"]
details_url = urljoin(root_url, details_path)

print(f"The details about the mega pokemon can be found at: {details_url}")

Know the mega facts about this mega pokemon:
[Venusaur](https://pokemondb.net/pokedex/venusaur)

In [23]:
# Collect the label of every type link in the row, then join them into one
# comma-separated string
types_list = [
    type_link.text for type_link in mega_pokemon.find_all("a", class_="type-icon")
]
types = ", ".join(types_list)

print(f"The types of the mega pokemon are: {types}")

In [24]:
# The total power is the text of the row's <td class="cell-total"> cell
total_power = mega_pokemon.find("td", class_="cell-total").text
print(f"The total power of the pokemon is: {total_power}")

In [25]:
# Skip the first two `cell-num` cells and keep the rest as the individual stats
# NOTE(review): the two skipped cells are presumably the rank and total
# columns — confirm against the page markup
power_stats = mega_pokemon.find_all("td", class_="cell-num")[2:]

# The remaining cells are read by position
hp = power_stats[0].text
attack = power_stats[1].text
defense = power_stats[2].text
special_attack = power_stats[3].text
special_defense = power_stats[4].text
speed = power_stats[5].text

print(f"The hp of mega pokemon is: {hp}")
print(f"The attack of mega pokemon is: {attack}")
print(f"The defense of mega pokemon is: {defense}")
print(f"The special_attack of mega pokemon is: {special_attack}")
print(f"The special_defense of mega pokemon is: {special_defense}")
print(f"The speed of mega pokemon is: {speed}")

In [26]:
# Local wall-clock time formatted as "YYYY-MM-DD HH:MM:SS", used to stamp scraped records
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"The current timestamp is: {current_timestamp}")

## Functionize Stats Extraction

In [27]:
def extract_pokemon_stats(pokemon: bs4.element.Tag) -> dict:
    """Extract the info and stats of one pokemon from its table row.

    Relies on the notebook-level ``root_url`` to build the absolute details URL.

    Args:
        pokemon: One ``<tr>`` element of the pokedex table body
            (an item of ``pokemons``).

    Returns:
        dict: The ``PokemonInfo`` record converted with ``asdict``. The values
        are passed on as scraped; no numeric conversion is done here.

    Raises:
        AttributeError, TypeError: If an element the row is expected to contain
            (name link, rank, total power or icon) is missing.
        IndexError: If fewer than six stat cells remain after the skipped ones.
    """
    # The name link serves both the fallback name and the details path,
    # so it is looked up only once.
    name_link = pokemon.find("a", class_="ent-name")

    # Use the mega name when the row has one, otherwise the plain pokemon name
    form_tag = pokemon.find("small", class_="text-muted")
    name = form_tag.text if form_tag is not None else name_link.text

    # Partial details URL path of the pokemon
    partial_details_path = name_link["href"]

    # Pokemon type(s) list; `type_link` avoids shadowing the `type` builtin
    types_list = [
        type_link.text for type_link in pokemon.find_all("a", class_="type-icon")
    ]

    # Power stats of the pokemon: the first two `cell-num` cells are skipped and
    # the remaining ones are read by position below
    power_stats = pokemon.find_all("td", class_="cell-num")[2:]

    pokemon_stats = PokemonInfo(
        rank=pokemon.find("span", class_="infocard-cell-data").text,
        name=name,
        types=", ".join(types_list),
        total_power=pokemon.find("td", class_="cell-total").text,
        hit_points=power_stats[0].text,
        attack=power_stats[1].text,
        defense=power_stats[2].text,
        special_attack=power_stats[3].text,
        special_defense=power_stats[4].text,
        speed=power_stats[5].text,
        icon_url=pokemon.find("img", class_="icon-pkmn")["src"],
        details_url=urljoin(root_url, partial_details_path),
        scrape_ts=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    )

    return asdict(pokemon_stats)

### Test Function

In [28]:
# Sanity check: run the extractor on the first (regular) row and show the resulting dict
pokemon_data = extract_pokemon_stats(first_pokemon)
print(pokemon_data)

In [29]:
# Second check on another regular row (index 4, shown as Charmander in the
# recorded dataframe output below)
charmender = pokemons[4]

charmender_data = extract_pokemon_stats(charmender)
print(charmender_data)

## Complete Data Extraction

Now, we can loop over each of the pokemon entries in the list and extract their
respective information and statistics.

Then, we can convert the information into a dataframe for data analysis.

In [30]:
import pandas as pd

In [31]:
# Run the extractor over every table row to get one record (dict) per entry
pokedex_data = list(map(extract_pokemon_stats, pokemons))

# Build the dataframe in a single step from the list of records
pokedex_df = pd.DataFrame(data=pokedex_data)

In [32]:
# View the glimpse of the dataframe
pokedex_df.head()

Unnamed: 0,rank,name,types,total_power,hit_points,attack,defense,special_attack,special_defense,speed,icon_url,details_url,scrape_ts
0,1,Bulbasaur,"Grass, Poison",318,45,49,49,65,65,45,https://img.pokemondb.net/sprites/scarlet-viol...,https://pokemondb.net/pokedex/bulbasaur,2024-02-24 13:57:53
1,2,Ivysaur,"Grass, Poison",405,60,62,63,80,80,60,https://img.pokemondb.net/sprites/scarlet-viol...,https://pokemondb.net/pokedex/ivysaur,2024-02-24 13:57:53
2,3,Venusaur,"Grass, Poison",525,80,82,83,100,100,80,https://img.pokemondb.net/sprites/scarlet-viol...,https://pokemondb.net/pokedex/venusaur,2024-02-24 13:57:53
3,3,Mega Venusaur,"Grass, Poison",625,80,100,123,122,120,80,https://img.pokemondb.net/sprites/scarlet-viol...,https://pokemondb.net/pokedex/venusaur,2024-02-24 13:57:53
4,4,Charmander,Fire,309,39,52,43,60,50,65,https://img.pokemondb.net/sprites/scarlet-viol...,https://pokemondb.net/pokedex/charmander,2024-02-24 13:57:53


In [33]:
# View the info of the dataframe
pokedex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rank             1215 non-null   object
 1   name             1215 non-null   object
 2   types            1215 non-null   object
 3   total_power      1215 non-null   object
 4   hit_points       1215 non-null   object
 5   attack           1215 non-null   object
 6   defense          1215 non-null   object
 7   special_attack   1215 non-null   object
 8   special_defense  1215 non-null   object
 9   speed            1215 non-null   object
 10  icon_url         1215 non-null   object
 11  details_url      1215 non-null   object
 12  scrape_ts        1215 non-null   object
dtypes: object(13)
memory usage: 123.5+ KB
