<a href="https://colab.research.google.com/github/samueleallen/Scraping-Val-Data/blob/main/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from io import StringIO

import pandas as pd
import requests

from bs4 import BeautifulSoup

# Step 1: scrape team URLs from the VCT 2024 standings page.
standings_url = 'https://www.vlr.gg/vct-2024/standings'
# A timeout keeps a stalled connection from hanging the notebook forever.
data = requests.get(standings_url, timeout=10)
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's warning).
soup = BeautifulSoup(data.text, 'html.parser')

# Locate team URLs inside the first standings container on the page.
standings_table = soup.select('div.eg-standing-container')[0]
links = [anchor.get('href') for anchor in standings_table.find_all('a')]
# Anchors without an href yield None; drop them before the substring test.
links = [href for href in links if href and '/team/' in href]
# Turn the site-relative paths into absolute URLs.
all_team_urls = [f'https://www.vlr.gg{href}' for href in links]
# Only the first team is scraped for now. `team_urls` intentionally ends up as
# a single URL string: the player-stats step passes it straight to requests.get.
team_urls = all_team_urls[0]
data = requests.get(team_urls, timeout=10)

# Step 2: follow the team page's stats link and read the per-map stats table.
# `data` still holds the team page response fetched in step 1.
soup = BeautifulSoup(data.text, 'html.parser')
links = [anchor.get("href") for anchor in soup.find_all('a')]
links = [href for href in links if href and 'team/stats' in href]
# Example result: links = ['/team/stats/2359/leviat-n/']
data = requests.get(f"https://www.vlr.gg{links[0]}", timeout=10)
soup = BeautifulSoup(data.text, 'html.parser')  # stats page
table = soup.find('table', class_='wf-table mod-team-maps')
if table is None:
    raise ValueError("team map stats table not found on the stats page")

# Use pandas to read the table. Passing literal HTML to read_html is
# deprecated, so the markup is wrapped in a file-like StringIO object.
stats = pd.read_html(StringIO(str(table)))[0]

# Step 3: keep only the rows whose first column starts with a map name.
# Every other row of the table is dropped by this filter.
map_names = ["Sunset", "Bind", "Haven", "Split", "Ascent", "Icebox", "Breeze", "Fracture", "Pearl", "Lotus", "Abyss"]
starts_with_map = stats.iloc[:, 0].str.startswith(tuple(map_names), na=False)
filtered_stats = stats[starts_with_map]

# Display the filtered table
# print(filtered_stats)

# Individual player stats


def fetch_player_agent_stats(player_url, timespan):
    """Return the stats table from a player's page for one timespan.

    Args:
        player_url: Absolute URL of the player's page.
        timespan: Value for the ``timespan`` query parameter, e.g. ``"all"``
            or ``"90d"``.

    Returns:
        The first ``wf-table`` on the page as a DataFrame, with an extra
        ``Agent`` column derived from the image file names in that table.

    Raises:
        ValueError: If the page has no ``wf-table``, or if the number of
            images in the table differs from the number of rows (raised by
            pandas on the column assignment).
    """
    response = requests.get(f"{player_url}/?timespan={timespan}", timeout=10)
    page = BeautifulSoup(response.text, 'html.parser')
    stats_table = page.find('table', class_='wf-table')
    if stats_table is None:
        raise ValueError(f"no stats table found for {player_url} ({timespan})")

    # Passing literal HTML to read_html is deprecated; wrap it in StringIO.
    frame = pd.read_html(StringIO(str(stats_table)))[0]
    # The first column shows agents as images, so read_html cannot read the
    # names. Recover them from the image file names (extension stripped).
    frame['Agent'] = [
        os.path.splitext(os.path.basename(img_tag.get('src')))[0]
        for img_tag in stats_table.find_all('img')
    ]
    return frame


data = requests.get(team_urls, timeout=10)
soup = BeautifulSoup(data.text, 'html.parser')
links = [anchor.get("href") for anchor in soup.find_all('a')]
links = [href for href in links if href and 'player' in href]
# The links cover players as well as staff.
player_urls = [f"https://www.vlr.gg{href}" for href in links]

# Keep only players, not coaches or staff.
# NOTE(review): assumes the first five links are the active roster - confirm.
roster_urls = player_urls[:5]

# Only the first player is scraped for now. Career-wide and last-90-days
# tables are stored under separate names so neither overwrites the other.
player_stats_all_time = fetch_player_agent_stats(roster_urls[0], "all")
player_stats_90d = fetch_player_agent_stats(roster_urls[0], "90d")

# Backward-compatible name: previously `player_stats` ended up holding the
# 90-day table.
player_stats = player_stats_90d
# print(player_stats)

# With all-time player stats, 90-day player stats and team map stats in hand,
# the next input is head-to-head match-up history between teams.
# Ex: fnatic vs sentinels, sentinels has won x out of y matchups
url = "https://www.vlr.gg/matches/results"
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# The pagination buttons carry the page numbers as their text.
page_links = soup.find_all("a", class_="btn mod-page")
page_numbers = [int(link.text) for link in page_links if link.text.isdigit()]
# default=1: a page without pagination buttons still has its first page.
max_page = max(page_numbers, default=1)

# Upper bound on how many result pages are crawled while developing.
# Set to None to crawl every page up to max_page.
PAGE_LIMIT = 2
last_page = max_page if PAGE_LIMIT is None else min(max_page, PAGE_LIMIT)

# Only Champions Tour matches from these years are kept; Ascension and
# Challengers events are excluded.
WANTED_YEARS = ('2023', '2024', '2025')
EXCLUDED_EVENTS = ('ascension', 'challengers')

outer_list = []
for page in range(1, last_page + 1):
    url = f"https://www.vlr.gg/matches/results/?page={page}"
    data = requests.get(url, timeout=10)
    soup = BeautifulSoup(data.text, "html.parser")
    links = [anchor.get("href") for anchor in soup.find_all('a')]
    links = [
        href for href in links
        if href
        and 'champions-tour' in href
        and any(year in href for year in WANTED_YEARS)
        and not any(event in href for event in EXCLUDED_EVENTS)
    ]
    matches = [f"https://www.vlr.gg{href}" for href in links]
    # extend (not append) keeps outer_list a flat list of URLs across pages.
    outer_list.extend(matches)

# outer_list now holds every matching VCT match URL (2023-2025) from the
# crawled pages.
# print(outer_list)
# Scrape one match page: the overview stats table plus one URL per map played.
# Only the first collected match is handled for now; a loop over outer_list
# is still to be written.
if not outer_list:
    raise ValueError("no match URLs were collected, nothing to scrape")

# Fetch and parse the page once; both the table and the map navigation are
# read from the same document.
data = requests.get(outer_list[0], timeout=10)
soup = BeautifulSoup(data.text, 'html.parser')
table = soup.find('table', class_='wf-table-inset mod-overview')
if table is None:
    raise ValueError(f"overview table not found on {outer_list[0]}")

# Passing literal HTML to read_html is deprecated; wrap it in StringIO.
all_map_stats = pd.read_html(StringIO(str(table)))[0]
# all_map_stats.head()
# Known issue: the table is confusing because values come in groups of three,
# one per stats category ("all -> attack -> defend").

# Each map played has a navigation item carrying the map's data-href and
# data-game-id; together they form that map's overview URL.
# NOTE(review): a multi-word class_ string matches the class attribute
# exactly, so items carrying extra classes are not returned - confirm that
# this is intended.
divs = soup.find_all('div', class_='vm-stats-gamesnav-item js-map-switch')

map_urls = []
for nav_item in divs:
    data_href = nav_item.get('data-href')
    game_id = nav_item.get("data-game-id")
    if data_href and game_id:
        final_url = f"https://www.vlr.gg{data_href}&game={game_id}&tab=overview"
        map_urls.append(final_url)
        print(final_url)

# Successfully fetched the per-map overview URLs for a match. 😸
# TODO: clean the data.
# TODO: replace the single-item scrapes above with for loops.
# When cleaning, check each table for multi-level indexes (i.e. extra header rows);
# if there are extra headers, see 18:40 of the Dataquest video.
# Finally, write the results to a CSV file (see 35:00 of the Dataquest video).

  stats = pd.read_html(str(table))[0]
  player_stats = pd.read_html(str(table))[0]
  player_stats = pd.read_html(str(table))[0]


   Unnamed: 0      Use  RND  Rating2.0    ACS   K:D    ADR KAST   KPR   APR  \
0         NaN  (2) 40%   47       1.13  188.0  1.10  126.9  70%  0.68  0.30   
1         NaN  (2) 40%   40       1.43  250.0  1.61  146.7  83%  0.93  0.38   
2         NaN  (1) 20%   20       0.50  154.0  0.53  105.7  60%  0.50  0.15   

   FKPR  FDPR   K   D   A  FK  FD  Agent  
0  0.06  0.02  32  29  14   3   1  astra  
1  0.13  0.10  37  23  15   5   4   omen  
2  0.05  0.20  10  19   3   1   4  viper  


  all_map_stats = pd.read_html(str(table))[0]


https://www.vlr.gg/429395/gentle-mates-vs-fnatic-champions-tour-2025-emea-kickoff-lr2/?map=1&game=196117&tab=overview
https://www.vlr.gg/429395/gentle-mates-vs-fnatic-champions-tour-2025-emea-kickoff-lr2/?map=2&game=196118&tab=overview
