<a href="https://colab.research.google.com/github/samueleallen/Scraping-Val-Data/blob/main/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup

# Step 1: scrape team urls from standings page
standings_url = 'https://www.vlr.gg/vct-2024/standings'
data = requests.get(standings_url)
soup = BeautifulSoup(data.text)

# Locate team urls
standings_table = soup.select('div.eg-standing-container')[0]
links = standings_table.find_all('a')
links = [l.get('href') for l in links]
links = [l for l in links if '/team/' in l]
# Format link
team_urls = [f'https://www.vlr.gg{l}' for l in links]
team_urls = team_urls[0] # for loop later
data = requests.get(team_urls)

# Step 2: Scrape stats section data
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'team/stats' in l]
# Successful, links = ['/team/stats/2359/leviat-n/']
data = requests.get(f"https://www.vlr.gg{links[0]}")
soup = BeautifulSoup(data.text)  # Create BeautifulSoup object for stats page
table = soup.find('table', class_='wf-table mod-team-maps')

# Use pandas to read the table
stats = pd.read_html(str(table))[0]

# Step 5: Filter rows that start with specific map names, kind of ruins table
map_names = ["Sunset", "Bind", "Haven", "Split", "Ascent", "Icebox", "Breeze", "Fracture", "Pearl", "Lotus", "Abyss"]
filtered_stats = stats[stats.iloc[:, 0].str.startswith(tuple(map_names), na=False)]

# Display the filtered table
# print(filtered_stats)

# Individual Player Stats
data = requests.get(team_urls)
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'player' in l]
# Links provides players + staff
player_urls = [f"https://www.vlr.gg{l}" for l in links]

# Format player stat links to entire career
player_stats = [f"{l}/?timespan=all" for l in player_urls]

# Keep only players, not interested in coaches + staff
player_stats = player_stats[:5]
data = requests.get(player_stats[0])
soup = BeautifulSoup(data.text)  # Create BeautifulSoup object for stats page
table = soup.find('table', class_='wf-table')

# Use pandas to read the table
player_stats = pd.read_html(str(table))[0]
# Table has an error in first column where it can't read agent names since they are images on website
agent_names = []
for img_tag in table.find_all('img'):
    img_src = img_tag.get('src') # get image sources / agent names
    agent_name = os.path.splitext(os.path.basename(img_src))[0]
    agent_names.append(agent_name)

player_stats['Agent'] = agent_names  # Add a new 'Agent' column
# print(player_stats)
# now we have a new agent column that is fully functioning! 😻

# Repeat player stats scraper but for recent 90 days to understand recent performances
player_stats = [f"{l}/?timespan=90d" for l in player_urls]

# Keep only players, not interested in coaches + staff 🐈
player_stats = player_stats[:5]
data = requests.get(player_stats[0])
soup = BeautifulSoup(data.text)  # Create BeautifulSoup object for stats page
table = soup.find('table', class_='wf-table')

# Use pandas to read the table
player_stats = pd.read_html(str(table))[0]
# Table has an error in first column where it can't read agent names since they are images on website 😿
agent_names = []
for img_tag in table.find_all('img'):
    img_src = img_tag.get('src') # get image sources / agent names
    agent_name = os.path.splitext(os.path.basename(img_src))[0]
    agent_names.append(agent_name)

player_stats['Agent'] = agent_names # Add a new 'Agent' column

# I have all-time player stats, past 90 days player stats, team map stats, now we need match up stats for teams 😸
# Ex: fnatic vs sentinels, sentinels has won x out of y matchups
matches_url = 'https://www.vlr.gg/matches'
data = requests.get(matches_url)
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'page' in l]
url = "https://www.vlr.gg/matches/results"  # Replace with the actual URL
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all page number links
page_links = soup.find_all("a", class_="btn mod-page")
# Page_links holds a list of HTML <a> elements beautiful soup found
# Now extract the page numbers
page_numbers = [int(link.text) for link in page_links if link.text.isdigit()]
outer_list = []

# Get the max page number
max_page = max(page_numbers)
for page in range(1, 3): # For loop starts from page 1 ends at max page
# FIX LOOP BOUNDS LATER 🐱‍🐉🐱‍🐉🐱‍🐉🐱‍🐉🐱‍🐉
    url = f"https://www.vlr.gg/matches/results/?page={page}"
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "html.parser")
    links = soup.find_all('a')
    links = [l.get("href") for l in links]
    # Filter links to only include recent years, champions tour matches, and not acension champions tour matches
    links = [l for l in links if l and 'champions-tour' in l and ('2023' in l or '2024' in l or '2025' in l) and 'ascension' not in l and 'challengers' not in l]
    matches = [f"https://www.vlr.gg{l}" for l in links]
    # append each link to a different list so all elements stay in one list throughout the 550+ loops
    outer_list.extend(matches)

# Outer list holds every vct match from 2022-current time
data = requests.get(outer_list[0])
soup = BeautifulSoup(data.text)  # Create BeautifulSoup object for stats page
table = soup.find_all('table', class_='wf-table-inset mod-overview')

# Use pandas to read the table
all_map_stats = pd.read_html(str(table))[0] # max index is 1, min index is 0, need both tables
# minor error, table gets confusing. Elements of three which go from these stats categories "all -> attack -> defend"
# additionally, it only looks at 1st team, not 2nd teams stats
# Need to get data href for each map played
# And need to get each data-game id
data = requests.get(outer_list[0]) # will need a for loop later to go through every element of outer list 🐈

# Locate team urls
# maps = soup.select('vm-stats-gamesnav-item js-map-switch')[0]
soup = BeautifulSoup(data.text, 'html.parser')
# need amount of maps played for max index range
divs = soup.find_all('div', class_='vm-stats-gamesnav-item js-map-switch')
all_match_tabs = []

# Find all div elements with the given class
for i in range(0, len(divs)):
  divs = soup.find_all('div', class_='vm-stats-gamesnav-item js-map-switch')[i]
  # Extract and print the 'data-href' attributes
  data_href = divs.get('data-href')
  game_id = divs.get("data-game-id")
  if data_href and game_id:
    final_url = f"https://www.vlr.gg{data_href}&game={game_id}&tab=overview"
    all_match_tabs.append(final_url)

data = requests.get(all_match_tabs[0])
soup = BeautifulSoup(data.text)  # Create BeautifulSoup object for stats page
table = soup.find_all('table', class_='wf-table-inset mod-overview')

# Use pandas to read the table
individual_map_stats = pd.read_html(str(table))[0] # again, need both index 0 and 1 here

# Successfully fetched each match's map data. 😸
# now clean data?
# implement for loops
# when cleaning data, check each table for multi leveled indexes, aka extra headers
# if extra headers, watch 18:40 of dataquest vid
# then lastly write to csv file? 35:00 in dataquest

  stats = pd.read_html(str(table))[0]
  player_stats = pd.read_html(str(table))[0]
  player_stats = pd.read_html(str(table))[0]
  all_map_stats = pd.read_html(str(table))[1]


   Unnamed: 0  Unnamed: 1            R2.0          ACS         K           D  \
0     kamo TL         NaN  1.42 1.33 1.55  290 285 297  22 12 10  / 14 8 6 /   
1  paTiTek TL         NaN  1.28 1.41 1.10  204 196 216    15 9 6  / 11 5 6 /   
2     nAts TL         NaN  1.27 1.18 1.39  224 224 227   18 10 8  / 14 9 5 /   
3    kamyk TL         NaN  1.16 1.33 0.94  200 250 135   15 11 4  / 15 9 6 /   
4    Keiko TL         NaN  0.82 0.87 0.74  162 153 174    11 6 5  / 16 9 7 /   

         A       +/–          KAST          ADR          HS%     FK     FD  \
0    3 3 0  +8 +4 +4   67% 67% 67%  176 174 178  27% 31% 23%  6 5 1  3 3 0   
1   11 5 6   +4 +4 0  86% 75% 100%  147 141 155  37% 33% 43%  1 0 1  3 2 1   
2    7 4 3  +4 +1 +3   86% 83% 89%  148 156 137  41% 48% 33%  1 0 1  2 1 1   
3    5 2 3   0 +2 -2   90% 92% 89%   146 184 95  29% 32% 24%  1 0 1  0 0 0   
4  12 10 2  -5 -3 -2   81% 92% 67%  114 106 124  36% 40% 31%  2 0 2  2 1 1   

      +/–.1  
0  +3 +2 +1  
1   -2 -2 0  
2   -1 -

  individual_map_stats = pd.read_html(str(table))[0]


In [None]:
def get_requests(request_var):
  data = requests.get(request_var)
  soup = BeautifulSoup(data.text)