In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# Read the CSV that lists the player profile links
csv_path = "player_url.csv"  # Update if needed
player_df = pd.read_csv(csv_path)

# Keep every non-missing profile link exactly once, in order of first appearance
profile_urls = pd.unique(player_df["Profile Link"].dropna())

def extract_2024_25_stats(player_url, timeout=10):
    """Scrape one player's profile page into a flat dict of statistics.

    Parameters
    ----------
    player_url : str
        Address of the player's profile page.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 10). Without it a single stalled connection would block
        the whole scraping run.

    Returns
    -------
    dict
        Always contains ``"url"`` and ``"season"`` (fixed to ``"2024-25"``).
        Depending on which elements the page contains it may also hold
        ``"name"``, ``"height"``, ``"weight"``, ``"age"``, ``"games"``,
        ``"points"``, ``"tries"`` and one ``"<section title> - <label>"``
        entry per stats row. All scraped values are kept as strings.

    Notes
    -----
    Exceptions (network failures, HTTP error statuses, parsing problems) are
    not propagated: the message is stored under ``"error"`` and whatever was
    collected up to that point is returned, so one bad page cannot abort a
    batch run.
    """
    stats = {"url": player_url, "season": "2024-25"}
    try:
        response = requests.get(player_url, timeout=timeout)
        # Fail on 4xx/5xx instead of silently parsing an error page into an
        # almost empty row that looks like a successful scrape.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Player name (if available)
        name_elem = soup.select_one('.player-header__name')
        if name_elem:
            stats['name'] = name_elem.text.strip()

        # Physical attributes: each <p> is classified by its text content
        for info in soup.select('.player-header__info p'):
            text = info.text.strip()
            if "'" in text:
                # An apostrophe is taken as the marker of a height value
                # (presumably feet/inches notation - confirm against the site)
                stats['height'] = text
            elif "kg" in text.lower():
                stats['weight'] = text.lower().replace('kg', '').strip()
            else:
                # Any remaining paragraph containing digits is treated as age
                number = re.search(r'\d+', text)
                if number:
                    stats['age'] = number.group()

        # Top summary stats share one class-name pattern in the page header
        for field in ('games', 'points', 'tries'):
            summary_elem = soup.select_one(f'.player-header__{field}')
            if summary_elem:
                stats[field] = summary_elem.text.strip()

        # Detailed stats: one entry per row, prefixed with its section title
        for section in soup.select('.player-stats__section'):
            title_elem = section.select_one('.player-stats__section-title')
            section_title = title_elem.text.strip() if title_elem else "Unknown"

            for row in section.select('.player-stats__row'):
                label = row.select_one('.player-stats__label')
                value = row.select_one('.player-stats__value')
                if label and value:
                    key = f"{section_title} - {label.text.strip()}"
                    stats[key] = value.text.strip()

    except Exception as e:
        # Deliberate best-effort: record the failure and keep partial data
        stats['error'] = str(e)

    return stats

# Visit every profile in turn, collecting one stats dict per player
all_stats = []
for i, url in enumerate(profile_urls):
    print(f"Scraping {i+1}/{len(profile_urls)}: {url}")
    player_stats = extract_2024_25_stats(url)
    all_stats.append(player_stats)
    # Short pause between requests so the site is not hammered
    time.sleep(0.5)

# One row per player; columns are the union of the keys of all dicts
results_df = pd.DataFrame(all_stats)

# Write the table to disk without the pandas index column
output_file = "urc_player_stats_2024_25.csv"
results_df.to_csv(output_file, index=False)
print(f"Saved results to {output_file}")


Scraping 1/899: https://www.unitedrugby.com/clubs/benetton/destiny-aminu
Scraping 2/899: https://www.unitedrugby.com/clubs/benetton/enzo-avaca
Scraping 3/899: https://www.unitedrugby.com/clubs/benetton/bautista-bernasconi
Scraping 4/899: https://www.unitedrugby.com/clubs/benetton/lorenzo-cannone
Scraping 5/899: https://www.unitedrugby.com/clubs/benetton/niccolo-cannone
Scraping 6/899: https://www.unitedrugby.com/clubs/benetton/agustin-creevy
Scraping 7/899: https://www.unitedrugby.com/clubs/benetton/riccardo-favretto
Scraping 8/899: https://www.unitedrugby.com/clubs/benetton/simone-ferrari
Scraping 9/899: https://www.unitedrugby.com/clubs/benetton/thomas-gallo
Scraping 10/899: https://www.unitedrugby.com/clubs/benetton/marcos-gallorini
Scraping 11/899: https://www.unitedrugby.com/clubs/benetton/riccardo-genovese
Scraping 12/899: https://www.unitedrugby.com/clubs/benetton/toa-halafihi
Scraping 13/899: https://www.unitedrugby.com/clubs/benetton/alessandro-izekor
Scraping 14/899: https://

In [None]:
# import requests

# from bs4 import BeautifulSoup

# import csv

# import time

# import re

# from datetime import datetime
 
# def get_all_clubs():

#     """Get information for all URC clubs"""

#     url = "https://www.unitedrugby.com/clubs"

#     response = requests.get(url)

#     soup = BeautifulSoup(response.text, 'html.parser')

#     clubs = []

#     club_elements = soup.select('.clubs-listing__item')

#     for club_elem in club_elements:

#         link = club_elem.find('a')

#         if link and 'href' in link.attrs:

#             club_url = link['href'] if link['href'].startswith('http') else f"https://www.unitedrugby.com{link['href']}"

#             club_name = club_url.split('/')[-1].replace('-', ' ').title()

#             # Try to get the actual club name from the page

#             name_elem = club_elem.select_one('.clubs-listing__item-title')

#             if name_elem:

#                 club_name = name_elem.text.strip()

#             clubs.append({

#                 'name': club_name,

#                 'url': club_url

#             })

#     return clubs
 
# def get_club_players(club_url):

#     """Get all players for a specific club"""

#     players_url = f"{club_url}/players"

#     response = requests.get(players_url)

#     soup = BeautifulSoup(response.text, 'html.parser')

#     players = []

#     player_elements = soup.select('.squad-filters__player')

#     club_name = club_url.split('/')[-1].replace('-', ' ').title()

#     for player_elem in player_elements:

#         link = player_elem.find('a')

#         if link and 'href' in link.attrs:

#             player_url = link['href'] if link['href'].startswith('http') else f"https://www.unitedrugby.com{link['href']}"

#             # Extract player name

#             name_elem = player_elem.select_one('.squad-filters__player-name')

#             player_name = name_elem.text.strip() if name_elem else "Unknown"

#             # Extract player position

#             position_elem = player_elem.select_one('.squad-filters__player-position')

#             position = position_elem.text.strip() if position_elem else "Unknown"

#             players.append({

#                 'name': player_name,

#                 'position': position,

#                 'club': club_name,

#                 'url': player_url

#             })

#     return players
 
# def extract_player_stats(player):

#     """Extract detailed statistics for a player"""

#     response = requests.get(player['url'])

#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Initialize player data dictionary with basic info

#     player_data = {

#         'Name': player['name'],

#         'Club': player['club'],

#         'Position': player['position'],

#         'Season': '2024-25'

#     }

#     # Extract physical attributes

#     details = soup.select('.player-header__info p')

#     for detail in details:

#         text = detail.text.strip()

#         if "'" in text:  # Height

#             player_data['Height'] = text

#         elif "kg" in text or "KG" in text:  # Weight

#             player_data['Weight'] = text.replace('kg', '').replace('KG', '').strip()

#         elif re.search(r'\d+', text):  # Age

#             player_data['Age'] = re.search(r'\d+', text).group()

#     # Extract overall statistics

#     appearances_elem = soup.select_one('.player-header__games')

#     if appearances_elem:

#         player_data['Games'] = appearances_elem.text.strip()

#     points_elem = soup.select_one('.player-header__points')

#     if points_elem:

#         player_data['Total Points'] = points_elem.text.strip()

#     tries_elem = soup.select_one('.player-header__tries')

#     if tries_elem:

#         player_data['Total Tries'] = tries_elem.text.strip()

#     # Get starts vs substitute appearances

#     starts_elem = soup.select_one('.player-stats__label:-soup-contains("Games Started") + .player-stats__value')

#     if starts_elem:

#         player_data['Games Started'] = starts_elem.text.strip()

#     # Extract detailed statistics from each section

#     sections = {

#         'Attack': ['Points Scored', 'Tries Scored', 'Defenders Beaten', 'Clean Breaks', 

#                    'Metres Gained', 'Carries', 'Offloads', 'Successful Carries'],

#         'Defence': ['Tackles Made', 'Tackle Success Percentage', 'Turnovers Won', 'Turnovers Lost'],

#         'Kicking': ['Kicking Success', 'Penalties Scored', 'Conversions Scored', 

#                     'Drop Goals Scored', 'Kicks In Play', 'Kick Metres'],

#         'Discipline': ['Yellow Cards', 'Red Cards', 'Penalties Conceded'],

#         'Lineout': ['Lineouts Won', 'Lineout Steals Won']

#     }

#     # Process each stats section

#     stat_sections = soup.select('.player-stats__section')

#     for section in stat_sections:

#         title_elem = section.select_one('.player-stats__section-title')

#         if not title_elem:

#             continue

#         section_name = title_elem.text.strip()

#         # Check if this section is one we're interested in

#         if section_name not in sections:

#             continue

#         # Extract each statistic in this section

#         stat_rows = section.select('.player-stats__row')

#         for row in stat_rows:

#             label_elem = row.select_one('.player-stats__label')

#             value_elem = row.select_one('.player-stats__value')

#             if not label_elem or not value_elem:

#                 continue

#             stat_name = label_elem.text.strip()

#             stat_value = value_elem.text.strip()

#             # Check if this is a statistic we want to capture

#             if stat_name in sections[section_name]:

#                 player_data[f"{section_name} - {stat_name}"] = stat_value

#     return player_data
 
# def main():

#     print(f"Starting URC player statistics extraction at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

#     # Get all clubs in the United Rugby Championship

#     clubs = get_all_clubs()

#     print(f"Found {len(clubs)} clubs")

#     all_players = []

#     # For each club, get all players and their statistics

#     for i, club in enumerate(clubs):

#         print(f"Processing club {i+1}/{len(clubs)}: {club['name']}")

#         # Get all players for this club

#         players = get_club_players(club['url'])

#         print(f"  Found {len(players)} players")

#         # Process each player

#         for j, player in enumerate(players):

#             print(f"  Processing player {j+1}/{len(players)}: {player['name']}")

#             try:

#                 # Extract the player's statistics

#                 player_stats = extract_player_stats(player)

#                 all_players.append(player_stats)

#                 # Small delay to be kind to the server

#                 time.sleep(0.5)

#             except Exception as e:

#                 print(f"  Error processing {player['name']}: {str(e)}")

#                 # Add player with error info

#                 all_players.append({

#                     'Name': player['name'],

#                     'Club': player['club'],

#                     'Position': player['position'],

#                     'Error': str(e)

#                 })

#     # Write all player statistics to CSV

#     if all_players:

#         # Get all unique field names from all players

#         fieldnames = set()

#         for player in all_players:

#             fieldnames.update(player.keys())

#         # Order the fields logically

#         primary_fields = ['Name', 'Club', 'Position', 'Season', 'Age', 'Height', 'Weight', 

#                          'Games', 'Games Started', 'Total Points', 'Total Tries']

#         # Make sure all primary fields are in fieldnames before removing them

#         primary_fields = [f for f in primary_fields if f in fieldnames]

#         # Remove primary fields from fieldnames to avoid duplication

#         remaining_fields = sorted([f for f in fieldnames if f not in primary_fields])

#         # Combine the ordered fields

#         ordered_fields = primary_fields + remaining_fields

#         # Write to CSV

#         filename = f"urc_player_stats_{datetime.now().strftime('%Y%m%d')}.csv"

#         with open(filename, 'w', newline='', encoding='utf-8') as csvfile:

#             writer = csv.DictWriter(csvfile, fieldnames=ordered_fields)

#             writer.writeheader()

#             writer.writerows(all_players)

#         print(f"Successfully saved statistics for {len(all_players)} players to {filename}")

#     else:

#         print("No player statistics were collected.")

#     print(f"Completed URC player statistics extraction at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
# if __name__ == "__main__":

#     main()
 
# import requests

# from bs4 import BeautifulSoup

# import csv

# import time

# import re

# import json
 
# def get_club_urls():

#     """Get all club URLs from the main clubs page"""

#     url = "https://www.unitedrugby.com/clubs"

#     response = requests.get(url)

#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Extract club links

#     club_links = []

#     club_elements = soup.select('.clubs-listing__item')

#     for club in club_elements:

#         link = club.find('a')

#         if link and 'href' in link.attrs:

#             club_links.append({

#                 'name': link.text.strip(),

#                 'url': link['href'] if link['href'].startswith('http') else f"https://www.unitedrugby.com{link['href']}"

#             })

#     return club_links
 
# def get_player_urls(club_url):

#     """Get all player URLs from a club page"""

#     player_url = f"{club_url}/players"

#     response = requests.get(player_url)

#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Extract player links

#     player_links = []

#     player_elements = soup.select('.squad-filters__player')

#     club_name = club_url.split('/')[-1]

#     for player in player_elements:

#         link = player.find('a')

#         position_elem = player.select_one('.squad-filters__player-position')

#         position = position_elem.text.strip() if position_elem else "Unknown"

#         if link and 'href' in link.attrs:

#             player_name = link.select_one('.squad-filters__player-name')

#             if player_name:

#                 player_links.append({

#                     'name': player_name.text.strip(),

#                     'position': position,

#                     'club': club_name,

#                     'url': link['href'] if link['href'].startswith('http') else f"https://www.unitedrugby.com{link['href']}"

#                 })

#     return player_links
 
# def extract_player_stats(player_url, player_info):

#     """Extract statistics for a player from their profile page"""

#     try:

#         response = requests.get(player_url)

#         soup = BeautifulSoup(response.text, 'html.parser')

#         # Initialize stats dictionary with player info

#         stats = {

#             'name': player_info['name'],

#             'club': player_info['club'],

#             'position': player_info['position'],

#             'url': player_url

#         }

#         # Extract physical attributes

#         details = soup.select('.player-header__info p')

#         for detail in details:

#             text = detail.text.strip()

#             if "'" in text:  # Height

#                 stats['height'] = text

#             elif "kg" in text or "KG" in text:  # Weight

#                 stats['weight'] = text.replace('kg', '').replace('KG', '').strip()

#             elif re.search(r'\d+', text):  # Age

#                 stats['age'] = re.search(r'\d+', text).group()

#         # Extract statistics from different sections

#         stat_sections = soup.select('.player-stats__section')

#         for section in stat_sections:

#             section_title = section.select_one('.player-stats__section-title')

#             section_name = section_title.text.strip().lower() if section_title else "unknown"

#             stats_items = section.select('.player-stats__row')

#             for item in stats_items:

#                 stat_name = item.select_one('.player-stats__label')

#                 stat_value = item.select_one('.player-stats__value')

#                 if stat_name and stat_value:

#                     key = f"{section_name}_{stat_name.text.strip().lower().replace(' ', '_')}"

#                     stats[key] = stat_value.text.strip()

#         # Extract game appearances

#         try:

#             appearances_elem = soup.select_one('.player-header__games')

#             if appearances_elem:

#                 stats['total_appearances'] = appearances_elem.text.strip()

#         except:

#             stats['total_appearances'] = "N/A"

#         # Extract points

#         try:

#             points_elem = soup.select_one('.player-header__points')

#             if points_elem:

#                 stats['total_points'] = points_elem.text.strip()

#         except:

#             stats['total_points'] = "N/A"

#         # Extract tries

#         try:

#             tries_elem = soup.select_one('.player-header__tries')

#             if tries_elem:

#                 stats['total_tries'] = tries_elem.text.strip()

#         except:

#             stats['total_tries'] = "N/A"

#         # Extract advanced stats from any scripts

#         try:

#             scripts = soup.find_all('script')

#             for script in scripts:

#                 if script.string and 'window.__NUXT__' in script.string:

#                     # Extract JSON data from script

#                     json_str = re.search(r'window\.__NUXT__\s*=\s*(\{.*?\});', script.string, re.DOTALL)

#                     if json_str:

#                         # Try to clean and parse the JSON

#                         try:

#                             data_str = json_str.group(1)

#                             # Replace JS undefined with null for JSON compatibility

#                             data_str = re.sub(r'\bundefined\b', 'null', data_str)

#                             json_data = json.loads(data_str)

#                             # Traverse JSON to find player stats

#                             if 'state' in json_data and 'player' in json_data['state']:

#                                 player_data = json_data['state']['player']

#                                 if 'stats' in player_data:

#                                     for key, value in player_data['stats'].items():

#                                         stats[f"adv_{key}"] = value

#                         except:

#                             pass

#         except:

#             pass

#         return stats

#     except Exception as e:

#         print(f"Error scraping {player_url}: {str(e)}")

#         return {

#             'name': player_info['name'],

#             'club': player_info['club'],

#             'position': player_info['position'],

#             'url': player_url,

#             'error': str(e)

#         }
 
# def main():

#     all_stats = []

#     # Get all club URLs

#     print("Getting club URLs...")

#     club_urls = get_club_urls()

#     # Process each club

#     for club in club_urls:

#         print(f"Processing {club['name']}...")

#         player_urls = get_player_urls(club['url'])

#         # Process each player

#         for i, player in enumerate(player_urls):

#             print(f"  Processing player {i+1}/{len(player_urls)}: {player['name']}...")

#             player_stats = extract_player_stats(player['url'], player)

#             all_stats.append(player_stats)

#             # Be nice to the server - add a small delay between requests

#             time.sleep(1)

#     # Save to CSV

#     if all_stats:

#         # Get all possible fields

#         all_fields = set()

#         for stats in all_stats:

#             all_fields.update(stats.keys())

#         # Sort fields for better readability

#         fields = sorted(list(all_fields))

#         # Write CSV

#         with open('urc_player_stats.csv', 'w', newline='', encoding='utf-8') as csvfile:

#             writer = csv.DictWriter(csvfile, fieldnames=fields)

#             writer.writeheader()

#             for stats in all_stats:

#                 writer.writerow(stats)

#         print(f"Successfully saved data for {len(all_stats)} players to urc_player_stats.csv")

#     else:

#         print("No data was collected.")
 
# if __name__ == "__main__":

#     main()
 

Starting URC player statistics extraction at 2025-04-02 10:27:38
Found 0 clubs
No player statistics were collected.
Completed URC player statistics extraction at 2025-04-02 10:27:39
Getting club URLs...
No data was collected.


In [None]:
# import requests
# from bs4 import BeautifulSoup

# # List of URLs to scrape
# urls = [
#     "https://www.unitedrugby.com/clubs/benetton/players",
#     "https://www.unitedrugby.com/clubs/cardiff-rugby/players",
#     "https://www.unitedrugby.com/clubs/connacht/players",
#     "https://www.unitedrugby.com/clubs/dhl-stormers/players",
#     "https://www.unitedrugby.com/clubs/dragons/players",
#     "https://www.unitedrugby.com/clubs/edinburgh/players",
#     "https://www.unitedrugby.com/clubs/emirates-lions/players",
#     "https://www.unitedrugby.com/clubs/glasgow-warriors/players",
#     "https://www.unitedrugby.com/clubs/hollywoodbets-sharks/players",
#     "https://www.unitedrugby.com/clubs/leinster/players",
#     "https://www.unitedrugby.com/clubs/team-munster/players",
#     "https://www.unitedrugby.com/clubs/ospreys/players",
#     "https://www.unitedrugby.com/clubs/scarlets/players",
#     "https://www.unitedrugby.com/clubs/ulster/players",
#     "https://www.unitedrugby.com/clubs/vodacom-bulls/players",
#     "https://www.unitedrugby.com/clubs/zebre-parma/players"
# ]

# # Headers to mimic a real browser request
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# }

# # Function to scrape player-card divs
# def scrape_player_cards(url):
#     try:
#         response = requests.get(url, headers=headers)
#         response.raise_for_status()  # Raise an error for bad status codes
#         soup = BeautifulSoup(response.text, "html.parser")

#         # Find all player-card divs
#         player_cards = soup.find_all("div", class_="player-card")
#         player_cards_in_focus = soup.find_all("div", class_="player-card player-card-in-focus in-focus")

#         all_players = []

#         # Extract data from both types of divs
#         for div in player_cards + player_cards_in_focus:
#             player_data = div.get_text(separator="\n", strip=True)
#             all_players.append(player_data)

#         return {"url": url, "players": all_players}
    
#     except requests.RequestException as e:
#         return {"url": url, "error": str(e)}

# # Scrape each URL and store the results
# scraped_data = []
# for url in urls:
#     print(f"Scraping: {url}")
#     result = scrape_player_cards(url)
#     scraped_data.append(result)

# # Print results
# for data in scraped_data:
#     print(f"URL: {data['url']}")
#     if "error" in data:
#         print(f"Error: {data['error']}")
#     else:
#         print("Players Data:")
#         for player in data["players"]:
#             print(player[:500])  # Print first 500 characters per player for readability
#     print("\n" + "="*80 + "\n")


In [None]:
# %pip install selenium webdriver-manager
# %pip install selenium webdriver-manager pandas

In [None]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
# import time

# # List of URLs to scrape
# urls = [
#     "https://www.unitedrugby.com/clubs/benetton/players",
#     "https://www.unitedrugby.com/clubs/cardiff-rugby/players",
#     "https://www.unitedrugby.com/clubs/connacht/players",
#     "https://www.unitedrugby.com/clubs/dhl-stormers/players",
#     "https://www.unitedrugby.com/clubs/dragons/players",
#     "https://www.unitedrugby.com/clubs/edinburgh/players",
#     "https://www.unitedrugby.com/clubs/emirates-lions/players",
#     "https://www.unitedrugby.com/clubs/glasgow-warriors/players",
#     "https://www.unitedrugby.com/clubs/hollywoodbets-sharks/players",
#     "https://www.unitedrugby.com/clubs/leinster/players",
#     "https://www.unitedrugby.com/clubs/team-munster/players",
#     "https://www.unitedrugby.com/clubs/ospreys/players",
#     "https://www.unitedrugby.com/clubs/scarlets/players",
#     "https://www.unitedrugby.com/clubs/ulster/players",
#     "https://www.unitedrugby.com/clubs/vodacom-bulls/players",
#     "https://www.unitedrugby.com/clubs/zebre-parma/players"
# ]

# # Configure Selenium WebDriver (headless mode for speed)
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in background
# chrome_options.add_argument("--disable-gpu")  
# chrome_options.add_argument("--window-size=1920x1080")  
# chrome_options.add_argument("--no-sandbox")  
# chrome_options.add_argument("--disable-dev-shm-usage")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # Function to scrape player data
# def scrape_players(url):
#     try:
#         driver.get(url)
#         time.sleep(5)  # Allow time for JavaScript to load

#         # Extract Player Cards
#         player_cards = driver.find_elements(By.CLASS_NAME, "player-card")
#         player_cards_in_focus = driver.find_elements(By.CLASS_NAME, "player-card-in-focus")

#         # Combine and extract text
#         all_players = [card.text for card in player_cards + player_cards_in_focus]

#         return {"url": url, "players": all_players}
    
#     except Exception as e:
#         return {"url": url, "error": str(e)}

# # Scrape each URL
# scraped_data = []
# for url in urls:
#     print(f"Scraping: {url}")
#     result = scrape_players(url)
#     scraped_data.append(result)

# # Close the WebDriver
# driver.quit()

# # Print the results
# for data in scraped_data:
#     print(f"URL: {data['url']}")
#     if "error" in data:
#         print(f"Error: {data['error']}")
#     else:
#         print("Players Data:")
#         for player in data["players"]:
#             print(player)
#     print("\n" + "="*80 + "\n")


In [None]:
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager

# # List of URLs to scrape
# urls = [
#     "https://www.unitedrugby.com/clubs/benetton/players",
#     "https://www.unitedrugby.com/clubs/cardiff-rugby/players",
#     "https://www.unitedrugby.com/clubs/connacht/players",
#     "https://www.unitedrugby.com/clubs/dhl-stormers/players",
#     "https://www.unitedrugby.com/clubs/dragons/players",
#     "https://www.unitedrugby.com/clubs/edinburgh/players",
#     "https://www.unitedrugby.com/clubs/emirates-lions/players",
#     "https://www.unitedrugby.com/clubs/glasgow-warriors/players",
#     "https://www.unitedrugby.com/clubs/hollywoodbets-sharks/players",
#     "https://www.unitedrugby.com/clubs/leinster/players",
#     "https://www.unitedrugby.com/clubs/team-munster/players",
#     "https://www.unitedrugby.com/clubs/ospreys/players",
#     "https://www.unitedrugby.com/clubs/scarlets/players",
#     "https://www.unitedrugby.com/clubs/ulster/players",
#     "https://www.unitedrugby.com/clubs/vodacom-bulls/players",
#     "https://www.unitedrugby.com/clubs/zebre-parma/players"
# ]

# # Configure Selenium WebDriver (headless mode for speed)
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in background
# chrome_options.add_argument("--disable-gpu")  
# chrome_options.add_argument("--window-size=1920x1080")  
# chrome_options.add_argument("--no-sandbox")  
# chrome_options.add_argument("--disable-dev-shm-usage")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List to store scraped data
# scraped_data = []

# # Function to scrape player data including href links
# def scrape_players(url):
#     try:
#         driver.get(url)
#         time.sleep(30)  # Allow time for JavaScript to load

#         # Find all player cards
#         player_cards = driver.find_elements(By.CLASS_NAME, "player-card")

#         # Extract data from each player card
#         for card in player_cards:
#             try:
#                 name = card.find_element(By.CLASS_NAME, "name").text  # Extract name
#                 position = card.find_element(By.CLASS_NAME, "position").text  # Extract position
#                 link_element = card.find_element(By.TAG_NAME, "a")  # Find <a> tag
#                 href = link_element.get_attribute("href")  # Extract href attribute

#                 scraped_data.append({
#                     "Team URL": url,
#                     "Player Name": name,
#                     "Position": position,
#                     "Profile Link": href
#                 })

#             except Exception as e:
#                 print(f"Error extracting player data: {e}")

#     except Exception as e:
#         print(f"Error fetching {url}: {e}")

# # Scrape each URL
# for url in urls:
#     print(f"Scraping: {url}")
#     scrape_players(url)

# # Close the WebDriver
# driver.quit()

# # Save to CSV
# df = pd.DataFrame(scraped_data)
# csv_filename = "rugby_players.csv"
# df.to_csv(csv_filename, index=False)

# print(f" Data saved to {csv_filename}")


In [None]:
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# # List of URLs to scrape
# urls = [
#     "https://www.unitedrugby.com/clubs/benetton/players",
#     "https://www.unitedrugby.com/clubs/cardiff-rugby/players",
#     "https://www.unitedrugby.com/clubs/connacht/players",
#     "https://www.unitedrugby.com/clubs/dhl-stormers/players",
#     "https://www.unitedrugby.com/clubs/dragons/players",
#     "https://www.unitedrugby.com/clubs/edinburgh/players",
#     "https://www.unitedrugby.com/clubs/emirates-lions/players",
#     "https://www.unitedrugby.com/clubs/glasgow-warriors/players",
#     "https://www.unitedrugby.com/clubs/hollywoodbets-sharks/players",
#     "https://www.unitedrugby.com/clubs/leinster/players",
#     "https://www.unitedrugby.com/clubs/team-munster/players",
#     "https://www.unitedrugby.com/clubs/ospreys/players",
#     "https://www.unitedrugby.com/clubs/scarlets/players",
#     "https://www.unitedrugby.com/clubs/ulster/players",
#     "https://www.unitedrugby.com/clubs/vodacom-bulls/players",
#     "https://www.unitedrugby.com/clubs/zebre-parma/players"
# ]

# # Configure Selenium WebDriver (headless mode for speed)
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in background
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")
# chrome_options.add_argument("--cookie-consent=accept-all")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List to store scraped data
# scraped_data = []

# # Function to scrape player data including href links
# def scrape_players(url):
#     try:
#         driver.get(url)

#         # Wait until player cards are present in the DOM (max 45 seconds)
#         WebDriverWait(driver, 45).until(
#             EC.presence_of_all_elements_located((By.CLASS_NAME, "player-card"))
#         )

#         # Scroll down to load more players if necessary
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(2)  # Allow JavaScript to load more players

#         # Find all player cards
#         player_cards = driver.find_elements(By.CLASS_NAME, "player-card")

#         # Extract data from each player card
#         for card in player_cards:
#             try:
#                 link_element = card.find_element(By.TAG_NAME, "a")


#                 href = link_element.get_attribute("href") if link_element else "N/A"

#                 scraped_data.append({
#                     "Team URL": url,
#                     "Profile Link": href
#                 })

#             except Exception as e:
#                 print(f"Error extracting player data: {e}")

#     except Exception as e:
#         print(f"Error fetching {url}: {e}")

# # Scrape each URL
# for url in urls:
#     print(f"Scraping: {url}")
#     scrape_players(url)

# # Close the WebDriver
# driver.quit()

# # Save to CSV
# df = pd.DataFrame(scraped_data)
# csv_filename = "players.csv"
# df.to_csv(csv_filename, index=False)

# print(f"Data successfully saved to {csv_filename}")

# this is the final code


In [None]:
# import pandas as pd

# # Read the two CSV files
# df1 = pd.read_csv('rugby_players.csv')
# df2 = pd.read_csv('players.csv')

# # Merge the DataFrames (concatenation)
# merged_df = pd.concat([df1, df2], ignore_index=True)

# # Remove duplicate rows
# merged_df = merged_df.drop_duplicates()

# # Save the cleaned DataFrame to a new CSV file
# merged_df.to_csv('player_url.csv', index=False)

# # Display the first few rows to verify
# print(merged_df.head())


# this is the final code



In [None]:
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
# import time

# # Load the CSV file
# file_path = "player_url.csv"
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Update if the column name is different
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # List to store scraped data
# scraped_data = []

# # Loop through each profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
#         response.raise_for_status()  # Raise an error for bad responses
#         soup = BeautifulSoup(response.text, "html.parser")

#         # Extract Player Name
#         player_first_name = soup.find("h3").text.strip() if soup.find("h3") else "N/A"
#         player_last_name = soup.find("h1").text.strip() if soup.find("h1") else "N/A"

#         # Extract Position
#         position_element = soup.find("div", class_="text-lg text-sm hooker")  # Update based on actual class name
#         position = position_element.text.strip() if position_element else "N/A"

#         # Extract Age, Height, and Weight
#         stats_list = soup.find_all("li", class_="w-1/2 md:w-auto")  # Finding all <li> elements
#         age, height, weight = "N/A", "N/A", "N/A"

#         if len(stats_list) >= 3:
#             age = stats_list[0].find("p", class_="font-urc-sans").text.strip() if stats_list[0].find("p") else "N/A"
#             height = stats_list[1].find("p", class_="font-urc-sans").text.strip() if stats_list[1].find("p") else "N/A"
#             weight = stats_list[2].find("p", class_="font-urc-sans").text.strip() if stats_list[2].find("p") else "N/A"

#         # Extract additional stats (modify based on actual structure)
#         stats = soup.find("div", class_="stats")  # Adjust if needed

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": player_first_name,
#             "Last Name": player_last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight,
#             "Stats": stats.text.strip() if stats else "N/A",
#         }
#         scraped_data.append(data)

#         print(f"Scraped: {player_first_name} {player_last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
#         time.sleep(1)  # Delay to avoid getting blocked

#     except requests.exceptions.RequestException as e:
#         print(f"Error scraping {url}: {e}")

# # Convert scraped data to DataFrame
# scraped_df = pd.DataFrame(scraped_data)

# # Save to a CSV file
# output_path = "data/scraped_players.csv"

# scraped_df.to_csv(output_path, index=False)

# print(f"Scraping complete. Data saved to {output_path}")

# This returns the player's first and last name.


In [None]:
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
# import time

# # Load the CSV file
# file_path = "player_url.csv"
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Update if the column name is different
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # List to store scraped data
# scraped_data = []

# # Loop through each profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
#         response.raise_for_status()  # Raise an error for bad responses
#         soup = BeautifulSoup(response.text, "html.parser")

#         # Extract Player Name
#         player_first_name = soup.find("h3").text.strip() if soup.find("h3") else "N/A"
#         player_last_name = soup.find("h1").text.strip() if soup.find("h1") else "N/A"

#         # Extract Position
#         position_element = soup.find("div", class_="text-white border-white")
#         position = position_element.text.strip() if position_element else "N/A"

#         # Extract Age, Height, and Weight
#         age, height, weight = "N/A", "N/A", "N/A"
#         stats_list = soup.find_all("li", class_="w-1/2 md:w-auto")  # Find all <li> elements

#         for stat in stats_list:
#             label = stat.find("p", class_="text-caption")  # e.g., Age, Height, Weight
#             value = stat.find("p", class_="font-urc-sans")  # e.g., 24, 6'1", 105KG

#             if label and value:
#                 label_text = label.text.strip().lower()  # Convert label to lowercase
#                 value_text = value.text.strip()

#                 if "age" in label_text:
#                     age = value_text
#                 elif "height" in label_text:
#                     height = value_text
#                 elif "weight" in label_text:
#                     weight = value_text

#         # Extract Other Stats
#         season_stats = {}
#         stats_section = soup.find("div", class_="stats")
#         if stats_section:
#             stat_items = stats_section.find_all("div", class_="stat-item")
#             for item in stat_items:
#                 stat_name = item.find("p", class_="stat-label").text.strip() if item.find("p", class_="stat-label") else "Unknown"
#                 stat_value = item.find("p", class_="stat-value").text.strip() if item.find("p", class_="stat-value") else "N/A"
#                 season_stats[stat_name] = stat_value

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": player_first_name,
#             "Last Name": player_last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight,
#             **season_stats,  # Expands seasonal stats dynamically
#         }
#         scraped_data.append(data)

#         print(f"Scraped: {player_first_name} {player_last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
#         time.sleep(1)  # Delay to avoid getting blocked

#     except requests.exceptions.RequestException as e:
#         print(f"Error scraping {url}: {e}")

# # Convert scraped data to DataFrame
# scraped_df = pd.DataFrame(scraped_data)

# # Save to a CSV file
# output_path = "data/scraped_players.csv"
# scraped_df.to_csv(output_path, index=False)

# print(f"Scraping complete. Data saved to {output_path}")



# Scraped: Destiny Aminu - Position: N/A, Age: 21, Height: 6'1'', Weight: N/A
# Scraped: Enzo Avaca - Position: N/A, Age: 24, Height: 6'2'', Weight: N/A
# Scraped: Bautista Bernasconi - Position: N/A, Age: 23, Height: 5'9'', Weight: N/A
# Scraped: Lorenzo Cannone - Position: N/A, Age: 24, Height: 6'3'', Weight: N/A
# Scraped: Niccolo Cannone - Position: N/A, Age: 26, Height: 6'6'', Weight: N/A

In [None]:
# import pandas as pd
# import time
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# # Path to your Chrome WebDriver (Change this to the correct path)
# CHROMEDRIVER_PATH = "C:/webdriver/chromedriver.exe"

# # Load the CSV file
# file_path = "player_url.csv"
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Update if your column name is different
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # Set up Selenium WebDriver
# options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run Chrome in headless mode
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")
# service = Service(CHROMEDRIVER_PATH)
# driver = webdriver.Chrome(service=service, options=options)

# # List to store scraped data
# scraped_data = []

# # Loop through each player profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         driver.get(url)
#         WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.TAG_NAME, "body"))
#         )  # Wait for the page to load

#         # Extract Player Name
#         try:
#             first_name = driver.find_element(By.TAG_NAME, "h3").text.strip()
#         except:
#             first_name = "N/A"

#         try:
#             last_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
#         except:
#             last_name = "N/A"

#         # Extract Position (Adjust class if needed)
#         try:
#             position = driver.find_element(By.CLASS_NAME, "text-white.border-white").text.strip()
#         except:
#             position = "N/A"

#         # Extract Age, Height, and Weight dynamically
#         age, height, weight = "N/A", "N/A", "N/A"
#         try:
#             stats_list = driver.find_elements(By.CLASS_NAME, "w-1/2.md\\:w-auto")
#             for stat in stats_list:
#                 label = stat.find_element(By.CLASS_NAME, "text-caption").text.strip()
#                 value = stat.find_element(By.CLASS_NAME, "font-urc-sans.text-h2").text.strip()

#                 if "age" in label.lower():
#                     age = value
#                 elif "height" in label.lower():
#                     height = value
#                 elif "weight" in label.lower():
#                     weight = value
#         except:
#             pass

#         # Extract Other Stats (if available)
#         season_stats = {}
#         try:
#             stat_items = driver.find_elements(By.CLASS_NAME, "stat-item")
#             for item in stat_items:
#                 stat_name = item.find_element(By.CLASS_NAME, "stat-label").text.strip()
#                 stat_value = item.find_element(By.CLASS_NAME, "stat-value").text.strip()
#                 season_stats[stat_name] = stat_value
#         except:
#             pass

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": first_name,
#             "Last Name": last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight,
#             **season_stats,  # Expands seasonal stats dynamically
#         }
#         scraped_data.append(data)

#         print(f"Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
#         time.sleep(2)  # Delay to avoid getting blocked

#     except Exception as e:
#         print(f"Error scraping {url}: {e}")

# # Close the Selenium WebDriver
# driver.quit()

# # Convert scraped data to DataFrame
# scraped_df = pd.DataFrame(scraped_data)

# # Save to a CSV file
# output_path = "data/scraped_players.csv"
# scraped_df.to_csv(output_path, index=False)

# print(f"Scraping complete. Data saved to {output_path}")


# THIS DOESN'T WORK

In [None]:
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# # Load the CSV file containing the player profile URLs
# file_path = "player_urls.csv"  # Ensure this file exists and contains URLs
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Change this if the column name differs in the CSV
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # Configure Selenium WebDriver (headless mode for speed)
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in background
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List to store scraped data
# scraped_data = []

# # Loop through each player profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         driver.get(url)
#         WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.TAG_NAME, "body"))
#         )  # Wait for the page to load

#         # Extract Player Name
#         try:
#             first_name = driver.find_element(By.TAG_NAME, "h3").text.strip()
#         except:
#             first_name = "N/A"

#         try:
#             last_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
#         except:
#             last_name = "N/A"

#         # Extract Position (Adjust class if needed)
#         try:
#             position = driver.find_element(By.CLASS_NAME, "text-white.border-white").text.strip()
#         except:
#             position = "N/A"

#         # Extract Age, Height, and Weight dynamically
#         age, height, weight = "N/A", "N/A", "N/A"
#         try:
#             stats_list = driver.find_elements(By.CLASS_NAME, "w-1/2.md\\:w-auto")
#             for stat in stats_list:
#                 label = stat.find_element(By.CLASS_NAME, "text-caption").text.strip()
#                 value = stat.find_element(By.CLASS_NAME, "font-urc-sans.text-h2").text.strip()

#                 if "age" in label.lower():
#                     age = value
#                 elif "height" in label.lower():
#                     height = value
#                 elif "weight" in label.lower():
#                     weight = value
#         except:
#             pass

#         # Extract Other Stats (if available)
#         season_stats = {}
#         try:
#             stat_items = driver.find_elements(By.CLASS_NAME, "stat-item")
#             for item in stat_items:
#                 stat_name = item.find_element(By.CLASS_NAME, "stat-label").text.strip()
#                 stat_value = item.find_element(By.CLASS_NAME, "stat-value").text.strip()
#                 season_stats[stat_name] = stat_value
#         except:
#             pass

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": first_name,
#             "Last Name": last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight,
#             **season_stats,  # Expands seasonal stats dynamically
#         }
#         scraped_data.append(data)

#         print(f"Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
#         time.sleep(2)  # Delay to avoid getting blocked

#     except Exception as e:
#         print(f"Error scraping {url}: {e}")

# # Close the WebDriver
# driver.quit()

# # Save to CSV
# output_file = "scraped_player_links.csv"
# df_output = pd.DataFrame(scraped_data)
# df_output.to_csv(output_file, index=False)

# print(f"Scraping complete. Data saved to {output_file}")




In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Load the CSV file containing the player profile URLs.
# NOTE(review): other cells in this notebook read/write "player_url.csv"
# (no trailing "s") - confirm that "player_urls.csv" is the intended input.
file_path = "player_urls.csv"  # Ensure this file exists and contains URLs
df = pd.read_csv(file_path)

# Ensure the column containing URLs is correctly named
url_column = "Profile Link"  # Change this if the column name differs in the CSV
if url_column not in df.columns:
    raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# Configure Selenium WebDriver (headless mode for speed)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in background
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Setup ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# List to store scraped data (one dict per player profile)
scraped_data = []

# XPath locators for the age/height/weight list. The class names "w-1/2" and
# "md:w-auto" contain "/" and ":", so they cannot be passed to By.CLASS_NAME;
# matching on @class with contains() avoids any selector escaping.
STAT_ITEM_XPATH = "//ul/li[contains(@class, 'w-1/2') and contains(@class, 'md:w-auto')]"
STAT_LABEL_XPATH = ".//p[contains(@class, 'text-caption')]"
STAT_VALUE_XPATH = ".//p[contains(@class, 'font-urc-sans text-h2')]"

try:
    # Loop through each player profile link
    for index, row in df.iterrows():
        url = row[url_column]
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )  # Wait for the page to load

            # Extract Player Name (first name is read from <h3>, last name from <h1>)
            try:
                first_name = driver.find_element(By.TAG_NAME, "h3").text.strip()
            except Exception:
                first_name = "N/A"

            try:
                last_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
            except Exception:
                last_name = "N/A"

            # Extract Position (Adjust class if needed)
            try:
                position = driver.find_element(By.CLASS_NAME, "text-white.border-white").text.strip()
            except Exception:
                position = "N/A"

            # Extract Age, Height, and Weight dynamically
            age, height, weight = "N/A", "N/A", "N/A"
            try:
                stats_list = driver.find_elements(By.XPATH, STAT_ITEM_XPATH)
            except Exception as e:
                # Surface locator problems instead of silently reporting "N/A".
                print(f"Could not locate age/height/weight list on {url}: {e}")
                stats_list = []

            for stat in stats_list:
                # Guard each item separately so one malformed <li> does not
                # discard the values found in the others.
                try:
                    label = stat.find_element(By.XPATH, STAT_LABEL_XPATH).text.strip()
                    value = stat.find_element(By.XPATH, STAT_VALUE_XPATH).text.strip()
                except Exception:
                    continue

                if "age" in label.lower():
                    age = value
                elif "height" in label.lower():
                    height = value
                elif "weight" in label.lower():
                    weight = value

            # Extract Other Stats (if available)
            season_stats = {}
            try:
                stat_items = driver.find_elements(By.CLASS_NAME, "stat-item")
            except Exception:
                stat_items = []

            for item in stat_items:
                # Best-effort per item: skip entries missing a label or value.
                try:
                    stat_name = item.find_element(By.CLASS_NAME, "stat-label").text.strip()
                    stat_value = item.find_element(By.CLASS_NAME, "stat-value").text.strip()
                except Exception:
                    continue
                season_stats[stat_name] = stat_value

            # Store the extracted data
            data = {
                "URL": url,
                "First Name": first_name,
                "Last Name": last_name,
                "Position": position,
                "Age": age,
                "Height": height,
                "Weight": weight,
                **season_stats,  # Expands seasonal stats dynamically
            }
            scraped_data.append(data)

            print(f"Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
            time.sleep(2)  # Delay to avoid getting blocked

        except Exception as e:
            print(f"Error scraping {url}: {e}")
finally:
    # Always close the WebDriver, even if the loop is interrupted, so the
    # Chrome process is not left running.
    driver.quit()

# Save to CSV
output_file = "scraped_player_links.csv"
df_output = pd.DataFrame(scraped_data)
df_output.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")


# Sample output recorded BEFORE the age/height/weight locators were switched to
# XPath (presumably the class-name locator failed and the error was swallowed,
# leaving those three fields as "N/A"):
# Scraped: DESTINY AMINU - Position: PROP, Age: N/A, Height: N/A, Weight: N/A
# Scraped: ENZO AVACA - Position: PROP, Age: N/A, Height: N/A, Weight: N/A
# Scraped: BAUTISTA BERNASCONI - Position: HOOKER, Age: N/A, Height: N/A, Weight: N/A
# Scraped: LORENZO CANNONE - Position: NO. 8, Age: N/A, Height: N/A, Weight: N/A
# Scraped: NICCOLO CANNONE - Position: LOCK, Age: N/A, Height: N/A, Weight: N/A
# Scraped: AGUSTIN CREEVY - Position: HOOKER, Age: N/A, Height: N/A, Weight: N/A

In [None]:
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# # Load the CSV file containing the player profile URLs
# file_path = "player_url.csv"  # Ensure this file exists and contains URLs
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Change if column name differs
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # Configure Selenium WebDriver
# chrome_options = Options()
# # chrome_options.add_argument("--headless")  # Temporarily removed for debugging
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List to store scraped data
# scraped_data = []

# # Function to wait for an element to load
# def wait_for_element(by, value, timeout=10):
#     try:
#         return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))
#     except:
#         return None

# # Function to scroll the page (ensures full content loads)
# def scroll_page():
#     last_height = driver.execute_script("return document.body.scrollHeight")
#     while True:
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(2)  # Allow time for dynamic content to load
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             break
#         last_height = new_height

# # Loop through each player profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         driver.get(url)

#         # Ensure the page is fully loaded before extracting data
#         wait_for_element(By.TAG_NAME, "body")
#         scroll_page()

#         # Extract First & Last Name
#         first_name = wait_for_element(By.TAG_NAME, "h3")
#         first_name = first_name.text.strip() if first_name else "N/A"

#         last_name = wait_for_element(By.TAG_NAME, "h1")
#         last_name = last_name.text.strip() if last_name else "N/A"

#         # Extract Position
#         position_element = wait_for_element(By.CLASS_NAME, "text-white.border-white")
#         position = position_element.text.strip() if position_element else "N/A"

#         # Extract Age, Height, Weight dynamically
#         age, height, weight = "N/A", "N/A", "N/A"
#         try:
#             stats_list = driver.find_elements(By.CLASS_NAME, "w-1/2.md\\:w-auto")
#             for stat in stats_list:
#                 label = stat.find_element(By.CLASS_NAME, "text-caption").text.strip()
#                 value = stat.find_element(By.CLASS_NAME, "font-urc-sans.text-h2").text.strip()

#                 if "Age" in label.lower():
#                     age = value
#                 elif "Height" in label.lower():
#                     height = value
#                 elif "Weight" in label.lower():
#                     weight = value
#         except:
#             pass

#         # Extract Other Stats (if available)
#         season_stats = {}
#         try:
#             stat_items = driver.find_elements(By.CLASS_NAME, "stat-item")
#             for item in stat_items:
#                 stat_name = item.find_element(By.CLASS_NAME, "stat-label").text.strip()
#                 stat_value = item.find_element(By.CLASS_NAME, "stat-value").text.strip()
#                 season_stats[stat_name] = stat_value
#         except:
#             pass

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": first_name,
#             "Last Name": last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight,
#             **season_stats,  # Expands seasonal stats dynamically
#         }
#         scraped_data.append(data)

#         print(f"Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")
#         time.sleep(2)  # Delay to avoid blocking

#     except Exception as e:
#         print(f"Error scraping {url}: {e}")

# # Close the WebDriver
# driver.quit()

# # Save to CSV
# output_file = "scraped_players.csv"
# df_output = pd.DataFrame(scraped_data)
# df_output.to_csv(output_file, index=False)

# print(f"Scraping complete. Data saved to {output_file}")


In [None]:
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# # Load the CSV file containing player profile URLs
# file_path = "player_url.csv"  # Ensure this file exists
# df = pd.read_csv(file_path)

# # Ensure the column containing URLs is correctly named
# url_column = "Profile Link"  # Change this if different in CSV
# if url_column not in df.columns:
#     raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# # Configure Selenium WebDriver
# chrome_options = Options()
# chrome_options.add_argument("--headless")  
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")

# # Setup ChromeDriver
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # List to store scraped data
# scraped_data = []

# # Function to wait for an element to load
# def wait_for_element(by, value, timeout=10):
#     try:
#         return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))
#     except:
#         return None

# # Loop through each player profile link
# for index, row in df.iterrows():
#     url = row[url_column]
#     try:
#         driver.get(url)

#         # Ensure the page is fully loaded
#         wait_for_element(By.TAG_NAME, "body")

#         # Extract First & Last Name
#         first_name = wait_for_element(By.TAG_NAME, "h3")
#         first_name = first_name.text.strip() if first_name else "N/A"

#         last_name = wait_for_element(By.TAG_NAME, "h1")
#         last_name = last_name.text.strip() if last_name else "N/A"

#         # Extract Position
#         position_element = wait_for_element(By.CLASS_NAME, "text-white.border-white")
#         position = position_element.text.strip() if position_element else "N/A"

#         # Extract Age, Height, and Weight
#         age, height, weight = "N/A", "N/A", "N/A"
#         try:
#             stats_list = driver.find_elements(By.XPATH, "//ul/li[contains(@class, 'w-1/2') and contains(@class, 'md:w-auto')]")
#             for stat in stats_list:
#                 try:
#                     label = stat.find_element(By.XPATH, ".//p[contains(@class, 'text-caption')]").text.strip()
#                     value = stat.find_element(By.XPATH, ".//p[contains(@class, 'font-urc-sans text-h2')]").text.strip()
                    
#                     if "age" in label.lower():
#                         age = value
#                     elif "height" in label.lower():
#                         height = value
#                     elif "weight" in label.lower():
#                         weight = value
#                 except:
#                     pass
#         except:
#             pass

#         # Store the extracted data
#         data = {
#             "URL": url,
#             "First Name": first_name,
#             "Last Name": last_name,
#             "Position": position,
#             "Age": age,
#             "Height": height,
#             "Weight": weight
#         }
#         scraped_data.append(data)

#         print(f"✅ Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")

#     except Exception as e:
#         print(f"❌ Error scraping {url}: {e}")

# # Close the WebDriver
# driver.quit()

# # Save to CSV
# output_file = "scraped_players_with_weight.csv"
# df_output = pd.DataFrame(scraped_data)
# df_output.to_csv(output_file, index=False)

# print(f"✅ Scraping complete. Data saved to {output_file}")



# This code works for the player's position, age, height, and weight.

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# --- Input ------------------------------------------------------------------
# CSV of player profile URLs and the column that holds them.
file_path = "player_url.csv"  # must exist next to the notebook
url_column = "Profile Link"  # adjust if the CSV header differs

df = pd.read_csv(file_path)

# Fail early with the list of available columns if the URL column is missing.
if url_column not in df.columns:
    raise ValueError(f"Column '{url_column}' not found in CSV. Available columns: {df.columns}")

# --- Output accumulator -----------------------------------------------------
# Filled with one record per scraped profile.
scraped_data = list()

# --- Browser ----------------------------------------------------------------
# Chrome is started headless with the same switches, in the same order.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(chrome_flag)

# webdriver_manager supplies the chromedriver binary path for the Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

def wait_for_element(by, value, timeout=10):
    """Wait until an element is present in the DOM and return it.

    Args:
        by: Locator strategy (a ``By`` constant such as ``By.TAG_NAME``).
        value: Locator value matched using ``by``.
        timeout: Maximum number of seconds to wait.

    Returns:
        The located element, or ``None`` if the wait raised an error
        (for example because the timeout expired).
    """
    locator = (by, value)
    try:
        return WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop a long scraping run.
        return None

def _element_text(element):
    """Return an element's stripped text, falling back to ``textContent``.

    ``WebElement.text`` only returns rendered text, so an element that is not
    currently displayed yields an empty string; in that case the raw
    ``textContent`` attribute is read instead.
    """
    text = element.text.strip()
    if not text:
        text = (element.get_attribute("textContent") or "").strip()
    return text


# Loop through each player profile link.
# Missing cells are dropped and duplicate links collapsed so that every profile
# is requested exactly once (mirrors how the first cell of this notebook builds
# its URL list).
try:
    for url in df[url_column].dropna().unique():
        try:
            driver.get(url)

            # Ensure the page is fully loaded
            wait_for_element(By.TAG_NAME, "body")

            # Extract First & Last Name (taken from the <h3> and <h1> elements)
            first_name = wait_for_element(By.TAG_NAME, "h3")
            first_name = first_name.text.strip() if first_name else "N/A"

            last_name = wait_for_element(By.TAG_NAME, "h1")
            last_name = last_name.text.strip() if last_name else "N/A"

            # Extract Position
            position_element = wait_for_element(By.CLASS_NAME, "text-white.border-white")
            position = position_element.text.strip() if position_element else "N/A"

            # Extract Age, Height, and Weight (best effort: fall back to "N/A")
            age, height, weight = "N/A", "N/A", "N/A"
            try:
                stats_list = driver.find_elements(
                    By.XPATH,
                    "//ul/li[contains(@class, 'w-1/2') and contains(@class, 'md:w-auto')]",
                )
            except Exception:
                stats_list = []
            for stat in stats_list:
                try:
                    label = stat.find_element(By.XPATH, ".//p[contains(@class, 'text-caption')]").text.strip()
                    value = stat.find_element(By.XPATH, ".//p[contains(@class, 'font-urc-sans text-h2')]").text.strip()
                except Exception:
                    # List item without the expected label/value pair: skip it
                    continue

                label_lower = label.lower()
                if "age" in label_lower:
                    age = value
                elif "height" in label_lower:
                    height = value
                elif "weight" in label_lower:
                    weight = value

            # Extract season stats from the expandable sections (best effort).
            # NOTE(review): an earlier run recorded only {'': ''} here, presumably
            # because the sections are collapsed on load and ``.text`` is empty for
            # hidden elements - hence the textContent fallback. TODO confirm.
            player_stats = {}
            try:
                stat_sections = driver.find_elements(By.CLASS_NAME, "expandable-content")
            except Exception:
                stat_sections = []
            for section in stat_sections:
                try:
                    stat_items = section.find_elements(By.CLASS_NAME, "flex.justify-between.items-center.p-5")
                except Exception:
                    continue
                for stat in stat_items:
                    try:
                        label = _element_text(stat.find_element(By.CLASS_NAME, "text-sm"))  # Stat label
                        value = _element_text(stat.find_element(By.CLASS_NAME, "text-2xl"))  # Stat value
                    except Exception:
                        continue
                    if label:
                        # An empty label would only create a nameless CSV column
                        player_stats[label] = value

            # Store the extracted data
            data = {
                "URL": url,
                "First Name": first_name,
                "Last Name": last_name,
                "Position": position,
                "Age": age,
                "Height": height,
                "Weight": weight,
                **player_stats  # Expands seasonal stats dynamically
            }
            scraped_data.append(data)

            print(f"Scraped: {first_name} {last_name} - Position: {position}, Age: {age}, Height: {height}, Weight: {weight}")

        except Exception as e:
            print(f"Error scraping {url}: {e}")
finally:
    # Close the WebDriver even when the run is interrupted or fails unexpectedly,
    # otherwise the Chrome process is left running.
    driver.quit()

# Save to CSV
output_file = "scraped_players_full_stats.csv"
df_output = pd.DataFrame(scraped_data)
df_output.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")



# This takes 28 minutes to finish scraping; it only scrapes the physical stats, not the season stats.


✅ Scraped: DESTINY AMINU - Position: PROP, Age: 21, Height: 6'1'', Weight: 122KG, Stats: {'': ''}
✅ Scraped: ENZO AVACA - Position: PROP, Age: 24, Height: 6'2'', Weight: 118KG, Stats: {'': ''}
✅ Scraped: BAUTISTA BERNASCONI - Position: HOOKER, Age: 23, Height: 5'9'', Weight: 113KG, Stats: {'': ''}
✅ Scraped: LORENZO CANNONE - Position: NO. 8, Age: 24, Height: 6'3'', Weight: 102KG, Stats: {'': ''}
✅ Scraped: NICCOLO CANNONE - Position: LOCK, Age: 26, Height: 6'6'', Weight: 120KG, Stats: {'': ''}
✅ Scraped: AGUSTIN CREEVY - Position: HOOKER, Age: 39, Height: 6'0'', Weight: 106KG, Stats: {'': ''}
✅ Scraped: RICCARDO FAVRETTO - Position: LOCK, Age: 23, Height: 6'7'', Weight: 105KG, Stats: {'': ''}
✅ Scraped: SIMONE FERRARI - Position: PROP, Age: 30, Height: 6'0'', Weight: 120KG, Stats: {'': ''}
✅ Scraped: THOMAS GALLO - Position: PROP, Age: 25, Height: 5'9'', Weight: 107KG, Stats: {'': ''}
✅ Scraped: MARCOS GALLORINI - Position: PROP, Age: 20, Height: 6'3'', Weight: 135KG, Stats: {'': ''}


In [None]:
# Match-centre round pages to scrape: rounds 1 through 21 of the 2024-25 season
urls = [
    f"https://www.unitedrugby.com/match-centre/2024-25/{round_number}"
    for round_number in range(1, 22)
]

# Request headers carrying a desktop Chrome User-Agent string
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}


In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# List of match centre URLs (Rounds 1-21)
urls = [f"https://www.unitedrugby.com/match-centre/2024-25/{i}" for i in range(1, 22)]

# Configure Selenium WebDriver
chrome_options = Options()
# Headless mode is disabled, so a browser window is opened; uncomment the next
# line to run without a visible window.
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
# Chrome expects this switch as WIDTH,HEIGHT (comma separated); the previous
# "1920x1080" form is not the documented format.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Setup ChromeDriver (webdriver_manager installs the driver binary and returns its path)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

def accept_cookies():
    """Click the "Accept All Cookies" button if the consent banner shows up.

    Waits up to 10 seconds for the button to become clickable, clicks it and
    pauses 2 seconds. If the wait or the click raises an error, a warning is
    printed and the function returns normally.
    """
    try:
        cookies_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept All Cookies')]"))
        )
        cookies_button.click()
        print("✅ Cookies accepted.")
        time.sleep(2)  # Give it time to disappear
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a missing popup.
        print("⚠️ No cookie popup found or already accepted.")

def scroll_to_load(max_scrolls=10, pause=2):
    """Scroll to the bottom of the page repeatedly so that more matches can load.

    After every scroll the page height is measured; scrolling stops as soon as
    the height is the same as after the previous scroll, or once
    ``max_scrolls`` attempts have been made.

    Args:
        max_scrolls: Upper bound on the number of scroll attempts.
        pause: Seconds to wait after each scroll before measuring the height.
    """
    last_height = 0
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Give time for matches to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # Stop if no new content loads
            break
        last_height = new_height

def _parse_match_link(match_link):
    """Split a match URL into ``(team1, team2, match_date)``.

    The part after ``united-rugby-championship/`` is expected to look like
    ``<team-1>-vs-<team-2>-<yyyy>-<mm>-<dd>`` optionally followed by ``/...``.
    Returns ``("N/A", "N/A", "N/A")`` when the link does not mention the
    competition or does not contain exactly one ``-vs-`` separator.
    """
    not_available = ("N/A", "N/A", "N/A")
    if "united-rugby-championship" not in match_link:
        return not_available

    parts = match_link.split("united-rugby-championship/")[1].split("-vs-")
    if len(parts) != 2:
        return not_available

    team1 = parts[0].replace("-", " ").title()  # Convert team names to proper format
    remaining = parts[1].split("-")
    team2 = " ".join(remaining[:-3]).title()  # Everything before the three date pieces
    match_date = "-".join(remaining[-3:]).split("/")[0]  # Date, with any trailing path removed
    return team1, team2, match_date


# Extract match details for each URL
matches = []

try:
    for round_index, url in enumerate(urls):
        print(f"🔍 Scraping: {url}")
        driver.get(url)

        # Accept cookies (only on the first page)
        if round_index == 0:
            accept_cookies()

        # Wait until matches load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "fixture-card-in-focus"))
            )
        except Exception:
            print(f"⏳ Timeout: No matches found on {url}")
            continue  # Skip this page if no matches load

        # Scroll from top to bottom before scraping so all matches are loaded
        driver.execute_script("window.scrollTo(0, 0);")  # Start from top
        scroll_to_load()

        # Find all match elements
        match_elements = driver.find_elements(By.CLASS_NAME, "fixture-card-in-focus")
        print(f"✅ Found {len(match_elements)} matches on {url}")

        for match in match_elements:
            try:
                # Match URL: href of the first link inside the card
                link_elements = match.find_elements(By.TAG_NAME, "a")
                match_link = link_elements[0].get_attribute("href") if link_elements else "N/A"

                # Teams and date are encoded in the URL itself
                team1, team2, match_date = _parse_match_link(match_link)

                # Extract match status (e.g., Full Time, Upcoming, Live)
                time_elements = match.find_elements(By.TAG_NAME, "time")
                match_status = time_elements[0].text.strip() if time_elements else "N/A"

                matches.append({
                    "Round URL": url,
                    "Match Date": match_date,
                    "Team 1": team1,
                    "Team 2": team2,
                    "Match Status": match_status,
                    "Match URL": match_link
                })

            except Exception as e:
                print(f"❌ Error extracting match: {e}")
finally:
    # Close the WebDriver even when the run is interrupted or fails unexpectedly
    driver.quit()

# Convert to DataFrame
df_matches = pd.DataFrame(matches)

# Display in Jupyter Notebook
from IPython.display import display
display(df_matches)


🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/1
⚠️ No cookie popup found or already accepted.
✅ Found 1 matches on https://www.unitedrugby.com/match-centre/2024-25/1
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/2
✅ Found 5 matches on https://www.unitedrugby.com/match-centre/2024-25/2
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/3
✅ Found 8 matches on https://www.unitedrugby.com/match-centre/2024-25/3
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/4
✅ Found 1 matches on https://www.unitedrugby.com/match-centre/2024-25/4
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/5
✅ Found 1 matches on https://www.unitedrugby.com/match-centre/2024-25/5
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/6
✅ Found 1 matches on https://www.unitedrugby.com/match-centre/2024-25/6
🔍 Scraping: https://www.unitedrugby.com/match-centre/2024-25/7
✅ Found 1 matches on https://www.unitedrugby.com/match-centre/2024-25/7
🔍 Scrapin

Unnamed: 0,Round URL,Match Date,Team 1,Team 2,Match Status,Match URL
0,https://www.unitedrugby.com/match-centre/2024-...,2024-09-20,Edinburgh Rugby,Leinster Rugby,,https://www.unitedrugby.com/match-centre/20240...
1,https://www.unitedrugby.com/match-centre/2024-...,2024-09-27,Leinster Rugby,Dragons Rfc,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
2,https://www.unitedrugby.com/match-centre/2024-...,2024-09-28,Zebre Parma,Munster Rugby,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
3,https://www.unitedrugby.com/match-centre/2024-...,2024-09-28,Scarlets,Cardiff Rugby,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
4,https://www.unitedrugby.com/match-centre/2024-...,2024-09-28,Connacht Rugby,Hollywoodbets Sharks,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
5,https://www.unitedrugby.com/match-centre/2024-...,2024-09-28,Ospreys,Dhl Stormers,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
6,https://www.unitedrugby.com/match-centre/2024-...,2024-10-04,Scarlets,Connacht Rugby,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
7,https://www.unitedrugby.com/match-centre/2024-...,2024-10-04,Cardiff Rugby,Glasgow Warriors,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
8,https://www.unitedrugby.com/match-centre/2024-...,2024-10-05,Emirates Lions,Edinburgh Rugby,FULL TIME,https://www.unitedrugby.com/match-centre/20240...
9,https://www.unitedrugby.com/match-centre/2024-...,2024-10-05,Vodacom Bulls,Ulster Rugby,FULL TIME,https://www.unitedrugby.com/match-centre/20240...


In [3]:
# Re-import necessary libraries
import pandas as pd

# Column layout shared by the past and future fixture tables
fixture_columns = ["Date", "Home Team", "Home Score", "Away Team", "Away Score", "TV Coverage", "Referee"]
score_columns = ["Home Score", "Away Score"]

# Data including past fixtures with TV coverage and referee details
past_fixtures = [
    ["2024-09-20", "Ulster", 20, "Glasgow Warriors", 19, "BBC Northern Ireland", "Andrew Brace"],
    ["2024-09-21", "Dragons", 23, "Ospreys", 21, "Premier Sports", "Adam Jones"],
    ["2024-09-21", "Munster", 35, "Connacht", 33, "TG4", "Frank Murphy"],
    ["2024-09-21", "Benetton", 20, "Scarlets", 20, "Premier Sports", "Gianluca Gnecchi"],
    ["2024-09-27", "Glasgow Warriors", 42, "Benetton", 10, "BBC Scotland", "Ben Whitehouse"],
    ["2024-09-27", "Leinster", 34, "Dragons", 6, "TG4", "Chris Busby"],
    ["2024-09-28", "Lions", 35, "Ulster", 22, "SuperSport", "Jaco Peyper"],
    ["2024-09-28", "Bulls", 22, "Edinburgh", 16, "SuperSport", "Marius van der Westhuizen"],
    ["2024-09-28", "Zebre Parma", 42, "Munster", 33, "Premier Sports", "Andrea Piardi"],
    ["2024-10-04", "Scarlets", 23, "Connacht", 24, "S4C", "Craig Evans"],
    ["2024-10-04", "Cardiff", 36, "Glasgow Warriors", 52, "BBC Wales", "Dan Jones"],
    ["2024-10-05", "Lions", 55, "Edinburgh", 21, "SuperSport", "AJ Jacobs"],
    ["2024-10-05", "Bulls", 47, "Ulster", 21, "SuperSport", "Marius Jonker"],
    ["2024-10-05", "Benetton", 5, "Leinster", 35, "Premier Sports", "Mike Adamson"],
    ["2024-10-05", "Dragons", 30, "Sharks", 33, "Premier Sports", "Andrew Brace"],
    ["2024-10-05", "Zebre", 5, "Stormers", 36, "Premier Sports", "Chris Busby"],
    ["2024-10-05", "Munster", 23, "Ospreys", 0, "TG4", "Frank Murphy"],
    ["2024-12-28", "Stormers", 24, "Sharks", 20, "SuperSport", "Marius van der Westhuizen"],
    ["2025-02-28", "Munster", 28, "Edinburgh", 34, "TG4", "Andrea Piardi"],
    ["2025-02-28", "Zebre", 31, "Dragons", 21, "Premier Sports", "Chris Busby"],
]

# Future fixtures without scores yet
future_fixtures = [
    ["2025-03-22", "Scarlets", None, "Stormers", None, "S4C", "To Be Announced"],
    ["2025-03-29", "Scarlets", None, "Ospreys", None, "S4C", "To Be Announced"],
    ["2025-04-18", "Edinburgh", None, "Sharks", None, "BBC Scotland", "To Be Announced"],
    ["2025-04-19", "Lions", None, "Benetton", None, "SuperSport", "To Be Announced"],
    ["2025-04-19", "Stormers", None, "Connacht", None, "SuperSport", "To Be Announced"],
    ["2025-04-19", "Ospreys", None, "Cardiff", None, "S4C", "To Be Announced"],
    ["2025-04-19", "Munster", None, "Bulls", None, "TG4", "To Be Announced"],
    ["2025-04-19", "Dragons", None, "Scarlets", None, "Premier Sports", "To Be Announced"],
    ["2025-04-19", "Zebre", None, "Glasgow Warriors", None, "Premier Sports", "To Be Announced"],
    ["2025-04-19", "Leinster", None, "Ulster", None, "TG4", "To Be Announced"],
    ["2025-04-26", "Scarlets", None, "Leinster", None, "S4C", "To Be Announced"],
    ["2025-05-11", "Lions", None, "Scarlets", None, "SuperSport", "To Be Announced"],
    ["2025-05-16", "Stormers", None, "Cardiff", None, "SuperSport", "To Be Announced"],
    ["2025-05-16", "Edinburgh", None, "Ulster", None, "BBC Scotland", "To Be Announced"],
    ["2025-05-16", "Munster", None, "Benetton", None, "TG4", "To Be Announced"],
    ["2025-05-17", "Bulls", None, "Dragons", None, "SuperSport", "To Be Announced"],
    ["2025-05-17", "Lions", None, "Ospreys", None, "SuperSport", "To Be Announced"],
    ["2025-05-17", "Zebre", None, "Connacht", None, "Premier Sports", "To Be Announced"],
    ["2025-05-17", "Sharks", None, "Scarlets", None, "SuperSport", "To Be Announced"],
    ["2025-05-17", "Leinster", None, "Glasgow Warriors", None, "TG4", "To Be Announced"],
]

# Convert to DataFrames
df_past = pd.DataFrame(past_fixtures, columns=fixture_columns)
df_future = pd.DataFrame(future_fixtures, columns=fixture_columns)

# Combine past and future fixtures.
# The score columns are cast to the nullable "Int64" dtype before concatenating:
# mixing integer scores with an all-missing column otherwise yields a dtype that
# depends on the pandas version (float64 turns 20 into "20.0" in the CSV).
# With Int64, played matches keep integer scores and unplayed ones stay empty.
score_dtypes = {column: "Int64" for column in score_columns}
df = pd.concat(
    [frame.astype(score_dtypes) for frame in (df_past, df_future)],
    ignore_index=True,
)

# Save updated CSV file
csv_filename_updated = "URC_2024_25_Fixtures_Results_Updated.csv"
df.to_csv(csv_filename_updated, index=False)

# Provide updated file link
csv_filename_updated


'URC_2024_25_Fixtures_Results_Updated.csv'

In [7]:
# Load the per-player dummy statistics
dummy_stats_file_path = "data/dummy_all_players_stats.csv"
df_dummy_stats = pd.read_csv(dummy_stats_file_path)

# Stat columns that are summed per club
match_stats_columns = [
    "points_scored",
    "tries_scored",
    "tackles_made",
    "total_tackles_missed",
    "turnovers_won",
    "turnovers_lost",
    "lineouts_won",
    "lineouts_lost",
    "scrums_won",
    "scrums_lost",
    "penalties_conceded",
]

# One row per club holding the sum of each selected stat
df_match_stats = (
    df_dummy_stats
    .groupby("club")[match_stats_columns]
    .sum()
    .reset_index()
)

# Write the aggregated stats to disk
match_stats_csv_path = "data/URC_2024_25_Match_Stats.csv"
df_match_stats.to_csv(match_stats_csv_path, index=False)

# Standings table: clubs ordered by total points scored (highest first).
# Wins / Draws / Losses are empty placeholders - they need real match results.
df_table_standings = (
    df_match_stats[["club", "points_scored"]]
    .rename(columns={"club": "Team", "points_scored": "Points"})
    .sort_values(by="Points", ascending=False)
    .reset_index(drop=True)
    .assign(Wins=None, Draws=None, Losses=None)
)

# Write the standings to disk
table_standings_csv_path = "data/URC_2024_25_Table_Standings.csv"
df_table_standings.to_csv(table_standings_csv_path, index=False)

# Show both output paths as the cell result
match_stats_csv_path, table_standings_csv_path


('data/URC_2024_25_Match_Stats.csv', 'data/URC_2024_25_Table_Standings.csv')

In [9]:
from pathlib import Path

import pandas as pd

# Define the table standings data.
# Row layout follows the column names used below: Team, P, W, D, L, PD, B, Pts.
table_data = [
    ["Leinster", 12, 12, 0, 0, 194, 9, 57],
    ["Glasgow Warriors", 12, 8, 0, 4, 129, 12, 44],
    ["Bulls", 12, 8, 0, 4, 67, 9, 41],
    ["Sharks", 12, 8, 0, 4, 47, 7, 39],
    ["Munster", 12, 6, 0, 6, -7, 9, 33],
    ["Cardiff Rugby", 12, 5, 1, 6, -58, 9, 31],
    ["Edinburgh", 12, 5, 0, 7, -5, 10, 30],
    ["Lions", 12, 6, 0, 6, 11, 5, 29],
    ["Connacht", 12, 5, 0, 7, -19, 9, 29],
    ["Scarlets", 12, 5, 1, 6, 16, 6, 28],
    ["Ospreys", 12, 5, 1, 6, 0, 6, 28],
    ["Stormers", 12, 5, 0, 7, -21, 8, 28],
    ["Benetton", 12, 5, 1, 6, -66, 6, 28],
    ["Ulster", 12, 5, 0, 7, -25, 7, 27],
    ["Zebre", 12, 5, 0, 7, -72, 5, 25],
    ["Dragons", 12, 1, 0, 11, -148, 4, 8]
]

# Create DataFrame for the updated table standings
df_updated_standings = pd.DataFrame(
    table_data,
    columns=["Team", "P", "W", "D", "L", "PD", "B", "Pts"]
)

# Sanity checks on the hand-entered rows: W + D + L adds up to P, and
# Pts equals 4 * W + 2 * D + B (both hold for every row above).
assert (df_updated_standings[["W", "D", "L"]].sum(axis=1) == df_updated_standings["P"]).all()
assert (
    4 * df_updated_standings["W"] + 2 * df_updated_standings["D"] + df_updated_standings["B"]
    == df_updated_standings["Pts"]
).all()

# Save to CSV file (create the target folder first so the write cannot fail
# just because "data/" does not exist yet)
table_standings_csv_path = "data/URC_2024_25_Table_Standings.csv"
Path(table_standings_csv_path).parent.mkdir(parents=True, exist_ok=True)
df_updated_standings.to_csv(table_standings_csv_path, index=False)

# Output file path
print(f"Table standings saved to: {table_standings_csv_path}")


Table standings saved to: data/URC_2024_25_Table_Standings.csv


In [None]:
import React, { useState } from 'react';

const HomePage = () => {
  // State for currently selected league and year
  const [selectedLeague, setSelectedLeague] = useState('urc');
  const [selectedYear, setSelectedYear] = useState('2023');

  // League dropdown options
  const leagueOptions = [
    { value: 'urc', label: 'URC (United Rugby Championship)' },
    { value: 'premiership', label: 'Premiership (English)' },
    { value: 'top14', label: 'Top 14 (French)' },
  ];

  // Example year options
  const yearOptions = ['2022', '2023', '2024', '2025'];

  // Example data for fixtures/results by league/year
  const fixturesData = {
    urc: {
      '2022': ['URC Match 2022-1', 'URC Match 2022-2'],
      '2023': ['URC Match 2023-1', 'URC Match 2023-2'],
      '2024': ['URC Match 2024-1', 'URC Match 2024-2'],
      '2025': ['URC Match 2025-1', 'URC Match 2025-2'],
    },
    premiership: {
      '2022': ['Premiership Match 2022-1', 'Premiership Match 2022-2'],
      '2023': ['Premiership Match 2023-1', 'Premiership Match 2023-2'],
      '2024': ['Premiership Match 2024-1', 'Premiership Match 2024-2'],
      '2025': ['Premiership Match 2025-1', 'Premiership Match 2025-2'],
    },
    top14: {
      '2022': ['Top 14 Match 2022-1', 'Top 14 Match 2022-2'],
      '2023': ['Top 14 Match 2023-1', 'Top 14 Match 2023-2'],
      '2024': ['Top 14 Match 2024-1', 'Top 14 Match 2024-2'],
      '2025': ['Top 14 Match 2025-1', 'Top 14 Match 2025-2'],
    },
  };

  // Example data for league tables by league
  const leagueTables = {
    urc: ['URC Team A', 'URC Team B', 'URC Team C'],
    premiership: ['Premiership Team A', 'Premiership Team B', 'Premiership Team C'],
    top14: ['Top 14 Team A', 'Top 14 Team B', 'Top 14 Team C'],
  };

  // Handlers for dropdown changes
  const handleLeagueChange = (event) => {
    setSelectedLeague(event.target.value);
    // Optionally reset the year if desired:
    // setSelectedYear('2023');
  };

  const handleYearChange = (event) => {
    setSelectedYear(event.target.value);
  };

  return (
    <div style={{ fontFamily: 'sans-serif' }}>
      {/* HEADER */}
      <header style={{ backgroundColor: '#f88', padding: '1rem' }}>
        <nav style={{ display: 'flex', gap: '1rem' }}>
          <a href="#home">Home</a>
          <a href="#teams">Teams</a>
          <a href="#about">About</a>
        </nav>
      </header>

      {/* MAIN CONTENT */}
      <main style={{ padding: '1rem' }}>
        {/* 1) NEWS SECTION */}
        <section
          style={{
            border: '1px solid #ccc',
            padding: '1rem',
            marginBottom: '1rem',
          }}
        >
          <h2>News</h2>
          <p>
            {/* Replace with actual rugby news or a dedicated component */}
            Latest news about the URC, Premiership, and Top 14...
          </p>
        </section>

        {/* 2) FIXTURES & RESULTS SECTION */}
        <section
          style={{
            border: '1px solid #ccc',
            padding: '1rem',
            marginBottom: '1rem',
          }}
        >
          <h2>Fixtures & Results</h2>
          <div>
            <label htmlFor="yearSelect">Select Year: </label>
            <select
              id="yearSelect"
              value={selectedYear}
              onChange={handleYearChange}
            >
              {yearOptions.map((year) => (
                <option key={year} value={year}>
                  {year}
                </option>
              ))}
            </select>
          </div>
          <ul>
            {fixturesData[selectedLeague][selectedYear].map((match, index) => (
              <li key={index}>{match}</li>
            ))}
          </ul>
        </section>

        {/* 3) LEAGUE TABLE SECTION */}
        <section
          style={{
            border: '1px solid #ccc',
            padding: '1rem',
            marginBottom: '1rem',
          }}
        >
          <h2>League Table</h2>
          <div>
            <label htmlFor="leagueSelect">Select League: </label>
            <select
              id="leagueSelect"
              value={selectedLeague}
              onChange={handleLeagueChange}
            >
              {leagueOptions.map((league) => (
                <option key={league.value} value={league.value}>
                  {league.label}
                </option>
              ))}
            </select>
          </div>
          <ul>
            {leagueTables[selectedLeague].map((team, index) => (
              <li key={index}>{team}</li>
            ))}
          </ul>
        </section>
      </main>

      {/* FOOTER */}
      <footer style={{ backgroundColor: '#f88', padding: '1rem' }}>
        <p>&copy; 2025 Your Rugby Website</p>
      </footer>
    </div>
  );
};

export default HomePage;
