# Selenium Functions Notes
This notebook summarizes the main Selenium functions/methods, what they do, and examples of usage.

## 1. Setting up Selenium
- `webdriver.Chrome()` or `webdriver.Firefox()` initializes a browser session.

In [13]:
from selenium import webdriver
driver = webdriver.Chrome(executable_path='chromedriver.exe')  # Open Chrome browser
driver.get('https://example.com')  # Navigate to a URL

TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

## 2. Locating Elements
- `find_element` finds the first matching element
- `find_elements` finds all matching elements
- `By.ID`, `By.CLASS_NAME`, `By.NAME`, `By.TAG_NAME`, `By.CSS_SELECTOR`, `By.XPATH` specify search methods

In [None]:
from selenium.webdriver.common.by import By

# Find single element
element = driver.find_element(By.ID, 'searchInput')
print(element.tag_name)

# Find multiple elements
all_links = driver.find_elements(By.TAG_NAME, 'a')
print(len(all_links))

## 3. Interacting with Elements
- `send_keys()` types into input boxes
- `click()` clicks buttons or links
- `clear()` clears text fields

In [None]:
search_box = driver.find_element(By.NAME, 'q')
search_box.send_keys('Selenium tutorial')  # Type text
search_box.clear()  # Clear text
search_box.send_keys('Python Selenium')
search_box.submit()  # Submit form

## 4. Navigation
- `driver.get(url)` open a page
- `driver.back()` go back
- `driver.forward()` go forward
- `driver.refresh()` refresh page

In [None]:
driver.get('https://books.toscrape.com/')
driver.back()
driver.forward()
driver.refresh()

## 9. Scraping FBref Premier League Table

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import csv
import time
import os

# Initialize driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats'
driver.get(url)
time.sleep(5)

# Get all team links from the league table
team_links_elements = driver.find_elements(By.CSS_SELECTOR, 'table#results2024-202591_overall a[href*="/en/squads/"]')
teams = []

for link in team_links_elements:
    team_name = link.text.strip()
    team_url = link.get_attribute('href')
    if team_name and team_url:
        teams.append((team_name, team_url))

print(f'Found {len(teams)} teams\n')

# Scrape each team's stats and matches
for team_name, team_url in teams:
    # Create directory for team
    os.makedirs(team_name, exist_ok=True)
    
    # Navigate to team page
    driver.get(team_url)
    time.sleep(3)

    # ===== SCRAPE PLAYER STATS =====
    try:
        stats_table = driver.find_element(By.ID, 'stats_standard_9')

        # Extract headers (first 5 columns only)
        header_row = stats_table.find_element(By.CSS_SELECTOR, 'thead tr:last-child')
        all_headers = header_row.find_elements(By.TAG_NAME, 'th')
        headers = [th.text.strip() for th in all_headers[:5]]

        # Extract body rows (first 5 columns only)
        rows = []
        body_trs = stats_table.find_elements(By.CSS_SELECTOR, 'tbody tr')
        
        for tr in body_trs:
            # Skip separator rows
            tr_class = tr.get_attribute('class') or ''
            if 'thead' in tr_class:
                continue
            
            # Get first 5 cells from this row
            cells = tr.find_elements(By.XPATH, './th | ./td')
            row = [cell.text.strip() for cell in cells[:17]]
            rows.append(row)

        # Write to CSV
        csv_path = os.path.join(team_name, f'{team_name}_stats.csv')
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(rows)

        print(f'✓ Saved {team_name} stats')
    
    except Exception as e:
        print(f'✗ Error scraping {team_name} stats: {e}')

    # ===== SCRAPE MATCHES =====
    try:
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.ID, "matchlogs_for")))
        
        matches_table = driver.find_element(By.ID, "matchlogs_for")
        
        # Extract headers
        column_headers = matches_table.find_elements(By.CSS_SELECTOR, "thead tr th")
        headers_match = [h.text.strip() for h in column_headers][:-2]
        headers_match.insert(0, "Team")
        
        # Extract matches rows
        matches_rows = matches_table.find_elements(By.CSS_SELECTOR, "tbody tr")
        
        matches_data = []
        for row in matches_rows:
            try:
                match_date = row.find_element(By.TAG_NAME, "th").text.strip()
                cells = row.find_elements(By.TAG_NAME, "td")
                row_data = [team_name, match_date] + [cell.text.strip() for cell in cells[:-2]]
                matches_data.append(row_data)
            except:
                continue
        
        # Write matches to CSV
        matches_csv_path = os.path.join(team_name, f'{team_name}_matches.csv')
        with open(matches_csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers_match)
            writer.writerows(matches_data)
        
        print(f'✓ Saved {team_name} matches ({len(matches_data)} matches)')
    
    except Exception as e:
        print(f'✗ Error scraping {team_name} matches: {e}')
    
    print()

driver.quit()
print('Scraping completed!')

Found 20 teams

✓ Saved Liverpool stats
✓ Saved Liverpool stats
✓ Saved Liverpool matches (58 matches)

✓ Saved Liverpool matches (58 matches)

✓ Saved Arsenal stats
✓ Saved Arsenal stats
✓ Saved Arsenal matches (60 matches)

✓ Saved Arsenal matches (60 matches)

✓ Saved Manchester City stats
✓ Saved Manchester City stats
✓ Saved Manchester City matches (59 matches)

✓ Saved Manchester City matches (59 matches)

✓ Saved Chelsea stats
✓ Saved Chelsea stats
✓ Saved Chelsea matches (59 matches)

✓ Saved Chelsea matches (59 matches)

✓ Saved Newcastle Utd stats
✓ Saved Newcastle Utd stats
✓ Saved Newcastle Utd matches (50 matches)

✓ Saved Newcastle Utd matches (50 matches)

✓ Saved Aston Villa stats
✓ Saved Aston Villa stats
✓ Saved Aston Villa matches (59 matches)

✓ Saved Aston Villa matches (59 matches)

✓ Saved Nott'ham Forest stats
✓ Saved Nott'ham Forest stats
✓ Saved Nott'ham Forest matches (45 matches)

✓ Saved Nott'ham Forest matches (45 matches)

✓ Saved Brighton stats
✓ Saved B

In [5]:
from sqlalchemy import create_engine
import pandas as pd
import os
import numpy as np

# ===== DATABASE CONNECTION =====
print("Connecting to PostgreSQL database...\n")

DATABASE_URL = "postgresql://postgres:Ren-ji24@localhost:5432/foot_ball"

try:
    engine = create_engine(DATABASE_URL)
    connection = engine.connect()
    print("✓ Connected to foot_ball database\n")
    connection.close()
except Exception as e:
    print(f"✗ Connection error: {e}")
    exit()

# ===== LOAD ALL TEAM DATA =====
print("Loading all team data from directories...\n")

all_players = []
all_matches = []

team_dirs = [d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))]

for team_name in team_dirs:
    team_path = os.path.join('data', team_name)
    
    # Load player stats
    stats_file = os.path.join(team_path, f'{team_name}_stats.csv')
    if os.path.exists(stats_file):
        try:
            df_stats = pd.read_csv(stats_file)
            df_stats['team'] = team_name
            all_players.append(df_stats)
            print(f"  ✓ Loaded {team_name} stats")
        except Exception as e:
            print(f"  ✗ Error loading {team_name} stats: {e}")
    
    # Load matches
    matches_file = os.path.join(team_path, f'{team_name}_matches.csv')
    if os.path.exists(matches_file):
        try:
            df_matches = pd.read_csv(matches_file)
            all_matches.append(df_matches)
            print(f"  ✓ Loaded {team_name} matches")
        except Exception as e:
            print(f"  ✗ Error loading {team_name} matches: {e}")

# Combine all data
players_df = pd.concat(all_players, ignore_index=True) if all_players else pd.DataFrame()
matches_df = pd.concat(all_matches, ignore_index=True) if all_matches else pd.DataFrame()

print(f"\nTotal: {len(players_df)} players, {len(matches_df)} matches\n")

# ===== NETTOYAGE ET TRANSFORMATION =====
print("Cleaning and transforming data...\n")

# --- PLAYERS CLEANING ---
print("Cleaning players data...")
# Remove duplicates
players_df.drop_duplicates(inplace=True)
# Remove completely empty rows
players_df = players_df.dropna(how='all')
# Standardize column names
players_df.columns = players_df.columns.str.strip().str.replace(' ', '_').str.lower()
# Remove rows where Player name is missing
players_df = players_df.dropna(subset=['player'])

print(f"  ✓ Players after cleaning: {len(players_df)}\n")

# --- MATCHES CLEANING ---
print("Cleaning matches data...")
# Remove duplicates
matches_df.drop_duplicates(inplace=True)
# Remove completely empty rows
matches_df = matches_df.dropna(how='all')
# Standardize column names
matches_df.columns = matches_df.columns.str.strip().str.replace(' ', '_').str.lower()
# Convert date column to datetime
if 'date' in matches_df.columns:
    matches_df['date'] = pd.to_datetime(matches_df['date'], errors='coerce')
# Remove rows where date is missing
matches_df = matches_df.dropna(subset=['date'])
# Convert numeric columns
numeric_cols = ['gf', 'ga', 'xg', 'xga', 'poss']
for col in numeric_cols:
    if col in matches_df.columns:
        matches_df[col] = pd.to_numeric(matches_df[col], errors='coerce')

print(f"  ✓ Matches after cleaning: {len(matches_df)}\n")

# ===== INSERT TEAMS =====
print("Inserting teams...\n")
teams_df = players_df[['team']].drop_duplicates().rename(columns={'team': 'nomequipe'})
teams_df['idcompetition'] = 1
teams_df['idsaison'] = 1

try:
    teams_df.to_sql('equipe', engine, if_exists='append', index=False)
    print(f"✓ Inserted {len(teams_df)} teams\n")
except Exception as e:
    print(f"Note: Teams may already exist: {e}\n")

# ===== INSERT PLAYERS =====
print("Inserting players...\n")

# Get team IDs from database
with engine.connect() as conn:
    teams_query = pd.read_sql("SELECT idequipe, nomequipe FROM equipe", conn)
    team_mapping = dict(zip(teams_query['nomequipe'], teams_query['idequipe']))

# Prepare players data
players_insert = players_df.copy()
players_insert['id_equipe'] = players_insert['team'].map(team_mapping)
players_insert = players_insert.rename(columns={
    'player': 'nomjoueur',
    'pos': 'position',
    'nation': 'nationalite'
})
players_insert = players_insert[['nomjoueur', 'position', 'nationalite', 'id_equipe']].dropna(subset=['nomjoueur'])

try:
    players_insert.to_sql('joueur', engine, if_exists='append', index=False)
    print(f"✓ Inserted {len(players_insert)} players\n")
except Exception as e:
    print(f"Error inserting players: {e}\n")

# ===== INSERT MATCHES =====
print("Inserting matches...\n")

# Prepare matches data
matches_insert = matches_df.copy()
matches_insert['idteamhome'] = matches_insert['team'].map(team_mapping)
matches_insert['id_competition'] = 1
matches_insert['id_saison'] = 1
matches_insert = matches_insert.rename(columns={
    'date': 'date_match',
    'time': 'heure',
    'round': 'round_match',
    'result': 'resultat',
    'venue': 'venue'
})

# Select only required columns
required_cols = ['date_match', 'heure', 'round_match', 'venue', 'idteamhome', 'id_competition', 'id_saison', 'resultat']
available_cols = [col for col in required_cols if col in matches_insert.columns]
matches_insert = matches_insert[available_cols].dropna(subset=['date_match'])

try:
    matches_insert.to_sql('match', engine, if_exists='append', index=False)
    print(f"✓ Inserted {len(matches_insert)} matches\n")
except Exception as e:
    print(f"Error inserting matches: {e}\n")

# ===== VERIFY DATA =====
print("Verifying inserted data...\n")

with engine.connect() as conn:
    teams_count = pd.read_sql("SELECT COUNT(*) as count FROM equipe", conn)
    players_count = pd.read_sql("SELECT COUNT(*) as count FROM joueur", conn)
    matches_count = pd.read_sql("SELECT COUNT(*) as count FROM match", conn)
    
    print(f"Teams in database: {teams_count['count'][0]}")
    print(f"Players in database: {players_count['count'][0]}")
    print(f"Matches in database: {matches_count['count'][0]}")

print("\n✓ Data loading and transformation completed!")

Connecting to PostgreSQL database...

✓ Connected to foot_ball database

Loading all team data from directories...

  ✓ Loaded Arsenal stats
  ✓ Loaded Arsenal matches
  ✓ Loaded Aston Villa stats
  ✓ Loaded Aston Villa matches
  ✓ Loaded Bournemouth stats
  ✓ Loaded Bournemouth matches
  ✓ Loaded Brentford stats
  ✓ Loaded Brentford matches
  ✓ Loaded Brighton stats
  ✓ Loaded Brighton matches
  ✓ Loaded Chelsea stats
  ✓ Loaded Chelsea matches
  ✓ Loaded Crystal Palace stats
  ✓ Loaded Crystal Palace matches
  ✓ Loaded Everton stats
  ✓ Loaded Everton matches
  ✓ Loaded Fulham stats
  ✓ Loaded Fulham matches
  ✓ Loaded Ipswich Town stats
  ✓ Loaded Ipswich Town matches
  ✓ Loaded Leicester City stats
  ✓ Loaded Leicester City matches
  ✓ Loaded Liverpool stats
  ✓ Loaded Liverpool matches
  ✓ Loaded Manchester City stats
  ✓ Loaded Manchester City matches
  ✓ Loaded Manchester Utd stats
  ✓ Loaded Manchester Utd matches
  ✓ Loaded Newcastle Utd stats
  ✓ Loaded Newcastle Utd matches
