In [116]:
# -------------------------------------------------------------
# CONFIGURATION - Must update this section with values you want.
# -------------------------------------------------------------
DB_PATH = "db/Track.db"
YEAR = 2025  # valid options: 2023, 2024, 2025
GENDER = "Girls"  # valid options: Girls, Boys
MEET_TYPE_LIST = ["State"]  # valid options: Sectional, Regional, State
SEC_TO_PROCESS = list(range(1, 33))  # sectional numbers to process; range(1, 33) selects all
REG_TO_PROCESS = list(range(1, 9))  # regional numbers to process; range(1, 9) selects all

# TFRRS URL suffix (config_hnd query string) for the selected year.
# A year that is not listed here gets an empty suffix.
tfrrs_url_suffix = {
    2025: "?config_hnd=395",
    2024: "?config_hnd=342",
    2023: "?config_hnd=299",
}.get(YEAR, "")
    
# -------------------------------------------------------------
# IMPORTS
# -------------------------------------------------------------
import requests
import pandas as pd
import time
import logging
import re
import gc
import sys
import random
import unicodedata
from bs4 import BeautifulSoup
from util.db_util import Database
from util.conversion_util import Conversion
from datetime import datetime
from difflib import SequenceMatcher
from sqlite3 import IntegrityError


# -------------------------------------------------------------
# GLOBALS INIT
# -------------------------------------------------------------
# One HTTP session shared by every request made through safe_get().
session = requests.Session()
# Collected warnings; log_warning() appends rows and the frame is written to
# the "... Warnings.csv" file at the end of each cell.
warningDF = pd.DataFrame(columns=["warning", "id", "desc"])

# -------------------------------------------------------------
# WEB SCRAPER START
# -------------------------------------------------------------
print("Web scrapping started...one moment please.")
# Wall-clock start, used to report the elapsed time once scraping finishes.
start_time = datetime.now()
print(f"Start time: {start_time.strftime('%H:%M:%S')}")

# -------------------------------------------------------------
# USER AGENTS
# -------------------------------------------------------------
# safe_get() picks one of these at random for every request attempt.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/116.0.5845.140 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/17.6 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Firefox/140.0",
]

# -------------------------------------------------------------
# WARNINGS
# -------------------------------------------------------------
# Text stored in the "warning" column of warningDF by log_warning().
W_ATHLETE_NOT_FOUND = "Athlete not found."
W_TEAM_NOT_FOUND = "Team not found."
W_ROSTER_NOT_FOUND = "Roster not found."
W_RELAY_ATHLETES_NOT_FOUND = "Relay athletes not found."
W_GRADE_LEVEL_NOT_FOUND = "Grade level not found."
W_ATTEMPT_FAILED = "Attempt failed for URL." 
W_DUPLICATE_ATHLETE = "Duplicate athlete." 

# -------------------------------------------------------------
# EVENTS LIST
# -------------------------------------------------------------
convert = Conversion()

# Every entry is (gender, result type, event name). The short hurdles race is
# the only event whose name depends on gender, and it is listed first.
events = [
    (GENDER, category, "100 Hurdles" if GENDER == "Girls" else "110 Hurdles")
    for category in ("Prelim", "Track")
] + [
    (GENDER, category, title)
    for category, title in (
        ("Prelim", "100 Meters"), ("Prelim", "200 Meters"),
        ("Track", "100 Meters"), ("Track", "200 Meters"),
        ("Track", "400 Meters"), ("Track", "800 Meters"),
        ("Track", "1600 Meters"), ("Track", "3200 Meters"),
        ("Track", "300 Hurdles"), ("Relay", "4 x 100 Relay"),
        ("Relay", "4 x 400 Relay"), ("Relay", "4 x 800 Relay"),
        ("Field", "High Jump"), ("Field", "Long Jump"),
        ("Field", "Discus"), ("Field", "Shot Put"),
        ("Field", "Pole Vault"),
    )
]

# -------------------------------------------------------------
# CREATE CACHES
# -------------------------------------------------------------
# team URL -> parsed team page (BeautifulSoup), or None when the fetch failed
team_cache = {}
# (team URL, upper-cased full name) -> (first, last, grad_year) from get_name()
athlete_url_cache = {}
# (first, last, school_id, grad_year) -> athlete id from process_athlete()
athlete_db_cache = {}

# -------------------------------------------------------------
# PROCESS WARNING FUNCTION
# -------------------------------------------------------------
def log_warning(warning, id, desc):
    """Log a warning and record it in the global ``warningDF``.

    The message is always sent to ``logging.warning``. A row is appended to
    ``warningDF`` only when no existing row already carries the same ``id``,
    so each id is recorded at most once regardless of the warning text.

    Args:
        warning: Warning category text.
        id: Identifier of the thing the warning is about (name, URL, ...).
        desc: Extra detail stored alongside the warning.
    """
    logging.warning(f" {warning} | {id} | {desc}")

    already_recorded = id in warningDF["id"].values
    if already_recorded:
        return

    next_row = len(warningDF)
    warningDF.loc[next_row] = {"warning": warning, "id": id, "desc": desc}
        
# -------------------------------------------------------------
# SAFE_GET FUNCTION
# -------------------------------------------------------------
def safe_get(url, delay=(0, 3), retries=5):
    """GET ``url`` through the shared session, retrying on any failure.

    Args:
        url: Address to fetch.
        delay: Pause before every attempt. A number is used as a fixed number
            of seconds; any other value is unpacked into ``random.uniform``.
        retries: Number of additional attempts after the first one.

    Returns:
        The response when an attempt succeeds (``raise_for_status`` passed),
        otherwise ``None`` once every attempt has failed; the final failure
        is recorded through ``log_warning``.
    """
    fixed_pause = isinstance(delay, (int, float))

    for attempt in range(retries + 1):
        # Pause before each attempt, including the first.
        time.sleep(delay if fixed_pause else random.uniform(*delay))

        try:
            reply = session.get(
                url,
                headers={
                    "User-Agent": random.choice(USER_AGENTS),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Referer": "https://www.tfrrs.org/",
                },
                timeout=40,
            )
            reply.raise_for_status()
        except Exception as err:
            if attempt == retries:
                # Out of attempts: record the failure and give up.
                log_warning(warning=W_ATTEMPT_FAILED, id=url, desc=err)
                return None
            # Capped exponential backoff with jitter before the next attempt.
            time.sleep(min(5, (2 ** attempt)) + random.uniform(0, 1))
        else:
            return reply

# -------------------------------------------------------------
# GET_TEAM_PAGE FUNCTION
# -------------------------------------------------------------
def get_team_page(url):
    """Return the parsed team page for ``url``, fetching it at most once.

    The result is memoised in ``team_cache``. A failed fetch is cached as
    ``None`` so the same URL is never requested again.
    """
    if url in team_cache:
        return team_cache[url]

    fetched = safe_get(url)
    page = None if fetched is None else BeautifulSoup(fetched.content, 'lxml')
    team_cache[url] = page
    return page

# -------------------------------------------------------------
# NORMALIZE_NAME FUNCTION
# -------------------------------------------------------------
def normalize_name(name: str) -> str:
    """Normalize an athlete name for exact/fuzzy matching.

    Steps, in order:
    - Decompose with NFKD and drop combining marks (removes accents).
    - Turn special spaces (non-breaking, thin, zero-width, ...) into " ".
    - Remove every character that is neither a word character nor
      whitespace. Letters, digits and underscores are kept.
    - Collapse whitespace runs to a single space and strip the ends.
    - Uppercase.

    Punctuation is removed *before* whitespace is collapsed, so removing a
    free-standing punctuation mark ("Mary - Jane", "Ann .") cannot leave a
    double or trailing space in the result.

    Args:
        name: Raw name text; ``None`` or an empty string is allowed.

    Returns:
        The normalized name, or "" when ``name`` is empty/``None``.
    """
    if not name:
        return ""

    # Unicode normalize to separate accents from their base letters
    name = unicodedata.normalize("NFKD", name)

    # Remove accents (combining marks)
    name = "".join(c for c in name if not unicodedata.combining(c))

    # Replace all weird spaces (thin, hair, non-breaking, zero-width) with normal space
    name = re.sub(r"[\u00A0\u2000-\u200B\u202F\u205F\u3000]", " ", name)

    # Remove punctuation first: doing this after collapsing would re-create
    # double spaces wherever a punctuation mark stood between two spaces.
    name = re.sub(r"[^\w\s]", "", name)

    # Collapse whitespace runs and trim the ends
    name = re.sub(r"\s+", " ", name).strip()

    # Uppercase for case-insensitive comparison
    return name.upper()


# -------------------------------------------------------------
# TEAM_MAPPING FUNCTION
# -------------------------------------------------------------
def team_mapping(team):
    """Translate a results-page team name into the name used on TFRRS.

    Only the few known aliases are rewritten; any other value is returned
    unchanged.
    """
    known_aliases = (
        ("Elkhart Christian", "Elkhart Christian Academy"),
        ("DeMotte Christian", "Covenant Christian DeMotte"),
        ("LCA", "Lighthouse Christian Academy"),
    )
    for alias, official_name in known_aliases:
        if team == alias:
            return official_name
    return team

# -------------------------------------------------------------
# GRADE_NORMALIZATION FUNCTION
# -------------------------------------------------------------
def grade_normalization(grade):
    """Map a grade label to one of "FR", "SO", "JR", "SR" or "Unknown".

    Accepts the spelled-out name, the two-letter abbreviation or the grade
    number (9-12). Matching ignores case and surrounding whitespace, and the
    number may arrive as a string, an int or an integral float (a CSV column
    read back through pandas can yield 9 or 9.0 instead of "9").

    Args:
        grade: Grade label or number; ``None`` and NaN are allowed.

    Returns:
        The two-letter class code, or "Unknown" when the value is not
        recognised.
    """
    aliases = {
        "FRESHMAN": "FR", "FR": "FR", "9": "FR",
        "SOPHOMORE": "SO", "SO": "SO", "10": "SO",
        "JUNIOR": "JR", "JR": "JR", "11": "JR",
        "SENIOR": "SR", "SR": "SR", "12": "SR",
    }

    if grade is None:
        return "Unknown"

    if isinstance(grade, float):
        if grade != grade:  # NaN never equals itself
            return "Unknown"
        if grade.is_integer():
            # 9.0 -> 9 so that it stringifies as "9" rather than "9.0"
            grade = int(grade)

    return aliases.get(str(grade).strip().upper(), "Unknown")

# -------------------------------------------------------------
# PROCESS_ATHLETE FUNCTION (get or create athlete)
# -------------------------------------------------------------
def process_athlete(first, last, school_id, gender, grad_year):
    """Return the athlete id for this person, inserting the athlete if needed.

    Names are upper-cased and stripped before use. Lookups are memoised in
    ``athlete_db_cache`` under (first, last, school_id, grad_year). On a cache
    miss the database is queried, and a new athlete row is inserted (without
    committing) when none exists.
    """
    normalized_first = (first or "").upper().strip()
    normalized_last = (last or "").upper().strip()
    cache_key = (normalized_first, normalized_last, school_id, grad_year)

    try:
        return athlete_db_cache[cache_key]
    except KeyError:
        pass

    found_id = db.get_athlete_id(normalized_first, normalized_last, school_id, grad_year)
    if found_id is None:
        found_id = db.insert_athlete(
            school_id, normalized_first, normalized_last, gender, grad_year, commit=False
        )

    athlete_db_cache[cache_key] = found_id
    return found_id

# -------------------------------------------------------------
# FUZZY_RATIO FUNCTION
# -------------------------------------------------------------
def fuzzy_ratio(a, b):
    """Return the case-insensitive similarity of two strings.

    The value is ``SequenceMatcher.ratio()`` of the lower-cased inputs, or 0
    when either input is empty/``None``.
    """
    if a and b:
        matcher = SequenceMatcher(None, a.lower(), b.lower())
        return matcher.ratio()
    return 0

# -------------------------------------------------------------
# FUZZY_NAME_MATCH FUNCTION
# -------------------------------------------------------------
def fuzzy_name_match(name1, name2, threshold=0.85):
    """Compare two names and report whether they are similar enough.

    Returns:
        ``(is_match, similarity)`` where ``similarity`` comes from
        ``fuzzy_ratio`` and ``is_match`` is true when it reaches ``threshold``.
    """
    similarity = fuzzy_ratio(name1, name2)
    is_match = similarity >= threshold
    return is_match, similarity
    
# -------------------------------------------------------------
# build_tfrrs_url
# -------------------------------------------------------------

def build_tfrrs_url(team, gender, suffix):
    """Build the TFRRS team-page URL for a team and gender.

    The team name is stripped, reduced to letters, digits, "_", "-" and
    spaces, loses a trailing " HS" (any case), and has its spaces turned into
    underscores. Boys' teams ("boys", "m", "male", any case) use the
    "_m.html" page and everything else uses "_f.html". ``suffix`` is appended
    verbatim.
    """
    slug = (team or "").strip()
    slug = re.sub(r'[^a-zA-Z0-9_ -]', '', slug)
    slug = re.sub(r'\sHS$', '', slug, flags=re.IGNORECASE)
    slug = re.sub(r'\s', '_', slug)

    is_boys_team = (gender or "").lower() in ("boys", "m", "male")
    page = "_m.html" if is_boys_team else "_f.html"

    return f"https://indiana.tfrrs.org/teams/tf/{slug}" + page + suffix

# -------------------------------------------------------------
# parse_roster_table
# -------------------------------------------------------------
def parse_roster_table(team_soup):
    """Pull the roster out of a parsed team page.

    The roster is the first table that follows an ``<h3>`` whose text
    contains "ROSTER" (any case).

    Returns:
        ``(table, entries)``. ``table`` is the roster table tag, or ``None``
        when no roster was found, in which case ``entries`` is empty.
        Each entry is ``(normalized first, normalized last, first, last,
        grade text)``, built from the first two cells of a row.
    """
    heading = team_soup.find("h3", string=re.compile("ROSTER", re.I))
    if not heading:
        return None, []

    table = heading.find_next("table")
    if table is None:
        return None, []

    entries = []
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        if len(cells) < 2:
            # Header rows and malformed rows carry fewer than two data cells.
            continue

        raw_name = cells[0].get_text(strip=True)
        grade = cells[1].get_text(strip=True) if cells[1] else None
        if not raw_name or not raw_name.strip():
            continue

        if "," in raw_name:
            # "Last, First" layout
            surname, given = map(str.strip, raw_name.split(",", 1))
        else:
            # "First Last ..." layout: the first word is the given name
            given, *remainder = raw_name.split()
            surname = " ".join(remainder)

        entries.append(
            (normalize_name(given), normalize_name(surname), given, surname, grade)
        )

    return table, entries

# -------------------------------------------------------------
# try_exact_match
# -------------------------------------------------------------
def try_exact_match(name_parts, roster_list):
    """Look for a roster entry whose normalized first/last names match exactly.

    Every way of cutting ``name_parts`` into a non-empty first name and a
    non-empty last name is tried, shortest first name first.

    Returns:
        ``(True, first, last, grade_text)`` for the first hit, otherwise
        ``(False, "", "", None)``.
    """
    for cut in range(1, len(name_parts)):
        wanted = (
            normalize_name(" ".join(name_parts[:cut])),
            normalize_name(" ".join(name_parts[cut:])),
        )
        for first_norm, last_norm, first_raw, last_raw, grade in roster_list:
            if (first_norm, last_norm) == wanted:
                return True, first_raw, last_raw, grade

    return False, "", "", None

# -------------------------------------------------------------
# try_fuzzy_match
# -------------------------------------------------------------
def try_fuzzy_match(name_parts, roster_list, full_name, url):
    """Fuzzy-match a name against the roster and return the best candidate.

    The first word of ``name_parts`` is treated as the first name and the
    rest as the last name. A roster entry is acceptable when either
    - first-name similarity >= 0.75 and last-name similarity >= 0.88, or
    - similarity of the concatenated first+last names >= 0.80.

    Among acceptable entries the one with the highest full-name similarity
    wins (earliest roster row on ties). Returning the first acceptable row
    instead could bind a result to a similarly named team-mate listed earlier
    in the roster.

    Args:
        name_parts: Whitespace-split pieces of the athlete's name.
        roster_list: Entries as produced by ``parse_roster_table``.
        full_name: Original name, used only in the warning.
        url: Team page URL, used only in the warning.

    Returns:
        ``(True, first, last, grade_text)`` for the chosen entry (a
        "Fuzzy match used" warning is logged), otherwise
        ``(False, "", "", None)``.
    """
    if not name_parts:
        return False, "", "", None

    norm_first = normalize_name(name_parts[0])
    norm_last = normalize_name(" ".join(name_parts[1:]))
    norm_full = norm_first + norm_last

    best_entry = None
    best_score = -1.0

    for r_first_norm, r_last_norm, r_first, r_last, grade_text in roster_list:
        first_score = fuzzy_ratio(r_first_norm, norm_first)
        last_score = fuzzy_ratio(r_last_norm, norm_last)
        full_score = fuzzy_ratio(r_first_norm + r_last_norm, norm_full)

        acceptable = (first_score >= 0.75 and last_score >= 0.88) or full_score >= 0.80
        # Strict ">" keeps the earliest roster row when scores tie.
        if acceptable and full_score > best_score:
            best_score = full_score
            best_entry = (r_first, r_last, grade_text)

    if best_entry is None:
        return False, "", "", None

    r_first, r_last, grade_text = best_entry
    log_warning(
        "Fuzzy match used",
        id=f"{full_name} <-> {r_first} {r_last}",
        desc=f"{url}"
    )
    return True, r_first, r_last, grade_text

# -------------------------------------------------------------
# try_top_marks_match
# -------------------------------------------------------------
def try_top_marks_match(team_soup, name_parts, full_name, url):
    """Fuzzy-match a name against the team page's "Top Marks" table.

    Used as a last resort when the roster gave no match. The table is the
    first one after an ``<h2>``/``<h3>`` whose text contains "top marks"
    (any case). In each row the second cell is read as the athlete name and
    the third cell, when present, as the grade text.

    Acceptance thresholds are the same as for the roster fuzzy match
    (first >= 0.75 and last >= 0.88, or full name >= 0.80). Among acceptable
    rows the one with the highest full-name similarity wins (earliest row on
    ties), rather than whichever acceptable row happens to come first.

    Args:
        team_soup: Parsed team page.
        name_parts: Whitespace-split pieces of the athlete's name.
        full_name: Original name, used only in the warning.
        url: Team page URL, used only in the warning.

    Returns:
        ``(True, first, last, grade_text)`` for the chosen row (a
        "Top Marks used" warning is logged), otherwise
        ``(False, "", "", None)``.
    """
    if not name_parts:
        return False, "", "", None

    top_marks_header = next(
        (
            h for h in team_soup.find_all(["h2", "h3"])
            if "top marks" in h.get_text(" ", strip=True).lower()
        ),
        None,
    )
    if top_marks_header is None:
        return False, "", "", None

    tm_table = top_marks_header.find_next("table")
    if tm_table is None:
        return False, "", "", None

    norm_first = normalize_name(name_parts[0])
    norm_last = normalize_name(" ".join(name_parts[1:]))
    norm_full = norm_first + norm_last

    best_entry = None
    best_score = -1.0

    for row in tm_table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        name_text = cols[1].get_text(strip=True)
        if not name_text or not name_text.strip():
            continue
        grade_text = cols[2].get_text(strip=True) if len(cols) > 2 else None

        # Parse name: "Last, First" or "First Last ..."
        if "," in name_text:
            tm_last, tm_first = map(str.strip, name_text.split(",", 1))
        else:
            tm_first, *rest = name_text.split()
            tm_last = " ".join(rest)

        tm_first_norm = normalize_name(tm_first)
        tm_last_norm = normalize_name(tm_last)

        first_score = fuzzy_ratio(tm_first_norm, norm_first)
        last_score = fuzzy_ratio(tm_last_norm, norm_last)
        full_score = fuzzy_ratio(tm_first_norm + tm_last_norm, norm_full)

        acceptable = (first_score >= 0.75 and last_score >= 0.88) or full_score >= 0.80
        # Strict ">" keeps the earliest row when scores tie.
        if acceptable and full_score > best_score:
            best_score = full_score
            best_entry = (tm_first, tm_last, grade_text)

    if best_entry is None:
        return False, "", "", None

    tm_first, tm_last, grade_text = best_entry
    log_warning(
        "Top Marks used",
        id=f"{full_name} <-> {tm_first} {tm_last}",
        desc=url
    )
    return True, tm_first, tm_last, grade_text

# -------------------------------------------------------------
# calculate_grad_year
# -------------------------------------------------------------
def calculate_grad_year(grade_text, year, full_name):
    """Derive the graduation year from a grade label and the season year.

    Seniors graduate in ``year``, juniors one year later, sophomores two and
    freshmen three.

    Args:
        grade_text: Raw grade label as scraped (may be ``None``).
        year: Season year the result belongs to.
        full_name: Athlete name, used as the warning id.

    Returns:
        The graduation year, or 9999 when the grade cannot be interpreted.
        In that case a warning is logged whose description includes the raw
        grade text, so the unrecognised value can be identified; logging only
        the normalized value would always read "Unknown".
    """
    years_until_graduation = {"SR": 0, "JR": 1, "SO": 2, "FR": 3}

    grade_code = grade_normalization(grade_text)
    offset = years_until_graduation.get(grade_code)

    if offset is None:
        log_warning(
            warning=W_GRADE_LEVEL_NOT_FOUND,
            id=full_name,
            desc=f"{grade_code} (raw: {grade_text!r})",
        )
        return 9999

    return year + offset


# -------------------------------------------------------------
# get_name FUNCTION 
# -------------------------------------------------------------
def get_name(full_name, team, gender, year, suffix=""):
    """Resolve an athlete's first name, last name and graduation year.

    The athlete is looked up on the team's TFRRS page using, in order:
    1. an exact roster match over every first/last split of the name,
    2. a fuzzy roster match,
    3. a fuzzy match in the "Top Marks" table.

    Returns:
        ``(first, last, grad_year)``, names upper-cased.
        - Empty name: ``("", "", 9999)``.
        - Team page or roster unavailable, or athlete not found: the
          upper-cased full name in both name slots and 9999, with warnings
          logged.
        Results obtained after the roster was parsed are cached in
        ``athlete_url_cache`` under (team URL, upper-cased full name).
    """
    team = team_mapping(team)
    full_name = (full_name or "").strip()
    name_parts = full_name.split()
    if not name_parts:
        return "", "", 9999

    shouted_name = full_name.upper()
    url = build_tfrrs_url(team, gender, suffix)

    # Team page (itself cached per URL by get_team_page)
    team_soup = get_team_page(url)
    if team_soup is None:
        log_warning(warning=W_TEAM_NOT_FOUND, id=team, desc=url)
        log_warning(warning=W_ATHLETE_NOT_FOUND, id=full_name, desc=url)
        return shouted_name, shouted_name, 9999

    # Previously resolved athlete on this team page?
    key = (url, shouted_name)
    if key in athlete_url_cache:
        return athlete_url_cache[key]

    roster_table, roster_list = parse_roster_table(team_soup)
    if roster_table is None:
        log_warning(warning=W_ROSTER_NOT_FOUND, id=team, desc=url)
        log_warning(warning=W_ATHLETE_NOT_FOUND, id=full_name, desc=url)
        return shouted_name, shouted_name, 9999

    # Each strategy runs only when the previous one found nothing.
    found, hit_first, hit_last, hit_grade = try_exact_match(name_parts, roster_list)
    if not found:
        found, hit_first, hit_last, hit_grade = try_fuzzy_match(
            name_parts, roster_list, full_name, url
        )
    if not found:
        found, hit_first, hit_last, hit_grade = try_top_marks_match(
            team_soup, name_parts, full_name, url
        )

    if found:
        grad_year = calculate_grad_year(hit_grade, year, full_name)
        outcome = (hit_first.upper(), hit_last.upper(), grad_year)
    else:
        # Still not found - fall back to the original name uppercased
        log_warning(warning=W_ATHLETE_NOT_FOUND, id=full_name, desc=url)
        outcome = (shouted_name, shouted_name, 9999)

    athlete_url_cache[key] = outcome
    return outcome

# -------------------------------------------------------------
# MAIN LOGIC - SCRAPING MEETS & EVENTS (build master_result_list)
# -------------------------------------------------------------
# Open the database used by both notebook cells.
db = Database(DB_PATH)

# Fetch the TFRRS tournament index for the year. It is used below to discover
# the result-page ids of Sectional/Regional meets; any failure leaves
# soup = None and the discovery step is skipped.
try:
    url = f'https://in.tfrrs.org/tournament.html?year={YEAR}'
    response = safe_get(url)
    soup = BeautifulSoup(response.text, 'html.parser') if response is not None else None
except Exception:
    soup = None
    
# For Sectional and Regional meets, read the ids of meet 1 and meet 2 from
# the index page and store the first id plus the step between consecutive
# meets. Other meet types (e.g. "State") are not discovered here and rely on
# what the database already holds.
for meet_type in MEET_TYPE_LIST:
    if soup is None:
        break
    if meet_type == "Sectional":
        link1 = soup.find_all('a', href=True, string='Section 1')
        link2 = soup.find_all('a', href=True, string='Section 2')
    elif meet_type == "Regional":
        link1 = soup.find_all('a', href=True, string='Region 1')
        link2 = soup.find_all('a', href=True, string='Region 2')
    else:
        continue

    if link1 and link2:
        # The first matching link is used for Girls and the second for Boys.
        # NOTE(review): presumably the id is the 5th "/"-separated part of the
        # href - confirm against the live page.
        try:
            link_id1 = link1[0]['href'].split('/')[4] if GENDER == "Girls" else link1[1]['href'].split('/')[4]
            link_id2 = link2[0]['href'].split('/')[4] if GENDER == "Girls" else link2[1]['href'].split('/')[4]
            increment = abs(int(link_id1) - int(link_id2))
            db.insert_tfrrs(YEAR, GENDER, meet_type, link_id1, increment, commit=False)
        except Exception as e:
            logging.warning(f"Could not parse link ids for {meet_type}: {e}")

# Build one tuple per requested meet type:
# (meet_type, girls_link_id, boys_link_id, girls_increment, boys_increment, num_meets)
# Missing database rows fall back to link id 0 and increment 1.
df_info = db.get_tfrrs_info(YEAR)
info = ()
for meet_type in MEET_TYPE_LIST:
    girls_df = df_info[(df_info['gender'] == "Girls") & (df_info['meet_type'] == meet_type)]
    boys_df = df_info[(df_info['gender'] == "Boys") & (df_info['meet_type'] == meet_type)]

    girls_link_id = girls_df['link_id'].iloc[0] if not girls_df.empty else 0
    boys_link_id = boys_df['link_id'].iloc[0] if not boys_df.empty else 0

    girls_increment = girls_df['increment'].iloc[0] if not girls_df.empty else 1
    boys_increment = boys_df['increment'].iloc[0] if not boys_df.empty else 1

    # 32 sectionals, 8 regionals, a single meet for anything else
    num = 32 if meet_type == "Sectional" else 8 if meet_type == "Regional" else 1

    info += ((meet_type, girls_link_id, boys_link_id, girls_increment, boys_increment, num),)

# One standardized 16-item row per scraped result; turned into the results
# CSV after the loop.
master_result_list = []

print("Starting meet processing.")
for tourny in info:
    tourny_type, girls_link_id, boys_link_id, girls_increment, boys_increment, num_meets = tourny
    # Result-page id of meet 1 for the configured gender and the step to the
    # next meet's id.
    # NOTE(review): `id += id_increment` assumes the link id read from the DB
    # is numeric - confirm the column type. `id` also shadows the builtin.
    id = girls_link_id if GENDER == "Girls" else boys_link_id
    id_increment = girls_increment if GENDER == "Girls" else boys_increment

    for i in range(num_meets):
        # Meets that are not selected are skipped, but the id still advances
        # so later meets keep their correct page id.
        if (tourny_type == "Sectional" and (i + 1) not in SEC_TO_PROCESS) or \
           (tourny_type == "Regional" and (i + 1) not in REG_TO_PROCESS):
            id += id_increment
            continue
        else:
            print(f"{GENDER} {tourny_type} {str(i + 1)}")

        base_url = f"https://www.tfrrs.org/results/{id}"
        id += id_increment

        base_url_response = safe_get(base_url)
        if base_url_response is None:
            logging.warning(f"Failed to fetch base URL {base_url}")
            continue

        base_soup = BeautifulSoup(base_url_response.text, 'lxml')

        for event_info in events:
            event_gender, event_type, event_name = event_info

            # The event page is found through the link whose text equals the
            # event name.
            link_tag = base_soup.find("a", string=event_name)
            if link_tag is None:
                logging.warning(f"Event '{event_name}' not found on page {base_url}")
                continue

            event_url = link_tag.get("href")
            if not event_url:
                logging.warning(f"No href for event {event_name} on {base_url}")
                continue

            # Site-relative links are made absolute.
            if event_url.startswith("/"):
                event_url = "https://www.tfrrs.org" + event_url

            response = safe_get(event_url)
            if response is None:
                logging.warning(f"Failed to fetch event URL {event_url}")
                continue

            event_soup = BeautifulSoup(response.text, 'lxml')

            # Collect the round_N_M classes that a <style> rule sets to
            # display:none; cells carrying one of them are dropped below.
            all_styles = event_soup.find_all("style")
            ignore_classes = set()
            for style in all_styles:
                hidden = re.findall(r'\.(round_\d+_\d+)\s*{[^}]*display\s*:\s*none', style.text)
                ignore_classes.update(hidden)

            # Meet host: text of the second "panel-heading-normal-text" div,
            # cut at the first "-" when one is present.
            meet_info = event_soup.find_all("div", {"class": "panel-heading-normal-text"})
            meet_host = meet_info[1].text.strip() if len(meet_info) > 1 else ""
            meet_host = meet_host.split("-")[0].strip() if "-" in meet_host else meet_host

            # Results table: prelims use the second <tbody> when the page has
            # more than one, everything else uses the first.
            tbodies = event_soup.find_all('tbody')
            if event_type == "Prelim":
                results = tbodies[1] if len(tbodies) > 1 else (tbodies[0] if tbodies else None)
            else:
                results = tbodies[0] if tbodies else None

            if results is None:
                logging.warning(f"No results table found for {event_name} at {event_url}")
                response.close()
                continue

            rows = results.find_all('tr')

            for r in rows:
                cells = r.find_all('td')
                if len(cells) <= 2:
                    continue

                # Text of every cell that is not hidden through ignore_classes.
                visible_texts = [c.text.strip() for c in cells if not any(cls in ignore_classes for cls in c.get('class', []))]

                # Indices 0-5 are the fixed meet/event fields; the visible
                # cell texts start at index 6.
                result_list = [
                    meet_host, tourny_type, i + 1, event_name, event_gender, event_type
                ] + visible_texts

                if event_type == "Relay":
                    # Relay members are resolved later, during the DB load.
                    first, last = "", ""
                else:
                    # Individual events: index 7 is used as the athlete name
                    # and index 9 as the team (they end up in the 'Name' and
                    # 'Team' CSV columns below).
                    try:
                        first, last, grad_year = get_name(result_list[7], result_list[9], result_list[4], YEAR, suffix=tfrrs_url_suffix)
                    except Exception as e:
                        logging.warning(f"Error getting name for row: {e}")
                        first, last, grad_year = "", "", None

                # Standardize into the CSV column order:
                # Meet Host, Meet Type, Meet #, Event, Gender, Type, Place,
                # Team, Name, Year, Result, Score, First, Last, Result2, GradYear.
                # Rows that are too short raise here and are skipped.
                try:
                    if event_type == "Relay":
                        # Relay rows have no Year and no GradYear ('-').
                        result2 = convert.time_to_seconds(result_list[9])
                        standardized_result_list = [*result_list[:9], '-', result_list[9], result_list[10], first, last, result2, '-']
                    elif event_type == "Prelim":
                        # Prelim rows carry no Score ('-').
                        result2 = convert.time_to_seconds(result_list[10])
                        standardized_result_list = [
                            result_list[0], result_list[1], result_list[2], result_list[3],
                            result_list[4], result_list[5], result_list[6], result_list[9],
                            result_list[7], result_list[8], result_list[10], '-', first, last, result2, grad_year
                        ]
                    elif event_type == "Field":
                        # Field marks are converted to inches; Score is taken
                        # from index 12 rather than 11.
                        result2 = convert.distance_to_inches(result_list[10])
                        standardized_result_list = [
                            result_list[0], result_list[1], result_list[2], result_list[3],
                            result_list[4], result_list[5], result_list[6], result_list[9],
                            result_list[7], result_list[8], result_list[10], result_list[12],
                            first, last, result2, grad_year
                        ]
                    else:
                        result2 = convert.time_to_seconds(result_list[10])
                        standardized_result_list = [
                            result_list[0], result_list[1], result_list[2], result_list[3],
                            result_list[4], result_list[5], result_list[6], result_list[9],
                            result_list[7], result_list[8], result_list[10], result_list[11],
                            first, last, result2, grad_year
                        ]
                except Exception as e:
                    logging.warning(f"Error standardizing result row: {e}")
                    continue

                # Collapse double spaces in the Place field.
                standardized_result_list[6] = standardized_result_list[6].replace("  ", " ")
                master_result_list.append(standardized_result_list)

            response.close()
        gc.collect()

# -------------------------------------------------------------
# CLEANUP SCRAPING
# -------------------------------------------------------------
# Scraping is finished; release the shared HTTP session.
session.close()

# -------------------------------------------------------------
# SAVE RESULTS TO CSV
# -------------------------------------------------------------
# Column order matches the standardized rows built in the scraping loop.
df = pd.DataFrame(master_result_list, columns=[
    'Meet Host', 'Meet Type', 'Meet #', 'Event', 'Gender', 'Type',
    'Place', 'Team', 'Name', 'Year', 'Result', 'Score', 'First',
    'Last', 'Result2', 'GradYear'
])

# File names use the single meet type, or "Mix" when several were scraped.
meetDesc = MEET_TYPE_LIST[0] if len(MEET_TYPE_LIST) == 1 else "Mix"
file = f"{YEAR} {GENDER} {meetDesc} Results.csv"
df.to_csv(file, index=False)

# Warnings collected so far; the file is rewritten after the DB load.
warningFile = f"{YEAR} {GENDER} {meetDesc} Warnings.csv"
warningDF.to_csv(warningFile, index=False)

end_time = datetime.now()
print(f"End time:   {end_time.strftime('%H:%M:%S')}")
elapsed = end_time - start_time
print(f"Elapsed time: {elapsed}")
print(f"{file} and {warningFile} created.")
print("Web scrapping completed.")


Web scrapping started...one moment please.
Start time: 20:13:57
Starting meet processing.
Girls State 1
End time:   20:18:03
Elapsed time: 0:04:05.808249
Web scrapping completed.


In [118]:

# -------------------------------------------------------------
# DB PROCESSING - load CSV into DB
# -------------------------------------------------------------
# This cell relies on names created by the scraping cell: db, file,
# warningFile, warningDF, YEAR, tfrrs_url_suffix, get_name, process_athlete,
# grade_normalization and log_warning.
print("Loading data into DB...please wait.")

start_time = datetime.now()
print(f"Start time: {start_time.strftime('%H:%M:%S')}")

# Read CSV into DataFrame. Only "", "NA" and "N/A" are treated as missing
# (pandas' default NA strings are switched off).
df = pd.read_csv(file, na_values=["", "NA", "N/A"], keep_default_na=False)

# load meets in db: one row per distinct (host, type, number, gender)
print("Inserting meets.")
meet_df = df[['Meet Host', 'Meet Type', 'Meet #', 'Gender']]
meet_df = meet_df.drop_duplicates()

for index, row in meet_df.iterrows():
    meet_id = db.get_meet_id(row['Meet Type'], row['Meet #'], YEAR, row['Gender'])
    if meet_id is None:
        db.insert_meet(row['Meet Host'], row['Meet Type'], row['Meet #'], YEAR, row['Gender'], commit=False)

# load athletes (every row that is not a relay)
print("Inserting individual athletes and results.")
athlete_df = df[df['Type'] != 'Relay']

# First pass: ensure athletes exist
for index, row in athlete_df.iterrows():
    team = row["Team"]
    school_id = db.get_school_id(team)
    first = str(row["First"])
    last = str(row["Last"])
    gender = row["Gender"]
    grad_year = row["GradYear"]
    process_athlete(first, last, school_id, gender, grad_year)

# Second pass: insert athlete results
# NOTE(review): unlike the first pass, First/Last are not wrapped in str()
# here. A missing value (NaN) would become "nan" above but would reach
# process_athlete() as a float here - confirm which handling is intended.
for index, row in athlete_df.iterrows():
    school_id = db.get_school_id(row["Team"])
    first = row["First"]
    last = row["Last"]
    grad_year = row["GradYear"]
    
    athlete_id = process_athlete(first, last, school_id, row["Gender"], grad_year)

    meet_type = row["Meet Type"]
    meet_num = row["Meet #"]
    gender = row["Gender"]
    meet_id = db.get_meet_id(meet_type, int(meet_num), int(YEAR), gender)

    event = row["Event"]
    grade = row["Year"]

    grade = grade_normalization(grade)

    result = row["Result"]
    result2 = row["Result2"]
    place = row["Place"]

    # Every type other than "Prelim" is stored as a "Final".
    event_type = "Prelim" if row["Type"] == "Prelim" else "Final"

    # Insert only when this athlete/meet/event/type result is not stored yet.
    athlete_result = db.get_athlete_result(athlete_id, meet_id, event, event_type)
    if athlete_result is None:
        db.insert_athlete_result(athlete_id, meet_id, event, event_type, grade, result, result2, place, commit=False)

# Load relay results
print("Inserting relay athletes and results.")
relay_df = df[df['Type'] == 'Relay']

for index, row in relay_df.iterrows():
    school_id = db.get_school_id(row["Team"])
    meet_type = row["Meet Type"]
    meet_num = row["Meet #"]
    gender = row["Gender"]
    meet_id = db.get_meet_id(meet_type, int(meet_num), int(YEAR), gender)

    event = row["Event"]
    result = row["Result"]
    result2 = row["Result2"]
    place = row["Place"]
    # Comma-separated relay member names as scraped (NaN when missing).
    athlete_names = row["Name"]

    relay_result = db.get_relay_result(school_id, meet_id, event)

    # Relays already in the database are left untouched, members included.
    if relay_result is None:
        relay_id = db.insert_relay_result(school_id, meet_id, event, result, result2, place, athlete_names, commit=False)

        if pd.isna(athlete_names):
            log_warning(warning = W_RELAY_ATHLETES_NOT_FOUND, id = relay_id, desc = row["Team"] + " " + event)
            
            # warning_msg = f"Bad relay names {row['Team']} {event} {athlete_names}"
            # logging.warning(warning_msg)
            # warningDF.loc[len(warningDF)] = {"full_name": row["Team"] + " " + event, "team_url": athlete_names}
            continue

        # Resolve each relay member against the team page, then link the
        # athlete to the relay.
        for full_name in athlete_names.split(","):
            full_name = full_name.strip()
            if not full_name:
                continue
            
            first, last, grad_year = get_name(full_name, row["Team"], gender, YEAR, suffix=tfrrs_url_suffix)
            athlete_id = process_athlete(first, last, school_id, gender, grad_year)

            # An IntegrityError on insert is reported as a duplicate athlete
            # on this relay and otherwise ignored.
            try:
                db.insert_relay_athlete(relay_id, athlete_id, commit=False)
            except IntegrityError:
                log_warning(warning = W_DUPLICATE_ATHLETE, id = (first + " " + last), desc = row["Team"] + " " + event)

# Save warnings (again) after DB processing
warningDF.to_csv(warningFile, index=False)

end_time = datetime.now()
print(f"End time:   {end_time.strftime('%H:%M:%S')}")
elapsed = end_time - start_time
print(f"Elapsed time: {elapsed}")

# Every insert above used commit=False; this is the only commit in the cell.
# NOTE(review): presumably an exception earlier in the cell leaves nothing
# committed - confirm against Database.do_commit().
db.do_commit()
print(f"{warningFile} updated.")
print(f"Successfully completed and DB updated!!")

Loading data into DB...please wait.
Start time: 20:31:45
Inserting meets.
Inserting individual athletes and results.
Inserting relay athletes and results.
End time:   20:31:58
Elapsed time: 0:00:12.711962
Successfully completed and DB updated!!
