In [94]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import time
import logging
from typing import Optional, Dict
from util.db_util import Database  # your existing DB wrapper

# Configure logging once for the whole script/notebook cell: INFO and above,
# each record rendered as "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Named logger used by the TrackMeetVerifier methods below.
logger = logging.getLogger("TrackVerifier")


class TrackMeetVerifier:
    def __init__(self, db_path="db/Track.db"):
        """Open the results database and prepare scoring tables and HTTP state.

        Args:
            db_path: Path handed to the project's ``Database`` wrapper.
        """
        self.db = Database(db_path)

        # Points awarded per scoring place (place -> points). The state table
        # is used for "State" meets, the other one for every other meet type.
        self.state_values = dict(zip(range(1, 10), (10, 8, 7, 6, 5, 4, 3, 2, 1)))
        self.sec_reg_values = dict(zip(range(1, 9), (10, 8, 6, 5, 4, 3, 2, 1)))

        # One session for all requests so cookies carry over between calls.
        self._session = requests.Session()

        # Pool of browser User-Agent strings; one is picked at random per request.
        firefox_windows = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
        safari_mac = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
        chrome_windows = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        self._user_agents = [firefox_windows, safari_mac, chrome_windows]

    # ---------------------------
    # Score calculation
    # ---------------------------

    def get_points(self, df: pd.DataFrame, scores: Dict[int, float]) -> None:
        if df is None or df.empty:
            return
    
        for event in df["event"].unique():
            event_df = df[df["event"] == event].copy()
            event_df = event_df.sort_values("place").reset_index(drop=True)
    
            scoring_place = 1  # “ghost” scoring place (1,2,3,...)
            i = 0
            n = len(event_df)
    
            while i < n:
                # Determine tie group by actual place
                actual_place = event_df.iloc[i]["place"]
                tie_group = event_df[event_df["place"] == actual_place]
                tie_count = len(tie_group)
    
                # Collect scoring places for this tie
                scoring_places = []
                for k in range(tie_count):
                    if (scoring_place + k) in self.values:
                        scoring_places.append(scoring_place + k)
    
                # Compute average points for the tie group
                pts = 0
                if scoring_places:
                    pts = sum(self.values[p] for p in scoring_places) / tie_count
    
                # Assign points to all athletes in the tie group
                for idx, row in tie_group.iterrows():
                    school = row["school_id"]
                    scores[school] = scores.get(school, 0) + pts
    
                # Advance scoring_place by number of athletes in tie group
                scoring_place += tie_count
                i += tie_count


    def calculate_meet_scores(self, meet_type, meet_number, year, gender, low=0, high=10000):
        self.values = self.sec_reg_values if meet_type != "State" else self.state_values
        top_num = len(self.values)
        meet_id = self.db.get_meet_id(meet_type, meet_number, year, gender)
        individual_df = self.db.get_top_results_from_meet(top_num, meet_id, "Final", low, high)
        relay_df = self.db.get_top_relays_from_meet(top_num, meet_id, low, high)
        scores = {}
        self.get_points(individual_df, scores)
        self.get_points(relay_df, scores)
        df = pd.DataFrame(list(scores.items()), columns=["school_id", "Score"])
        df["Score"] = df["Score"].round(2)
        df = df.sort_values("Score", ascending=False).reset_index(drop=True)
        return df["Score"]  # Only return the scores for comparison

    def mapping(self, meet_type, gender, year):
        key = str(year) + gender[0] + meet_type[0:2]

        meet_ids = {
            # 2023
            "2023GRe": (81231, 2),  # Girls Regionals
            "2023BRe": (81230, 2),  # Boys Regionals
            "2023GSe": (80594, 2),  # Girls Sectionals
            "2023BSe": (80593, 2),  # Boys Sectionals
            "2023GSt": (81261, 2),  # Girls State
            "2023BSt": (81260, 2),  # Boys State
        
            # 2024
            "2024GRe": (86949, 1),
            "2024BRe": (86958, 1),
            "2024GSe": (86655, 2),
            "2024BSe": (86654, 2),
            "2024GSt": (86967, 2),  
            "2024BSt": (86966, 2), 
        
            # 2025
            "2025GRe": (92454, 1),
            "2025BRe": (92462, 1),
            "2025GSe": (92371, 2),
            "2025BSe": (92370, 2),
            "2025GSt": (92612, 2),  
            "2025BSt": (92613, 2),             
        }

        return meet_ids.get(key)

    # ---------------------------
    # URL builder
    # ---------------------------
    def get_tfrrs_url(self, meet_type, meet_number, gender, year="2025"):
        
        gender_code = "Girls" if gender == "Girls" else "Boys"
        base_url = "https://indiana.tfrrs.org/results/"
        id, increment = self.mapping(meet_type, gender, year)
        meet_id = id + (int(meet_number) - 1) * increment
        return f"{base_url}{meet_id}"
        #return f"{base_url}{meet_id}/IHSAA_{meet_type}_{meet_number}_{gender_code}"

        return ""

    # ---------------------------
    # Fetch TFRRS scores (403-safe)
    # ---------------------------
    def fetch_tfrrs_scores(self, meet_type, meet_number, gender, year="2025") -> Optional[pd.Series]:
        """Download a meet's TFRRS results page and return its team scores.

        The homepage is requested first (best effort) so the session picks up
        cookies. The results page is then requested up to five times, with a
        random pause before every attempt and a further pause after a 403/429
        response or any other failure.

        Args:
            meet_type: Meet type name, forwarded to ``get_tfrrs_url``.
            meet_number: Meet number, forwarded to ``get_tfrrs_url``.
            gender: Gender label, forwarded to ``get_tfrrs_url``.
            year: Season, forwarded to ``get_tfrrs_url``.

        Returns:
            A ``pd.Series`` of the scores in page order, or ``None`` when no
            URL could be built, every attempt failed, the page has no
            RANK/TEAM/SCORE table, or that table has no parsable score.
        """
        url = self.get_tfrrs_url(meet_type, meet_number, gender, year)
        if not url:
            return None

        # Best-effort warm-up. Only request failures are ignored, so Ctrl-C
        # and SystemExit are no longer swallowed here.
        try:
            self._session.get(
                "https://indiana.tfrrs.org",
                headers={"User-Agent": random.choice(self._user_agents)},
                timeout=10,
            )
        except requests.RequestException as exc:
            logger.debug("Homepage warm-up request failed (continuing): %s", exc)

        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            request_headers = {
                "User-Agent": random.choice(self._user_agents),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": "https://www.google.com/",
                "Connection": "keep-alive",
            }
            time.sleep(random.uniform(2.0, 6.0))

            try:
                resp = self._session.get(url, headers=request_headers, timeout=12)
                if resp.status_code in (403, 429):
                    logger.warning("Blocked (403/429) on attempt %d for %s", attempt, url)
                    time.sleep(random.uniform(3.0, 7.0))
                    continue

                resp.raise_for_status()
                soup = BeautifulSoup(resp.content, "lxml")
                scores = self._parse_team_scores(soup)
            except Exception as e:
                # Retry boundary: any failure of this attempt is logged and
                # the next attempt is made after a pause.
                logger.warning("Fetch attempt %d failed: %s", attempt, e)
                time.sleep(random.uniform(2.0, 5.0))
                continue

            if scores is None:
                logger.info("No score table found for %s", url)
                return None
            return pd.Series(scores) if scores else None

        logger.warning("Failed to fetch TFRRS scores after %d attempts for %s", max_attempts, url)
        return None

    @staticmethod
    def _parse_team_scores(soup):
        """Extract the SCORE column of the RANK/TEAM/SCORE table from a page.

        Returns:
            ``None`` when no table whose first three header cells are
            RANK, TEAM, SCORE exists; otherwise a list of floats (possibly
            empty) taken from the third cell of each body row.
        """
        score_body = None
        for tbody in soup.find_all("tbody"):
            table = tbody.find_parent("table")
            if table is None:
                # Stray <tbody> outside any <table>: nothing to inspect.
                continue
            column_names = [th.get_text(strip=True) for th in table.find_all("th")]
            if column_names[:3] == ["RANK", "TEAM", "SCORE"]:
                score_body = tbody
                break

        if score_body is None:
            return None

        scores = []
        for row in score_body.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue
            try:
                scores.append(float(cells[2].text.strip().replace(",", "")))
            except ValueError:
                # Non-numeric score cell: skip the row.
                continue
        return scores

    # ---------------------------
    # Compare scores (ignore team names)
    # ---------------------------
    def compare_scores(self, db_scores: pd.Series, tfrrs_scores: pd.Series):
        if db_scores is None or db_scores.empty or tfrrs_scores is None or tfrrs_scores.empty:
            return None
        # align by index
        min_len = min(len(db_scores), len(tfrrs_scores))
        comparison = pd.DataFrame({
            "DB_Score": db_scores.iloc[:min_len].values,
            "TFRRS_Score": tfrrs_scores.iloc[:min_len].values,
        })
        comparison["Difference"] = comparison["DB_Score"] - comparison["TFRRS_Score"]
        comparison["Match"] = comparison["Difference"].abs() < 0.01
        return comparison

    # ---------------------------
    # Verify a range of meets
    # ---------------------------
    def verify_meet_range(self, meet_type, year, gender, start, end):
        """Verify database-computed team scores against TFRRS for several meets.

        For every meet number from ``start`` to ``end`` (inclusive) the scores
        are computed from the database, fetched from TFRRS and compared. Meets
        with at least one non-matching row are printed.

        Args:
            meet_type: Meet type name passed to the scoring and fetch methods.
            year: Season passed to the scoring and fetch methods.
            gender: Gender label passed to the scoring and fetch methods.
            start: First meet number to check.
            end: Last meet number to check (inclusive).

        Returns:
            Dict keyed by meet number; each value is a dict with the keys
            ``"DB_Scores"``, ``"TFRRS_Scores"``, ``"Comparison"`` (``None``
            when no comparison could be made) and ``"URL"``.
        """
        results = {}
        for meet_number in range(start, end + 1):
            db_scores = self.calculate_meet_scores(meet_type, meet_number, year, gender)
            tfrrs_scores = self.fetch_tfrrs_scores(meet_type, meet_number, gender, year)
            comparison = self.compare_scores(db_scores, tfrrs_scores)
            url = self.get_tfrrs_url(meet_type, meet_number, gender, year)
            results[meet_number] = {
                "DB_Scores": db_scores,
                "TFRRS_Scores": tfrrs_scores,
                "Comparison": comparison,
                "URL": url,
            }
            logger.info("Processed %s %d: TFRRS fetched=%s", meet_type, meet_number, tfrrs_scores is not None)

            if comparison is None:
                # A failed fetch or an empty score list leaves nothing to
                # compare; indexing None here used to abort the whole range.
                logger.warning("No comparison available for %s %d (%s)", meet_type, meet_number, url)
                continue

            if not comparison["Match"].all():
                print("Mismatch — Meet", url)
                print(comparison)

        return results


# ---------------------------
# Example usage
# ---------------------------
if __name__ == "__main__":
    verifier = TrackMeetVerifier()

    # Seasons and genders to verify; add entries to widen the run.
    for year in (2025,):
        for gender in ("Boys",):
            print(f"Processing: {year} {gender}")

            # Sectionals: bounds kept for the disabled call below.
            sec_start, sec_end = 1, 32
            #sectionals_results = verifier.verify_meet_range("Sectional", year, gender, sec_start, sec_end)

            # Regionals: bounds kept for the disabled call below.
            reg_start, reg_end = 1, 8
            #regionals_results = verifier.verify_meet_range("Regional", year, gender, reg_start, reg_end)

            # State: a single meet.
            state_results = verifier.verify_meet_range("State", year, gender, 1, 1)

    print("done")


Processing: 2025 Boys


2025-12-14 16:02:16,224 [INFO] Processed State 1: TFRRS fetched=True


done
