In [1]:
!pip install xgboost



In [23]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# XGBoost
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False
    print("xgboost error")


# 1. Helper methods
def normalize_squad_column(df):
    """
    Return a copy of ``df`` whose team-name column is called "Squad".

    If the frame has no "Squad" column but has a "Team" column, "Team" is
    renamed to "Squad".  The "Squad" values are cast to ``str`` and stripped
    of surrounding whitespace.  The frame passed in is never modified.

    Args:
        df: DataFrame with a "Squad" or "Team" column.

    Returns:
        A new DataFrame with a cleaned "Squad" column.

    Raises:
        ValueError: if the frame has neither a "Squad" nor a "Team" column.
    """
    if "Squad" in df.columns:
        # Copy first: the assignment below must not leak into the caller's
        # frame (the rename branch already produces a new object).
        normalized = df.copy()
    elif "Team" in df.columns:
        normalized = df.rename(columns={"Team": "Squad"})
    else:
        raise ValueError(f"No Squad/Team column found: {df.columns.tolist()}")
    normalized["Squad"] = normalized["Squad"].astype(str).str.strip()
    return normalized


def compute_form_score(last5):
    """
    Convert a recent-results string into a numeric form score.

    Every "W" adds 1 and every "L" subtracts 1; "D" and any other character
    (separators, unknown symbols) contribute 0.  Non-string input, such as
    NaN for a missing value, scores 0.0.

    Args:
        last5: string of result letters, or any non-string placeholder.

    Returns:
        float: the summed score.  Always a float, so the resulting column
        has one dtype whether or not values are missing.
    """
    if not isinstance(last5, str):
        return 0.0
    mapping = {"W": 1, "D": 0, "L": -1}
    return float(sum(mapping.get(ch, 0) for ch in last5.strip()))


# ==========================================================
# 2. Match-level dataset for a single season
# ==========================================================
def build_match_dataset_for_season(season_dir: Path, season_weight: float) -> pd.DataFrame:
    """
    Build the match-level dataset for a single season.

    Args:
        season_dir: season folder, e.g. Path('CSV_files/Season_2025-2026').
            It must contain a file matching ``*squad_team_merged*.csv``
            (team stats) and a file matching ``*fixtures*.csv``.
        season_weight: value written to the ``season_weight`` column of
            every returned row (newer seasons are given a larger weight by
            the caller).

    Returns:
        One row per fixture with the fixture columns, ``goal_diff``,
        ``result`` ("H", "D" or "A"; NaN when a score is missing),
        ``season`` (the folder name), the team stats of both sides prefixed
        with ``home_`` / ``away_``, and ``season_weight``.

    Raises:
        FileNotFoundError: if either CSV file is missing from ``season_dir``.
    """
    season_dir = Path(season_dir)
    season_label = season_dir.name

    # --- team stats ---
    stats_files = list(season_dir.glob("*squad_team_merged*.csv"))
    if not stats_files:
        raise FileNotFoundError(f"No *squad_team_merged*.csv in {season_dir}")
    stats = pd.read_csv(stats_files[0])
    stats = normalize_squad_column(stats)

    # add the form score
    if "Last 5" in stats.columns:
        stats["form_score"] = stats["Last 5"].apply(compute_form_score)
    else:
        stats["form_score"] = 0.0

    # --- fixtures ---
    fix_files = list(season_dir.glob("*fixtures*.csv"))
    if not fix_files:
        raise FileNotFoundError(f"No *fixtures*.csv in {season_dir}")
    fixtures = pd.read_csv(fix_files[0])

    # Unparseable or empty scores become NaN instead of raising.
    fixtures["Home_Goals"] = pd.to_numeric(fixtures["Home_Goals"], errors="coerce")
    fixtures["Away_Goals"] = pd.to_numeric(fixtures["Away_Goals"], errors="coerce")

    goal_diff = fixtures["Home_Goals"] - fixtures["Away_Goals"]
    fixtures["goal_diff"] = goal_diff
    outcome = pd.Series(
        np.where(goal_diff > 0, "H", np.where(goal_diff < 0, "A", "D")),
        index=fixtures.index,
    )
    # A missing score makes both comparisons False, which would label the
    # fixture as a draw.  Blank those rows so callers can drop them.
    fixtures["result"] = outcome.where(goal_diff.notna())
    fixtures["season"] = season_label

    # --- merge home stats ---
    home_stats = stats.add_prefix("home_")
    merged = fixtures.merge(
        home_stats, left_on="Home_Team", right_on="home_Squad", how="left"
    )

    # --- merge away stats ---
    # NOTE(review): the away key is spelled "Away_team" (lower-case t) while
    # the home key is "Home_Team"; presumably this mirrors the fixtures CSV
    # header — confirm against the data files.
    away_stats = stats.add_prefix("away_")
    merged = merged.merge(
        away_stats, left_on="Away_team", right_on="away_Squad",
        how="left", suffixes=("", "_dupAway")
    )

    # The join keys duplicate Home_Team / Away_team, so drop them.
    merged = merged.drop(columns=["home_Squad", "away_Squad"], errors="ignore")

    # season weight (a newer season gets a larger weight)
    merged["season_weight"] = season_weight

    return merged


# ==========================================================
# 3. Combining all seasons
# ==========================================================
def build_full_dataset(
    base_dir: str = "CSV_files",
    min_weight: float = 0.6,
    max_weight: float = 1.0,
) -> pd.DataFrame:
    """
    Build and concatenate the match-level datasets of every season folder.

    Season folders are the ``Season_*`` directories directly under
    ``base_dir``, processed in sorted name order.  Each season receives a
    weight that grows linearly from ``min_weight`` (first folder) to
    ``max_weight`` (last folder); a single season gets ``max_weight``.

    Args:
        base_dir: folder that contains the ``Season_*`` directories.
        min_weight: weight of the first season in sorted order.
        max_weight: weight of the last season in sorted order.

    Returns:
        All seasons stacked into one DataFrame with a fresh integer index.

    Raises:
        ValueError: if ``base_dir`` contains no ``Season_*`` directory.
    """
    base = Path(base_dir)
    # Keep directories only: a stray file such as "Season_notes.txt" is not
    # a season and would fail later with a confusing missing-CSV error.
    season_dirs = sorted(p for p in base.glob("Season_*") if p.is_dir())
    if not season_dirs:
        # Report the folder that was actually searched, not a fixed name.
        raise ValueError(f"{base} kaustast ei leitud ühtegi Season_* kausta.")

    all_dfs = []
    n = len(season_dirs)

    for i, season_dir in enumerate(season_dirs):
        # linear weight in the range min_weight ... max_weight
        if n > 1:
            weight = min_weight + (max_weight - min_weight) * (i / (n - 1))
        else:
            weight = max_weight
        print(f"{season_dir.name} -> weight = {weight:.2f}")
        df_season = build_match_dataset_for_season(season_dir, season_weight=weight)
        all_dfs.append(df_season)

    full = pd.concat(all_dfs, ignore_index=True)
    return full


# ==========================================================
# 4. Data preparation + feature engineering
# ==========================================================
full_df = build_full_dataset("CSV_files")
# Keep played fixtures only.  Checking the goal columns as well as "result"
# matters because a label can be present even when the score is missing.
full_df = full_df.dropna(subset=["result", "Home_Goals", "Away_Goals"])

# NOTE(review): the team stats are one row per team per season, merged onto
# every fixture of that season, so the features presumably already reflect
# the matches being predicted — confirm this leakage is acceptable.

# Build the diff_ features: home_* minus away_* for every stat that both
# sides have as a real numeric column.
diff_columns = {}
for col in list(full_df.columns):
    if not col.startswith("home_"):
        continue
    base = col[5:]
    away_col = "away_" + base
    if away_col not in full_df.columns:
        continue
    both_numeric = all(
        pd.api.types.is_numeric_dtype(full_df[c])
        and not pd.api.types.is_bool_dtype(full_df[c])
        for c in (col, away_col)
    )
    # Skip strings (object or dedicated string dtype) and booleans, which
    # cannot be subtracted meaningfully.
    if both_numeric:
        diff_columns[f"diff_{base}"] = full_df[col] - full_df[away_col]

# Attach all new columns in one concat; inserting them one at a time
# fragments the frame.  Same-named columns are replaced, not duplicated.
full_df = pd.concat(
    [
        full_df.drop(columns=[c for c in diff_columns if c in full_df.columns]),
        pd.DataFrame(diff_columns, index=full_df.index),
    ],
    axis=1,
)

# features = every numeric diff_ column
feature_cols = [
    c for c in full_df.columns
    if c.startswith("diff_")
    and pd.api.types.is_numeric_dtype(full_df[c])
    and not pd.api.types.is_bool_dtype(full_df[c])
]
if not feature_cols:
    raise ValueError("No numeric home_/away_ column pairs found to build diff_ features.")

X = full_df[feature_cols]
y = full_df["result"].astype(str)
w = full_df["season_weight"]

# discard rows where any feature is NaN
mask = X.notna().all(axis=1)
X = X[mask]
y = y[mask]
w = w[mask]

# Stratified 80/20 split; the season weights travel with their rows.
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.2, random_state=42, stratify=y
)


# ==========================================================
# 5. Train three models: logreg, rf, xgb
# ==========================================================
# Registry of fitted models, keyed by the short name used at prediction time.
MODELS = {}

# --- Logistic Regression (features are standardised first) ---
logreg_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42),
)
# make_pipeline names each step after its lower-cased class, hence the
# "logisticregression__" prefix that routes the weights to the classifier.
logreg_model.fit(X_train, y_train, logisticregression__sample_weight=w_train)
print("LogReg accuracy:", logreg_model.score(X_test, y_test))
MODELS["logreg"] = logreg_model

# --- RandomForest ---
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=None,
    n_estimators=500,
)
rf_model.fit(X_train, y_train, sample_weight=w_train)
print("RandomForest accuracy:", rf_model.score(X_test, y_test))
MODELS["rf"] = rf_model

# --- XGBoost (only when the import succeeded) ---
# Defaults for the case where xgboost is unavailable.
xgb_model = None
label_to_int = None
int_to_label = None

if HAS_XGB:
    # map the string labels to integer codes, and keep the reverse mapping
    label_to_int = {"A": 0, "D": 1, "H": 2}
    int_to_label = dict(zip(label_to_int.values(), label_to_int.keys()))

    y_train_xgb = y_train.map(label_to_int).to_numpy()
    y_test_xgb = y_test.map(label_to_int).to_numpy()

    xgb_model = XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    xgb_model.fit(X_train, y_train_xgb, sample_weight=w_train)

    # Accuracy is computed by hand because the test labels are integer codes.
    y_pred_xgb = xgb_model.predict(X_test)
    xgb_acc = np.mean(y_pred_xgb == y_test_xgb)
    print("XGBoost accuracy:", xgb_acc)

    MODELS["xgb"] = xgb_model



# ==========================================================
# 6. Prediction (without specifying a season)
# ==========================================================
def load_latest_team_stats(base_dir: str = "CSV_files") -> pd.DataFrame:
    """
    Load the team stats of the newest season.

    The newest season is the last ``Season_*`` directory under ``base_dir``
    in sorted name order.  Its ``*squad_team_merged*.csv`` file is read, the
    team column is normalised to "Squad", and a ``form_score`` column is
    added (computed from "Last 5" when that column exists, otherwise 0.0).

    Args:
        base_dir: folder that contains the ``Season_*`` directories.

    Returns:
        The stats DataFrame of the newest season.

    Raises:
        ValueError: if ``base_dir`` contains no ``Season_*`` directory.
        FileNotFoundError: if the newest season has no stats CSV.
    """
    base = Path(base_dir)
    # Directories only, so a stray "Season_*" file cannot become "latest".
    season_dirs = sorted(p for p in base.glob("Season_*") if p.is_dir())
    if not season_dirs:
        raise ValueError(f"No Season_* directories found in {base}")
    latest = season_dirs[-1]  # the newest season

    stats_files = list(latest.glob("*squad_team_merged*.csv"))
    if not stats_files:
        raise FileNotFoundError(f"No *squad_team_merged*.csv in {latest}")

    stats = pd.read_csv(stats_files[0])
    stats = normalize_squad_column(stats)
    if "Last 5" in stats.columns:
        stats["form_score"] = stats["Last 5"].apply(compute_form_score)
    else:
        stats["form_score"] = 0.0
    return stats


# Team stats of the newest season, loaded once when this cell runs.
# build_feature_row_for_prediction reads this module-level frame.
LATEST_STATS = load_latest_team_stats()


def build_feature_row_for_prediction(home_team: str, away_team: str) -> pd.DataFrame:
    """
    Build the single feature row used to predict one fixture.

    The row has exactly the ``diff_`` columns listed in the module-level
    ``feature_cols`` (the columns the models were trained on).  Each value
    is the home team's stat minus the away team's stat, both taken from
    ``LATEST_STATS``.  A feature is NaN when the stat column is absent or
    when either value is not numeric.

    Team names are matched after stripping surrounding whitespace, the same
    way the "Squad" column is cleaned when the stats are loaded.

    Args:
        home_team: name of the home side as it appears in "Squad".
        away_team: name of the away side as it appears in "Squad".

    Returns:
        A one-row DataFrame with columns ``feature_cols``.

    Raises:
        ValueError: if either team is not present in ``LATEST_STATS``.
    """
    def _team_rows(team, side):
        # One lookup routine for both sides; the message keeps the name the
        # caller passed so it can be recognised.
        rows = LATEST_STATS[LATEST_STATS["Squad"] == str(team).strip()]
        if rows.empty:
            raise ValueError(f"{side} team '{team}' not found in latest stats.")
        return rows.reset_index(drop=True)

    def _is_numeric(value):
        # Judge the scalar itself, not its column: numpy and Python numbers
        # pass, strings do not.
        return pd.api.types.is_numeric_dtype(type(value))

    home = _team_rows(home_team, "Home")
    away = _team_rows(away_team, "Away")

    data = {}
    # feature_cols holds names such as: diff_GF, diff_GA, diff_xG, ...
    for col in feature_cols:
        base = col[len("diff_"):]  # 'diff_GF' -> 'GF'
        data[col] = np.nan
        if base not in home.columns or base not in away.columns:
            continue
        hv = home[base].iloc[0]
        av = away[base].iloc[0]
        if _is_numeric(hv) and _is_numeric(av):
            try:
                data[col] = float(hv) - float(av)
            except (TypeError, ValueError):
                # Numeric-looking value that cannot become a float: keep NaN.
                pass

    row = pd.DataFrame([data], columns=feature_cols)
    return row


def predict_match2(home_team: str, away_team: str, model_name):
    """
    Predict the home-win / draw / away-win probabilities of one fixture.

    Args:
        home_team: name of the home side.
        away_team: name of the away side.
        model_name: key of ``MODELS`` — 'rf', 'logreg' or 'xgb' (the last
            one only when xgboost is available).

    Returns:
        A dict with the two team names, the model name, and a "probs" dict
        holding "home_win", "draw" and "away_win"; an outcome the model does
        not know gets 0.0.

    Raises:
        ValueError: if ``model_name`` is not a key of ``MODELS``.
    """
    if model_name not in MODELS:
        raise ValueError(f"Unknown model '{model_name}'. Valid: {list(MODELS.keys())}")

    features = build_feature_row_for_prediction(home_team, away_team)
    estimator = MODELS[model_name]

    probabilities = estimator.predict_proba(features)[0]
    raw_classes = estimator.classes_

    # xgb was fitted on the integer codes [0, 1, 2], so translate them back
    # to 'A', 'D', 'H'; logreg and rf use the letters directly.
    labels = (
        [int_to_label[code] for code in raw_classes]
        if model_name == "xgb"
        else list(raw_classes)
    )
    by_outcome = dict(zip(labels, map(float, probabilities)))

    outcome_keys = (("home_win", "H"), ("draw", "D"), ("away_win", "A"))
    return {
        "home_team": home_team,
        "away_team": away_team,
        "model": model_name,
        "probs": {key: by_outcome.get(letter, 0.0) for key, letter in outcome_keys},
    }

Season_2020-2021 -> weight = 0.60
Season_2021-2022 -> weight = 0.68
Season_2022-2023 -> weight = 0.76
Season_2023-2024 -> weight = 0.84
Season_2024-2025 -> weight = 0.92
Season_2025-2026 -> weight = 1.00
LogReg accuracy: 0.5123762376237624
RandomForest accuracy: 0.5767326732673267
XGBoost accuracy: 0.5420792079207921


In [21]:
# V2 RANDOM FOREST: home-win / draw / away-win probabilities for
# Manchester City (home) vs Newcastle Utd (away) from the "rf" model.
predict_match2("Manchester City", "Newcastle Utd", model_name="rf")

{'home_team': 'Manchester City',
 'away_team': 'Newcastle Utd',
 'model': 'rf',
 'probs': {'home_win': 0.632, 'draw': 0.216, 'away_win': 0.152}}

In [22]:
# V2 LOGREG: home-win / draw / away-win probabilities for
# Manchester City (home) vs Newcastle Utd (away) from the "logreg" model.
predict_match2("Manchester City", "Newcastle Utd", model_name="logreg")

{'home_team': 'Manchester City',
 'away_team': 'Newcastle Utd',
 'model': 'logreg',
 'probs': {'home_win': 0.5216974558399203,
  'draw': 0.3266660768352799,
  'away_win': 0.1516364673247998}}

In [19]:
# V2 XGB: home-win / draw / away-win probabilities for
# Manchester City (home) vs Newcastle Utd (away) from the "xgb" model.
predict_match2("Manchester City", "Newcastle Utd", model_name="xgb")

{'home_team': 'Manchester City',
 'away_team': 'Newcastle Utd',
 'model': 'xgb',
 'probs': {'home_win': 0.8080324530601501,
  'draw': 0.14107152819633484,
  'away_win': 0.05089602991938591}}