# Feature Selection

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pickle
import warnings

# Ignore PerformanceWarning and UserWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

### Load Dataset

In [None]:
# Load dataset
player_match_df = pd.read_csv('datasets\player_match_df_clean.csv')

We can now look at the size of our dataset:

In [279]:
player_match_df.shape

(95207, 296)

We have a lot of features to work with here, so will we look to reduce these before building our model.

However, first we will look at removing a section of the most recent data to use as an evaluation dataset, so that we can hypothetically look at how profitable the model would have been if we had built it several years ago:

In [280]:
pct_23_24 = round((player_match_df['year'].isin([2023, 2024, 2025]).sum() / len(player_match_df)) * 100, 2)
print(f'Percentage of dataset where matches took place in 2023 or 2024: {pct_23_24}%')


Percentage of dataset where matches took place in 2023 or 2024: 12.24%


Since the 2023-2025 data accounts for around 12% of our data, this is a good chunk to evaluate our model with so we will remove this from the dataset now for later use:

In [None]:
# Extract 2024 data and save as file 
player_match_df_23_25 = player_match_df[player_match_df['year'].isin([2023,2024,2025])]
player_match_df_23_25.to_csv('datasets\player_match_df_23_25.csv', index=False)

# Remove 2024 data from original dataset
player_match_df = player_match_df[~(player_match_df['year'].isin([2023,2024,2025]))]

# Reset index
player_match_df = player_match_df.reset_index(drop=True)
player_match_df.to_csv('datasets\player_match_df_reduced.csv', index=False)

### Function to extract odds from dataset

Next we will extract the bookmaker odds from our dataset, as we don't want to use these for our model training but they will be of interest for our model evaluation.  

In [282]:
def extract_odds_df(df):
    """
    Extracts a simplified DataFrame containing only the 'odds' and 'won' columns.

    This function selects the 'odds' and 'won' columns from the input DataFrame and
    returns a new DataFrame containing just these two, aligned by index.

    Parameters:
        df (pandas.DataFrame): The original DataFrame, expected to contain 'odds' and 'won' columns.

    Returns:
        pandas.DataFrame: A new DataFrame with only 'odds' and 'won' columns.
    """
    # Extract columns
    odds_col = df.loc[:, 'odds']
    won_col = df.loc[:, 'won']

    # Create dataframe
    odds_win_df = pd.concat([odds_col, won_col], axis=1)

    return odds_win_df


In [283]:
# Run the function
odds_win_df = extract_odds_df(player_match_df)

We will use some of the most recent data as our test dataset to ensure there is no data leakage through the model being tested on matches that came before the matches in the training dataset, or through data in the training and test datasets existing for the same match since each match is split by player an opponent.

In [285]:
pct_23_24 = round((player_match_df['year'].isin([2020, 2021, 2022]).sum() / len(player_match_df)) * 100, 2)
print(f'Percentage of dataset where matches took place in 2023 or 2024: {pct_23_24}%')

Percentage of dataset where matches took place in 2023 or 2024: 14.73%


### Function to create train/test split

Since the data from 2020-2022 accounts for around 15% of our data, this will be a good proportion to use as our test dataset. Therefore we will build a function to split the dataset into a training and test set based on these years, and also make sure we also split our odds dataset on the same indexes:

In [286]:
def create_train_test_splits(df, odds_win_df):
    """
    Splits the dataset into training and testing sets based on specific years.

    This version does not use random sampling. Instead:
    - Data from the years 2020, 2021, and 2022 is used for testing.
    - All other years are used for training.
    - The 'won' and 'odds' columns are removed from the feature set prior to splitting.
    
    Parameters:
        df (pandas.DataFrame): The full DataFrame containing features, including 'year', 'won', and 'odds'.
        odds_win_df (pandas.DataFrame): A two-column DataFrame with 'odds' and 'won' values.

    Returns:
        tuple: A six-part tuple containing:
            - X_train (pandas.DataFrame): Training features.
            - X_test (pandas.DataFrame): Testing features.
            - y_train (pandas.Series): Training target values ('won').
            - y_test (pandas.Series): Testing target values ('won').
            - odds_win_train (pandas.DataFrame): Training odds and outcomes.
            - odds_win_test (pandas.DataFrame): Testing odds and outcomes.
    """
    # Create a copy 
    df_new = df.copy()

    # Remove target and odds columns from feature set
    df_new = df_new.drop(columns=['won', 'odds'])

    # Split based on year
    train_indexes = df_new[~(df_new['year'].isin([2020, 2021, 2022]))].index
    test_indexes = df_new[df_new['year'].isin([2020, 2021, 2022])].index

    # Define X_train and X_test
    X_train = df_new.loc[train_indexes]
    X_test = df_new.loc[test_indexes]

    # Define odds_win_train and odds_win_test
    odds_win_train = odds_win_df.loc[train_indexes]
    odds_win_test = odds_win_df.loc[test_indexes]

    # Define y_train and y_test 
    y_train = odds_win_train['won']
    y_test = odds_win_test['won']

    return X_train, X_test, y_train, y_test, odds_win_train, odds_win_test


In [287]:
# Run the function
X_train, X_test, y_train, y_test, odds_win_train, odds_win_test = create_train_test_splits(player_match_df, odds_win_df)

We can look at what features correlate the most with our target variable:

In [288]:
# View correlation values
top_corr_vals = abs(player_match_df.drop(columns='odds').corr()['won']).sort_values(ascending=False)
top_corr_vals.head(15)

won                                             1.000000
current_vsTop_win_rate__previous_year_diff      0.341815
pts_diff                                        0.317439
opp_current_vsTop_win_rate__previous_year       0.304896
current_series_win_rate__previous_year_diff     0.302753
player_current_vsTop_win_rate__previous_year    0.298852
current_vsTop_wins__previous_year_diff          0.298152
vsTop50_pct__previous_year_diff                 0.288245
vsTop100_pct__previous_year_diff                0.277424
grand_slam_pct__previous_year_diff              0.266510
main_tour_pct__previous_year_diff               0.266145
opp_current_vsTop_wins__previous_year           0.252394
masters_pct__previous_year_diff                 0.251940
vsTop20_pct__previous_year_diff                 0.248198
rank_diff                                       0.241616
Name: won, dtype: float64

### Function to reduce number of features based on correlation

We will now build a function that reduces the number of features iteratively based on set correlation thresholds, and trains a couple of models on each threshold to find the correlation threshold that leads to the best performing model, and returns the filtered dataset based on this optimal threshold:

In [289]:
def optimise_corr_lim_with_models(X_train, X_test, y_train, y_test):
    """
    Optimises the correlation threshold for feature selection based on model performance.

    This function:
    - Iteratively tests correlation thresholds (0.01 to 0.24) to filter features based on their
      correlation with the target.
    - For each threshold, trains two models (XGBoost and Random Forest) using only the filtered features.
    - Evaluates each model using AUC (Area Under the ROC Curve).
    - Tracks and returns the best-performing model and correlation threshold.
    - Returns filtered versions of the training and test sets using the optimal threshold.

    Parameters:
        X_train (pandas.DataFrame): Training features, including a 'year' column which is removed.
        X_test (pandas.DataFrame): Testing features, including a 'year' column which is removed.
        y_train (pandas.Series): Training labels (binary classification).
        y_test (pandas.Series): Testing labels (binary classification).

    Returns:
        tuple: A six-part tuple containing:
            - best_corr_lim (float): The optimal correlation threshold.
            - best_auc (float): The highest AUC score achieved.
            - best_model_name (str): The name of the best-performing model.
            - X_train_reduced (pandas.DataFrame): Reduced training feature set.
            - X_test_reduced (pandas.DataFrame): Reduced testing feature set.
            - results_df (pandas.DataFrame): A DataFrame with AUC results for each model and threshold.
    """
    corr_lims = [i / 100 for i in range(2, 25)]  # 0.01 to 0.24

    best_corr_lim = None
    best_auc = 0
    best_model_name = None

    X_train_new = X_train.drop(columns="year")
    X_test_new = X_test.drop(columns="year")

    results = []  # To store results for each model and threshold

    for idx, corr_lim in enumerate(corr_lims):
        if idx % 10 == 0:
            print(f"Processing iteration {idx} with corr_lim: {corr_lim}")

        # Compute correlations and filter features based on the threshold
        correlations = X_train_new.corrwith(y_train)
        features_to_use = correlations[abs(correlations) > corr_lim].index.to_list()

        if not features_to_use:
            print(f"No features left with corr_lim {corr_lim}. Skipping this iteration.")
            continue

        X_train_filtered = X_train_new[features_to_use]
        X_test_filtered = X_test_new[features_to_use]

        # Define models
        models = {
            "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
            "RandomForest": RandomForestClassifier(random_state=42)
        }

        for model_name, model in models.items():
            # Train and evaluate the model
            model.fit(X_train_filtered, y_train)
            y_pred = model.predict_proba(X_test_filtered)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred)

            results.append({
                "corr_lim": corr_lim,
                "model": model_name,
                "auc": auc_score
            })

            if auc_score > best_auc:
                best_auc = auc_score
                best_corr_lim = corr_lim
                best_model_name = model_name

    # Get the best features based on the optimal correlation limit
    correlations = X_train_new.corrwith(y_train)
    best_features = correlations[abs(correlations) > best_corr_lim].index.to_list()
    X_train_reduced = X_train_new[best_features]
    X_test_reduced = X_test_new[best_features]

    removed_features = X_train.shape[1] - X_train_reduced.shape[1]
    print(f"Number of features removed with corr_lim {best_corr_lim}: {removed_features}")
    print(f"Best Model: {best_model_name} with AUC: {best_auc}")

    return best_corr_lim, best_auc, best_model_name, X_train_reduced, X_test_reduced, pd.DataFrame(results)


In [290]:
# Run the function
best_corr_lim, best_auc, best_model_name, X_train_reduced, X_test_reduced, results = optimise_corr_lim_with_models(X_train, X_test, y_train, y_test)

Processing iteration 0 with corr_lim: 0.02
Processing iteration 10 with corr_lim: 0.12
Processing iteration 20 with corr_lim: 0.22
Number of features removed with corr_lim 0.02: 10
Best Model: XGBoost with AUC: 0.7248402434737563


We will now build a function that does the same as above, but instead removes features based on different thresholds of colinearity:

In [None]:
def optimise_colin_lim_with_models(X_train, X_test, y_train, y_test):
    """
    Optimises the collinearity threshold to remove highly correlated features and selects 
    the best-performing model based on AUC.

    This function:
    - Iterates over a range of collinearity limits (from 0.60 to 0.95).
    - For each limit, identifies and drops features in the training and test sets that are 
      highly correlated above the threshold.
    - Trains and evaluates two models (Random Forest and XGBoost) on the reduced datasets.
    - Selects the model and threshold combination yielding the highest AUC score.
    - Applies the optimal collinearity threshold to return final reduced datasets.

    Parameters:
        X_train (pandas.DataFrame): Training feature set.
        X_test (pandas.DataFrame): Testing feature set.
        y_train (pandas.Series): Training labels.
        y_test (pandas.Series): Testing labels.

    Returns:
        tuple: A five-part tuple containing:
            - best_colin_lim (float): The collinearity threshold that gave the best result.
            - best_auc (float): The highest AUC score achieved.
            - best_model_name (str): The name of the best-performing model.
            - X_train_reduced (pandas.DataFrame): Reduced training dataset.
            - X_test_reduced (pandas.DataFrame): Reduced testing dataset.
    """
    colin_lims = [i / 100 for i in range(60, 96)]  # 0.6 to 0.95

    best_colin_lim = None
    best_auc = 0
    best_model_name = None

    for idx, colin_lim in enumerate(colin_lims):
        if idx % 10 == 0:
            print(f"Processing iteration {idx} with colin_lim: {colin_lim}")

        # Create a correlation matrix
        corr_matrix = X_train.corr().abs()

        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Find features to drop
        to_drop = [column for column in upper.columns if any(upper[column] > colin_lim)]

        # Drop features from training and validation set
        X_train_filtered = X_train.drop(to_drop, axis=1)
        X_test_filtered = X_test.drop(to_drop, axis=1)

        # Define models
        models = {
            "RandomForest": RandomForestClassifier(random_state=42),
            "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
        }

        for model_name, model in models.items():
            # Train model
            model.fit(X_train_filtered, y_train)

            # Predict and calculate AUC on the validation set
            y_pred = model.predict_proba(X_test_filtered)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred)

            if auc_score > best_auc:
                best_auc = auc_score
                best_colin_lim = colin_lim
                best_model_name = model_name

    # Apply best colinearity limit
    corr_matrix = X_train.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > best_colin_lim)]
    X_train_reduced = X_train.drop(to_drop, axis=1)
    X_test_reduced = X_test.drop(to_drop, axis=1)

    # Print the number of removed features
    removed_features = X_train.shape[1] - X_train_reduced.shape[1]
    print(f"Number of features removed with colin_lim {best_colin_lim}: {removed_features}")
    print(f"Best Model: {best_model_name} with AUC: {best_auc}")

    return best_colin_lim, best_auc, best_model_name, X_train_reduced, X_test_reduced


In [None]:
# Run the function
best_colin_lim, best_auc, best_model_name, X_train, X_test = optimise_colin_lim_with_models(X_train_reduced, X_test_reduced, y_train, y_test)
print("Best Colinearity Limit:", best_colin_lim, "with AUC:", best_auc, "using model:", best_model_name)

Processing iteration 0 with colin_lim: 0.6
Processing iteration 10 with colin_lim: 0.7
Processing iteration 20 with colin_lim: 0.8
Processing iteration 30 with colin_lim: 0.9
Number of features removed with colin_lim 0.7: 203
Best Model: RandomForest with AUC: 0.7299330790761107
Best Colinearity Limit: 0.7 with AUC: 0.7299330790761107 using model: RandomForest


We can now view how many columns reamin to build our function:

In [None]:
print(f'Number of columns remaining in X_train: {X_train.shape[1]}')

Number of columns remaining in X_train: 81


We can save these dataframes to files:

In [None]:
# Save to files
X_train.to_csv(r'train_test_datasets\X_train.csv', index=False)
X_test.to_csv(r'train_test_datasets\X_test.csv', index=False)
y_train.to_csv(r'train_test_datasets\y_train.csv', index=False)
y_test.to_csv(r'train_test_datasets\y_test.csv', index=False)
odds_win_train.to_csv(r'train_test_datasets\odds_win_train.csv', index=False)
odds_win_test.to_csv(r'train_test_datasets\odds_win_test.csv', index=False)