# Load and Process Match Data

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import warnings

# Ignore PerformanceWarning and UserWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

### Function to create a DataFrame from all yearly match files

First we will create a function that will allow us to create a DataFrame containing all match information. This is achieved by creating a DataFrame of each yearly dataset in the specified folder, and concatenating all of these together into one DataFrame. This data was downloaded from tennis-data.co.uk and we will only use data from 2002 as this was the first dataset that started recording Bet365 odds:

In [165]:
def get_df_from_folder(folder_path):
    """
    Reads all Excel files from a specified folder and combines them into a single Pandas DataFrame.

    Parameters:
    - folder_path (str): Path to the folder containing Excel files.

    Returns:
    - df (pd.DataFrame): A concatenated DataFrame containing all data from the Excel files in the folder.
    
    Notes:
    - Assumes all files in the folder are valid Excel files.
    - The resulting DataFrame has its index reset.
    """

    # Create an empty list to store DataFrames
    dfs = []

    # Iterate through each file in the specified folder
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)  # Construct full file path

        # Read the Excel file and append the DataFrame to the list
        df = pd.read_excel(file_path)
        dfs.append(df)

    # Concatenate all DataFrames into one
    df = pd.concat(dfs, ignore_index=True)  # ignore_index=True automatically resets index

    return df  # Return the combined DataFrame

In [166]:
# Get folder path
folder_path = 'match_datasets_atp/'

# Run above function to get dataframe
df = get_df_from_folder(folder_path)

df.head(2)

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,AAPT Championships,2001-12-31,International,Outdoor,Hard,1st Round,3.0,Arazi H.,...,,,,,,,,,,
1,1,Adelaide,AAPT Championships,2001-12-31,International,Outdoor,Hard,1st Round,3.0,Carlsen K.,...,,,,,,,,,,


### Function to separate DataFrame into winners and losers DataFrames

Next we will create a function that:
- Extracts only the relevant columns for our analysis
- Creates two copied of the subsequent DataFrame (one for winners and one for losers)
- Creates additional features rank_diff and pts_diff which contain the difference in rank and points for the player compared to their opponent. 
- Creates a column showing whether they won or not (and does similar for whether they won in straight sets)
- Create features based on a 'score' value where a score is positive or negative based on whether a player won the match, and inversely proportional to the rank or points 
difference between competitors (and similar features for straight set scores)
- Renames columns so that they are consistent for both datasets (i.e. 'Winner' becomes 'player' for winners dataset and 'Loser' becomes 'player' for losers dataset)

In [167]:
def create_winner_loser_dfs(df):
    """
    Processes match data to create separate DataFrames for winners and losers with additional features.

    Parameters:
    - df (pd.DataFrame): A DataFrame containing tennis match results with winner/loser details.

    Returns:
    - df_winners (pd.DataFrame): A DataFrame containing match data from the winner's perspective.
    - df_losers (pd.DataFrame): A DataFrame containing match data from the loser's perspective.
    
    Notes:
    - Creates rank and points differences as features.
    - Generates scores based on match outcomes, ranking, and points.
    - Includes special scores for straight-sets wins.
    """

    # Convert relevant columns to numeric (forcing errors to NaN if they exist)
    numeric_cols = ['WRank', 'LRank', 'WPts', 'LPts', 'Wsets', 'Lsets', 'B365W', 'B365L']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Select relevant columns
    df2 = df[['Winner', 'Loser', 'Series', 'Surface', 'Round', 'Date', 'WRank', 'LRank', 
              'WPts', 'LPts', 'Wsets', 'Lsets', 'B365W', 'B365L']]
    
    # Create copies for winner and loser perspectives
    df_winners = df2.copy()
    df_losers = df2.copy()

    # Drop odds columns corresponding to the opposite outcome
    df_winners = df_winners.drop('B365L', axis=1)
    df_losers = df_losers.drop('B365W', axis=1)

    # Compute ranking difference (positive if player faces a higher-ranked opponent)
    df_winners['rank_diff'] = df_winners['LRank'] - df_winners['WRank']
    df_losers['rank_diff'] = df_losers['WRank'] - df_losers['LRank']

    # Compute points difference (higher positive values indicate stronger performance)
    df_winners['pts_diff'] = df_winners['WPts'] - df_winners['LPts']
    df_losers['pts_diff'] = df_losers['LPts'] - df_losers['WPts']

    # Create target variable: 'won' (1 for winners, 0 for losers)
    df_winners['won'] = 1
    df_losers['won'] = 0

    # Compute ranking and points scores for evaluating match difficulty
    lowest_rank = df[['WRank', 'LRank']].max().max()  # Get the highest rank value
    highest_pts = df[['WPts', 'LPts']].max().max()  # Get the highest points value

    # Adjust ranking scores based on difficulty of opponent
    df_winners['rank_score'] = -df_winners['rank_diff'] + lowest_rank
    df_losers['rank_score'] = -df_losers['rank_diff'] - lowest_rank

    # Adjust points scores based on difficulty of opponent
    df_winners['pts_score'] = -df_winners['pts_diff'] + highest_pts
    df_losers['pts_score'] = -df_losers['pts_diff'] - highest_pts

    # Identify straight-set wins (winner wins without losing a set)
    df_winners['straight_sets'] = np.where(df_winners['Lsets'] == 0, 1, 0)
    df_losers['straight_sets'] = np.where(df_losers['Lsets'] == 0, -1, 0)

    # Compute ranking and points scores specifically for straight-set wins
    df_winners['ss_rank_score'] = np.where(df_winners['straight_sets'] == 1, -df_winners['rank_diff'] + lowest_rank, 0)
    df_losers['ss_rank_score'] = np.where(df_losers['straight_sets'] == -1, -df_losers['rank_diff'] - lowest_rank, 0)

    df_winners['ss_pts_score'] = np.where(df_winners['straight_sets'] == 1, -df_winners['pts_diff'] + lowest_rank, 0)
    df_losers['ss_pts_score'] = np.where(df_losers['straight_sets'] == -1, -df_losers['pts_diff'] - lowest_rank, 0)    

    # Drop set columns since they are no longer needed
    df_winners = df_winners.drop(['Wsets', 'Lsets'], axis=1)
    df_losers = df_losers.drop(['Wsets', 'Lsets'], axis=1)

    # Rename columns to unify the winner and loser perspectives
    df_winners = df_winners.rename(columns={
        'Winner': 'player', 'Loser': 'opponent', 
        'WRank': 'player_rank', 'LRank': 'opp_rank', 
        'WPts': 'player_pts', 'LPts': 'opp_pts', 
        'Date': 'date', 'B365W': 'odds'
    })
    
    df_losers = df_losers.rename(columns={
        'Loser': 'player', 'Winner': 'opponent', 
        'WRank': 'opp_rank', 'LRank': 'player_rank', 
        'WPts': 'opp_pts', 'LPts': 'player_pts', 
        'Date': 'date', 'B365L': 'odds'
    })

    return df_winners, df_losers


In [168]:
# Run above function
df_winners, df_losers = create_winner_loser_dfs(df)

In [169]:
df_winners.head(2)

Unnamed: 0,player,opponent,Series,Surface,Round,date,player_rank,opp_rank,player_pts,opp_pts,odds,rank_diff,pts_diff,won,rank_score,pts_score,straight_sets,ss_rank_score,ss_pts_score
0,Arazi H.,Kratochvil M.,International,Hard,1st Round,2001-12-31,25.0,46.0,,,,21.0,,1,4894.0,,0,0.0,0.0
1,Carlsen K.,Black W.,International,Hard,1st Round,2001-12-31,153.0,160.0,,,,7.0,,1,4908.0,,0,0.0,0.0


In [170]:
df_losers.head(2)

Unnamed: 0,opponent,player,Series,Surface,Round,date,opp_rank,player_rank,opp_pts,player_pts,odds,rank_diff,pts_diff,won,rank_score,pts_score,straight_sets,ss_rank_score,ss_pts_score
0,Arazi H.,Kratochvil M.,International,Hard,1st Round,2001-12-31,25.0,46.0,,,,-21.0,,0,-4894.0,,0,0.0,0.0
1,Carlsen K.,Black W.,International,Hard,1st Round,2001-12-31,153.0,160.0,,,,-7.0,,0,-4908.0,,0,0.0,0.0


In [171]:
print(df_winners.shape)
print(df_losers.shape)

(61431, 19)
(61431, 19)


We can see from the datasets above that we have successfully created two very similar datasets but with inverse values based on whether the data relates to the winner or the loser of the match. This means that our final dataset will essentially contain two rows per match, and the model will utimately be predicting both player's likelihood to win a match independent of each other.

### Function to conmbine winners and losers DataFrame

We can create a function to conbine these dataframes:

In [None]:


def combine_winners_losers_dfs(df_winners, df_losers):
    """
    Combines the winner and loser DataFrames into a single match-level DataFrame.

    Parameters:
    - df_winners (pd.DataFrame): DataFrame containing match data from the winner's perspective.
    - df_losers (pd.DataFrame): DataFrame containing match data from the loser's perspective.

    Returns:
    - match_df (pd.DataFrame): A unified DataFrame containing all matches with both winners and losers.
    
    Notes:
    - The function ensures consistent player/opponent name formatting.
    - The resulting DataFrame is sorted by player and date.
    - The index is reset after merging to maintain a clean DataFrame structure.
    """

    # Combine winner and loser DataFrames into one
    match_df = pd.concat([df_winners, df_losers])

    # Ensure consistent casing for player and opponent names
    match_df['player'] = match_df['player'].str.title()
    match_df['opponent'] = match_df['opponent'].str.title()

    # Sort matches by player name and match date
    match_df = match_df.sort_values(by=['player', 'date'])

    # Reset index after sorting
    match_df = match_df.reset_index(drop=True)

    return match_df


In [173]:
match_df = combine_winners_losers_dfs(df_winners, df_losers)

In [174]:
match_df.head(2)

Unnamed: 0,player,opponent,Series,Surface,Round,date,player_rank,opp_rank,player_pts,opp_pts,odds,rank_diff,pts_diff,won,rank_score,pts_score,straight_sets,ss_rank_score,ss_pts_score
0,Hajek J.,Ulihrach B.,International,Clay,1st Round,2006-09-26,79.0,362.0,506.0,92.0,1.5,283.0,414.0,1,4632.0,16536.0,1,4632.0,4501.0
1,Hajek J.,Verdasco F.,International,Clay,2nd Round,2006-09-28,79.0,28.0,506.0,1040.0,4.33,-51.0,-534.0,0,-4864.0,-16416.0,-1,-4864.0,-4381.0


### Function to ensure consistency for multiple variations of the same name

From some manual analysis, several names have been identified in the dataset which exist in different formats across different files. Therefore, we will build a function that renames these names to ensure consistency. 

In [175]:
def manually_set_names(df, old_name_col, new_name_col):
    """
    Manually updates player names in a DataFrame to a standardised format.

    Args:
        df (pd.DataFrame): The input DataFrame containing player names.
        old_name_col (str): The column name containing the original player names.
        new_name_col (str): The column name where the formatted player names will be stored.

    Returns:
        pd.DataFrame: A new DataFrame with updated player names in the specified column.
    """
    # Create a copy of the DataFrame to avoid modifying the original directly
    df_new = df.copy()

    # Manually set formatted names for specific players
    df_new.loc[df_new[old_name_col] == 'Herbert P-H.', new_name_col] = 'Herbert P.'
    df_new.loc[df_new[old_name_col] == 'Herbert P.H.', new_name_col] = 'Herbert P.'
    df_new.loc[df_new[old_name_col] == 'Herbert P-H', new_name_col] = 'Herbert P.'
    df_new.loc[df_new[old_name_col] == 'Herbert P.H', new_name_col] = 'Herbert P.'
    df_new.loc[df_new[old_name_col] == 'Moroni G.M.', new_name_col] = 'Moroni G.'
    df_new.loc[df_new[old_name_col] == 'Jc Aragone', new_name_col] = 'Aragone J.C.'
    df_new.loc[df_new[old_name_col] == 'Aragone Jc', new_name_col] = 'Aragone J.C.'
    df_new.loc[df_new[old_name_col] == 'Aragone J.', new_name_col] = 'Aragone J.C.'
    df_new.loc[df_new[old_name_col] == 'Silva F.F.', new_name_col] = 'Ferreira Silva F.'
    df_new.loc[df_new[old_name_col] == 'Silva F.', new_name_col] = 'Ferreira Silva F.'
    df_new.loc[df_new[old_name_col] == 'Struff J-L.', new_name_col] = 'Struff J.L.'
    df_new.loc[df_new[old_name_col] == 'Cedrik-Marcel Stebe', new_name_col] = 'Stebe C.M.'
    df_new.loc[df_new[old_name_col] == 'Ze Zhang', new_name_col] = 'Zhang Ze.'
    df_new.loc[df_new[old_name_col] == 'Zhang Z.', new_name_col] = 'Zhang Ze.'
    df_new.loc[df_new[old_name_col] == 'Zhang Ze', new_name_col] = 'Zhang Ze.'
    df_new.loc[df_new[old_name_col] == 'Varillas J. P.', new_name_col] = 'Varillas J.P.'
    df_new.loc[df_new[old_name_col] == 'Galan D.', new_name_col] = 'Galan D.E.'
    df_new.loc[df_new[old_name_col] == "O'Connell C.", new_name_col] = 'O Connell C.'
    df_new.loc[df_new[old_name_col] == 'Carreno-Busta P.', new_name_col] = 'Carreno Busta P.'
    df_new.loc[df_new[old_name_col] == 'Bautista Agut R.', new_name_col] = 'Bautista R.'
    df_new.loc[df_new[old_name_col] == 'Meligeni Alves F.', new_name_col] = 'Meligeni Rodrigues F'
    df_new.loc[df_new[old_name_col] == 'Dolgopolov O.', new_name_col] = 'Dolgopolov A.'
    df_new.loc[df_new[old_name_col] == 'Kohlschreiber P..', new_name_col] = 'Kohlschreiber P.'
    df_new.loc[df_new[old_name_col] == 'Ramos-Vinolas A.', new_name_col] = 'Ramos A.'
    df_new.loc[df_new[old_name_col] == 'Delbonis F.', new_name_col] = 'Del Bonis F.'
    df_new.loc[df_new[old_name_col] == 'Van D. Merwe I.', new_name_col] = 'Van Der Merwe I.'
    df_new.loc[df_new[old_name_col] == 'Viola Mat.', new_name_col] = 'Viola M.'
    df_new.loc[df_new[old_name_col] == 'Gimeno D.', new_name_col] = 'Gimeno-Traver D.'
    df_new.loc[df_new[old_name_col] == 'Guzman J.', new_name_col] = 'Guzman J.P.'
    df_new.loc[df_new[old_name_col] == 'Montanes A.', new_name_col] = 'Albert M.'
    df_new.loc[df_new[old_name_col] == 'Gallardo Valles M.', new_name_col] = 'Gallardo M.'
    df_new.loc[df_new[old_name_col] == 'Levine I.', new_name_col] = 'Levine J.'
    df_new.loc[df_new[old_name_col] == 'Dutra Da Silva R.', new_name_col] = 'Dutra Silva R.'
    df_new.loc[df_new[old_name_col] == 'Del Potro J.', new_name_col] = 'Del Potro J.M.'
    df_new.loc[df_new[old_name_col] == 'Gambill J. M.', new_name_col] = 'Gambill J.M.'
    df_new.loc[df_new[old_name_col] == 'Chela J.', new_name_col] = 'Chela J.I.'
    df_new.loc[df_new[old_name_col] == 'Lisnard J.R.', new_name_col] = 'Lisnard J.'
    df_new.loc[df_new[old_name_col] == 'Tseng C. H.', new_name_col] = 'Tseng C.H.'

    # Return the modified DataFrame
    return df_new


We can then run the function on both the player and the opponent columns of match_df

In [176]:
# Run functions
match_df = manually_set_names(match_df, 'player', 'player')
match_df = manually_set_names(match_df, 'opponent', 'opponent')

### Function to add fields for scores for previous matches

We previously made the features rank_score, pts_score etc which are based on the rank and points differences of the player and the opponent and whether the player won. However, if we are to create a model to predict the outcome of a match, we won't be able to use these features since they depend on the outcome. Therefore, instead we were create features showing the rank and points scores for the previous matches. 

In this function we create some additional features showing the previously engineered scores for each of the previous 10 matches for that player:

In [None]:
def create_prior_match_features(df):
    """
    Adds prior match statistics as new features to the DataFrame for each player.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing match-level data.

    Returns:
    - df_new (pd.DataFrame): A DataFrame with additional features indicating past match performance.

    Notes:
    - This function creates new columns for up to 10 previous matches per player.
    - The new features include past 'rank_score', 'pts_score', 'ss_rank_score', and 'ss_pts_score'.
    - Each feature is suffixed with "_X_match_ago", where X is the number of matches ago.
    """

    # Create a copy of the DataFrame to avoid modifying the original
    df_new = df.copy()

    # Features to track from previous matches
    features = ['rank_score', 'pts_score', 'ss_rank_score', 'ss_pts_score']

    # Iterate over unique players in the dataset
    for player in df_new['player'].unique():
        # Filter dataframe for the current player
        player_matches = df_new[df_new['player'] == player]

        # Shift the selected features to track past match performance
        previous_features = player_matches[features].shift(1)

        # Iterate over the number of previous matches (up to 10)
        for i in range(1, 11):
            for feature in features:
                df_new.loc[player_matches.index, f'{feature}_{i}_match_ago'] = previous_features[feature]
            
            # Shift again for the next iteration (moving further back in history)
            previous_features = previous_features.shift(1)

    return df_new


In [178]:
# Run the function
match_df = create_prior_match_features(match_df)

We can view some of the newly created fields below:

In [179]:
match_df.iloc[:,-40:-30].sample(3)

Unnamed: 0,rank_score_1_match_ago,pts_score_1_match_ago,ss_rank_score_1_match_ago,ss_pts_score_1_match_ago,rank_score_2_match_ago,pts_score_2_match_ago,ss_rank_score_2_match_ago,ss_pts_score_2_match_ago,rank_score_3_match_ago,pts_score_3_match_ago
86851,,,,,,,,,,
21874,-4864.0,-16257.0,-4864.0,-4222.0,-4894.0,-16785.0,-4894.0,-4750.0,4870.0,16800.0
37772,4904.0,16908.0,4904.0,4873.0,-4830.0,-15872.0,0.0,0.0,4829.0,16697.0


### Function to keep naming consistency with player dataset

Next, we will create a function to rename and reformat some of the data to make them consistent with our player dataset. We will also group the player and opponent ranks so that this also matches with the groupings in the player dataset:

In [None]:
def process_dataframe(df):
    """
    Cleans and standardises match data for further analysis.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing match data.

    Returns:
    - df_new (pd.DataFrame): A cleaned and standardised DataFrame with added features.

    Notes:
    - Converts date values into a 'year' column.
    - Standardises the 'Surface' column by converting values to lowercase.
    - Strips whitespace from player and opponent names.
    - Renames round and series names to align with standard naming conventions.
    - Removes round-robin matches from the dataset.
    - Categorises opponent rankings into predefined groups (e.g., vsTop5, vsTop10, etc.).
    """

    # Create a copy of the DataFrame to avoid modifying the original
    df_new = df.copy()

    # Create a year column from the date
    df_new['year'] = df_new['date'].dt.year

    # Convert surface to lowercase
    df_new['Surface'] = df_new['Surface'].str.lower()

    # Remove whitespace from player and opponent names
    df_new['player'] = df_new['player'].str.strip()
    df_new['opponent'] = df_new['opponent'].str.strip()

    # Rename rounds to standardize with player data
    df_new['Round'] = df_new['Round'].replace({
        '1st Round': 'round1', 
        '2nd Round': 'round2',
        '3rd Round': 'round3',
        '4th Round': 'round4', 
        'Quarterfinals': 'quarterfinal',
        'Semifinals': 'semifinal',
        'The Final': 'final'
    })

    # Rename series names to match standard categories
    df_new['Series'] = df_new['Series'].replace({
        'Masters 1000': 'masters', 
        'Grand Slam': 'grand_slam',
        'ATP250': 'main_tour',
        'ATP500': 'main_tour', 
        'Masters Cup': 'cup'
    })

    # Remove round-robin matches
    df_new = df_new[df_new['Round'] != 'Round Robin']

    # Function to categorise player rank into predefined groups
    def categorise_rank(rank):
        if rank <= 5:
            return 'vsTop5'
        elif rank <= 10:
            return 'vsTop10'
        elif rank <= 20:
            return 'vsTop20'
        elif rank <= 50:
            return 'vsTop50'
        elif rank <= 100:
            return 'vsTop100'
        else:
            return 'vsOther'

    # Apply ranking categorisation to player and opponent ranks
    df_new['player_vs_top'] = df_new['opp_rank'].apply(categorise_rank)
    df_new['opp_vs_top'] = df_new['player_rank'].apply(categorise_rank)

    return df_new

In [181]:
# Run the function
match_df = process_dataframe(match_df)

### Function to show columns with most missing values

We can then create a function to show us the columns of the dataset with the most null values:

In [None]:
def show_columns_with_most_nulls(df, top_n=10):
    """
    Displays the columns with the most null values in a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to analyse.
    - top_n (int, optional): The number of columns to display (default is 10).

    Returns:
    - null_counts (pd.DataFrame): A DataFrame containing the top columns with the most null values.

    Notes:
    - Calculates the number and percentage of null values for each column.
    - Sorts columns in descending order based on the number of missing values.
    - Prints the results and returns a DataFrame with the top columns.
    """

    # Count null values for each column
    null_counts = df.isnull().sum()

    # Compute percentage of missing values
    null_percent = (null_counts / len(df)) * 100

    # Create a DataFrame with the results
    null_summary = pd.DataFrame({
        'Null Count': null_counts,
        'Null Percentage': null_percent
    })

    # Sort by the highest null count
    null_summary = null_summary.sort_values(by='Null Count', ascending=False)

    # Display only the top_n columns with the most nulls
    top_nulls = null_summary.head(top_n)

    # Print the results
    print(f"\nTop {top_n} columns with the most null values:\n")

    return top_nulls


In [183]:
show_columns_with_most_nulls(match_df)


Top 10 columns with the most null values:



Unnamed: 0,Null Count,Null Percentage
pts_score_10_match_ago,28931,23.700336
pts_score_9_match_ago,28319,23.198984
pts_score_8_match_ago,27674,22.670599
pts_score_7_match_ago,27005,22.122553
pts_score_6_match_ago,26312,21.554846
pts_score_5_match_ago,25590,20.963382
pts_score_4_match_ago,24830,20.340788
pts_score_3_match_ago,24008,19.667404
pts_score_2_match_ago,23094,18.918653
pts_score_1_match_ago,22035,18.051118


### Function to drop null rows

It is to be expected thar the columns with the most null values are those showing scores for previous matches, since this would include up to the first 10 entries of each player, as well as if their opponent had fewer than 10 games in the dataset. However, since we only want to run this model on matches where we have sufficient data on all players, we will build a function to remove all rows with null values from the dataset. This should still leave us with a good sized dataset to build a model with:

In [None]:
def drop_null_rows(df):
    """
    Removes rows containing any null values from the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame from which null rows will be removed.

    Returns:
    - df (pd.DataFrame): The cleaned DataFrame with no null values.

    Notes:
    - Prints the initial and final row count to show how many rows were removed.
    - Ensures that no rows with missing data remain after execution.
    """

    # Count the initial number of rows
    initial_count = len(df)
    print(f"Previous number of rows in match_df: {initial_count}")
    
    # Drop rows containing any null values
    df = df.dropna()
    
    # Count the number of rows after removal
    final_count = len(df)
    
    # Calculate and print the number of removed rows
    removed_count = initial_count - final_count
    print(f"Number of removed rows: {removed_count}")
    print(f"New number of rows in match_df: {final_count}")  # Extra print statement
    
    return df


In [185]:
match_df = drop_null_rows(match_df)

Previous number of rows in match_df: 122070
Number of removed rows: 30843
New number of rows in match_df: 91227


We can now save and reload the new match dataset for future use:

In [None]:
# Save to file
match_df.to_csv('datasets\match_df.csv', index=False)