# Sports Team Ranking
## Data Processing
This notebook is used for data exploration and for processing the data that will be analyzed. <br/>
<br/>
Originally coded by Rhys-Jasper León <br/>
Last updated 2025 November 15

In [1]:
# import libraries
import numpy as np
import pandas as pd

In [2]:
# set verbose mode
# when True, the cells below print/display their intermediate results;
# when False, those cells run without showing them
verbose = False

In [3]:
# import league data and convert the date column to datetime in one chained step
df = pd.read_csv('mlb_2025_season.csv').assign(
    date=lambda games: pd.to_datetime(games['date'])
)

# preview the first rows
df.head()

Unnamed: 0,date,away,away-score,home-score,home,win,loss
0,2025-03-18,LAD,4,1,CHC,LAD,CHC
1,2025-03-19,LAD,6,3,CHC,LAD,CHC
2,2025-03-27,NYM,1,3,HOU,HOU,NYM
3,2025-03-27,BAL,12,2,TOR,BAL,TOR
4,2025-03-27,MIN,3,5,STL,STL,MIN


In [4]:
# record the dataset dimensions (nrows is reused by the row loops further down)
nrows, ncols = len(df.index), len(df.columns)
print(f'The dataset has {nrows} rows and {ncols} columns.')

The dataset has 2430 rows and 7 columns.


In [5]:
# import list of teams
# (the 'abbr' column of this table is used below as the list of team codes)
teams = pd.read_csv('mlb_teams.csv')
# preview the first rows
teams.head()

Unnamed: 0,team,abbr
0,Arizona Diamondbacks,AZ
1,Athletics,ATH
2,Atlanta Braves,ATL
3,Baltimore Orioles,BAL
4,Boston Red Sox,BOS


In [6]:
# team abbreviations, in the order they appear in the teams table
teams_list = teams.loc[:, 'abbr']
if verbose:
    print(teams_list)

In [7]:
# count number of wins for each team
# value_counts tallies how many rows name each team in the 'win' column,
# ordered from most to fewest
win_counts = df['win'].value_counts()
if verbose:
    print(win_counts)

In [8]:
# count number of points for each team
# sum each team's score over its away games and over its home games
away_points = df.groupby('away')['away-score'].sum()
home_points = df.groupby('home')['home-score'].sum()

# combine the two totals; fill_value=0 covers a team that appears on only one
# side, and reindex keeps the order of teams_list (teams with no games get 0)
points = (
    away_points.add(home_points, fill_value=0)
    .reindex(teams_list, fill_value=0)
    .astype('int64')
)

if verbose:
    print(points)

In [9]:
# pre-allocate scores matrix
# square teams x teams table of zeros; the fill step below adds a team's
# score to the cell at (row = that team, column = its opponent)
scores_matrix = pd.DataFrame(0, index=teams_list, columns=teams_list)
if verbose:
    display(scores_matrix)

In [10]:
# fill matrix with head-to-head results
# scores_matrix.loc[i, j] = total score of team i in its games against team j

# start from zeros so that re-running this cell cannot accumulate on top of
# an earlier run
scores_matrix = pd.DataFrame(0, index=teams_list, columns=teams_list)

# visit each game exactly once and record both sides of it; scanning the
# schedule once per team while updating both ij and ji would add every
# game's scores twice
for away, home, away_score, home_score in zip(
    df['away'], df['home'], df['away-score'], df['home-score']
):
    # away team's score goes to ij, home team's score goes to ji
    scores_matrix.loc[away, home] += away_score
    scores_matrix.loc[home, away] += home_score

if verbose:
    display(scores_matrix)

In [11]:
# pre-allocate games and wins matrices
# two independent teams x teams tables of zeros
games_matrix, wins_matrix = (
    pd.DataFrame(0, index=teams_list, columns=teams_list) for _ in range(2)
)

In [12]:
# fill matrices with head-to-head results
# games_matrix.loc[i, j] = number of games between team i and team j
# wins_matrix.loc[i, j]  = number of those games won by team i

# start from zeros so that re-running this cell cannot accumulate on top of
# an earlier run
games_matrix = pd.DataFrame(0, index=teams_list, columns=teams_list)
wins_matrix = pd.DataFrame(0, index=teams_list, columns=teams_list)

# single pass over the games instead of one full scan per team
for away, home, winner in zip(df['away'], df['home'], df['win']):
    # the game counts once for each side
    games_matrix.loc[away, home] += 1
    games_matrix.loc[home, away] += 1
    # credit the win to whichever side is named in the 'win' column
    if winner == away:
        wins_matrix.loc[away, home] += 1
    elif winner == home:
        wins_matrix.loc[home, away] += 1

if verbose:
    display(games_matrix)
    display(wins_matrix)

In [13]:
# construct A matrix
# n[i] = total number of games played by team i (row sums of games_matrix)
n = games_matrix.sum(axis=1)

# A[i, j] = (wins of team i over team j) / (games played by team i), so that
# each row sums to that team's win percentage.
# axis=0 matches n against the row labels; the default (axis='columns') would
# instead divide column j by n[j], which is only the same when every team has
# played the same number of games.
# fillna(0) turns the 0/0 rows of teams with no games into zeros.
A_matrix = wins_matrix.divide(n, axis=0).fillna(0)

if verbose:
    display(A_matrix)

In [14]:
# check win percentages: add up each team's row of A_matrix
row_sums = A_matrix.sum(axis='columns')
if verbose:
    display(row_sums.sort_values(ascending=False))

In [15]:
# construct alternate A matrix
# for every game between i and j the same smoothed score share is credited to ij:
#     share[i, j] = (S_ij + 1) / (S_ij + S_ji + 2),   S = scores_matrix
# the share does not depend on the individual game, so summing it over the
# games_ij games between the two teams is just games_ij * share[i, j]
score_share = (scores_matrix + 1) / (scores_matrix + scores_matrix.T + 2)

# pairs that never met (and the diagonal) have games_ij == 0 and stay at 0.0
A_matrix_alt = games_matrix * score_share

if verbose:
    display(A_matrix_alt)

In [16]:
# average alternate score per game for each team: row total of A_matrix_alt
# over games played (n); teams with no games (0/0) are reported as 0
row_sums_alt = (A_matrix_alt.sum(axis='columns') / n).fillna(0)
if verbose:
    display(row_sums_alt.sort_values(ascending=False))