# Sports Team Ranking
## Data Processing
This notebook is used for data exploration and for processing the data that will be analyzed. <br/>
<br/>
Originally coded by Rhys-Jasper León <br/>
Last updated 2025 November 10

In [1]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy.linalg as la
import scipy.stats as stats

In [2]:
# import league data
df = pd.read_csv('mlb_2025_season.csv')
# turn date column into datetime object
df['date'] = pd.to_datetime(df['date'])

df.head()

Unnamed: 0,date,away,away-score,home-score,home,win,loss
0,2025-03-18,LAD,4,1,CHC,LAD,CHC
1,2025-03-19,LAD,6,3,CHC,LAD,CHC
2,2025-03-27,NYM,1,3,HOU,HOU,NYM
3,2025-03-27,BAL,12,2,TOR,BAL,TOR
4,2025-03-27,MIN,3,5,STL,STL,MIN


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2430 entries, 0 to 2429
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        2430 non-null   datetime64[ns]
 1   away        2430 non-null   object        
 2   away-score  2430 non-null   int64         
 3   home-score  2430 non-null   int64         
 4   home        2430 non-null   object        
 5   win         2430 non-null   object        
 6   loss        2430 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 133.0+ KB


In [4]:
nrows, ncols = df.shape
print(f'The dataset has {nrows} rows and {ncols} columns.')

The dataset has 2430 rows and 7 columns.


In [5]:
# import list of teams
teams = pd.read_csv('mlb_teams.csv')
teams.head()

Unnamed: 0,team,abbr
0,Arizona Diamondbacks,AZ
1,Athletics,ATH
2,Atlanta Braves,ATL
3,Baltimore Orioles,BAL
4,Boston Red Sox,BOS


In [6]:
teams_list = teams['abbr']
# teams_list

In [7]:
# count number of wins for each team
win_counts = df['win'].value_counts()
# win_counts

In [None]:
# count number of points for each team
points = pd.Series(0, index=teams_list)
# print(points)

for team in teams_list:
    for row in range(nrows):
        if df.loc[row, 'away'] == team:
            points[team] += df.loc[row, 'away-score']
        elif df.loc[row, 'home'] == team:
            points[team] += df.loc[row, 'home-score']

# points

abbr
AZ     791
ATH    733
ATL    724
BAL    677
BOS    786
CHC    793
CWS    647
CIN    716
CLE    643
COL    597
DET    758
HOU    686
KC     651
LAA    673
LAD    825
MIA    709
MIL    806
MIN    678
NYM    766
NYY    849
PHI    778
PIT    583
SD     702
SF     705
SEA    766
STL    689
TB     714
TEX    684
TOR    798
WSH    687
dtype: int64

In [None]:
# pre-allocate matrix
mlb_matrix = pd.DataFrame(0, index=teams_list, columns=teams_list)
# mlb_matrix

abbr,AZ,ATH,ATL,BAL,BOS,CHC,CWS,CIN,CLE,COL,...,PHI,PIT,SD,SF,SEA,STL,TB,TEX,TOR,WSH
abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BAL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BOS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CWS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CIN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CLE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# fill matrix with head-to-head results
# go down teams list
for team in teams_list:
    # go through each row in dataset
    for row in range(nrows):
        # if team is away team, add away score to ij and add home score to ji
        if df.loc[row, 'away'] == team:
            mlb_matrix.loc[team, df.loc[row, 'home']] += df.loc[row, 'away-score']
            mlb_matrix.loc[df.loc[row, 'home'], team] += df.loc[row, 'home-score']
        # if team is home team, add home score to ij and add away score to ji
        elif df.loc[row, 'home'] == team:
            mlb_matrix.loc[team, df.loc[row, 'away']] += df.loc[row, 'home-score']
            mlb_matrix.loc[df.loc[row, 'away'], team] += df.loc[row, 'away-score']

mlb_matrix

abbr,AZ,ATH,ATL,BAL,BOS,CHC,CWS,CIN,CLE,COL,...,PHI,PIT,SD,SF,SEA,STL,TB,TEX,TOR,WSH
abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ,0,28,72,28,38,86,34,46,20,180,...,60,26,108,126,46,66,30,54,28,60
ATH,22,0,34,60,32,18,70,42,46,36,...,28,8,30,36,98,26,54,88,68,46
ATL,68,28,0,18,54,42,42,64,34,64,...,88,32,36,42,16,64,22,18,24,152
BAL,16,40,28,0,92,14,48,28,64,56,...,16,16,34,42,42,26,142,46,160,36
BOS,26,52,42,128,0,14,56,44,84,58,...,24,20,32,28,42,72,116,46,112,54
CHC,84,70,44,18,22,0,84,110,22,58,...,44,108,50,46,40,160,28,38,12,60
CWS,16,42,46,28,62,50,0,20,64,34,...,36,54,18,14,38,24,74,48,38,46
CIN,66,18,50,74,34,106,20,0,44,64,...,52,94,48,38,36,108,30,28,46,54
CLE,20,52,10,62,60,12,112,42,0,42,...,14,36,8,16,32,12,42,36,48,52
COL,126,38,44,14,14,42,22,40,24,0,...,30,62,86,100,14,54,16,10,12,76
