# Get Men's Barttorvik Ratings

Load the Barttorvik ratings from previous seasons to build features describing each team's prior strength. Assumes all data has been copied from the website (https://barttorvik.com/#) to Excel, as specified in the steps below.

Steps before this file:

1. Copy the table from the website, including only games played before the tournament start date and excluding the initial row with the D1 averages.
2. Make sure the REC column in Excel is formatted as text before pasting (otherwise Excel tries to convert the records into dates).
3. Paste into Excel with "Match Destination Formatting".
4. Save as a CSV named `barttorvik_<season>.csv` in the `../data/unprocessed/barttorvik/` folder.

In [1]:
import pandas as pd

# Show up to 100 columns when a frame is displayed.
pd.set_option('display.max_columns', 100)

# Adds one to the exclusive upper bound of the season range below,
# so True pulls the 2024 file in and False stops at 2023.
include_2024 = True

# Read one CSV per season and tag every row with the season it came from.
season_frames = []
for season in range(2012, 2024 + include_2024):
    if season == 2020:  # cancelled
        continue
    season_frame = pd.read_csv(f'../data/unprocessed/barttorvik/barttorvik_{season}.csv')
    season_frames.append(season_frame.assign(Season=season))

# Stack all seasons into a single frame with a fresh 0..n-1 index.
df = pd.concat(season_frames, ignore_index=True)

# Move Season to the first column.
season_column = df.pop('Season')
df.insert(0, 'Season', season_column)

df

Unnamed: 0,Season,RK,TEAM,CONF,G,REC,ADJOE,ADJDE,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,DRB,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,ADJ T.,WAB
0,2012,1,Kentucky,SEC,34,32–2,119.7,88.5,0.9702,53.4,41.6,17.2,18.1,38.4,31,40,25.6,52.5,38.8,37.1,32.1,27.3,30,66.1,11.3
1,2012,,"1 seed, CHAMPS",,,16–0,3,9,1,26,1,20,281,17,120,74,7,21,1,56,72,297,74,173,1
2,2012,2,Ohio St.,B10,34,27–7,115.5,85.5,0.9695,52.5,46.3,17.4,22.5,35.7,24.9,37,28.6,53.8,45.3,32.6,32.2,26.1,35.5,68.1,7.7
3,2012,,"2 seed, Final Four",,,13–5,9,1,2,50,51,25,54,51,2,157,26,11,70,242,76,316,261,77,8
4,2012,3,Kansas,B12,33,27–6,114.7,88.1,0.9542,54,43.9,19.6,20.7,34.9,28.6,41.1,34.3,54.1,40,35.8,34.7,30.5,32.2,67.9,8.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8611,2024,,,,,2–18,351,358,360,325,362,351,82,131,355,164,202,216,362,362,355,361,194,189,354
8612,2024,361,Coppin St.,MEAC,29,2-27,85,111.2,0.0437,42.1,51.3,22.9,21.8,27,38.6,31.1,38.3,41.9,51,28.3,34.5,34.4,37.6,66.4,-23
8613,2024,,,,,1–13,362,301,361,361,222,360,14,246,362,231,297,360,208,353,231,260,201,252,359
8614,2024,362,Mississippi Valley St.,SWAC,31,1-30,85.6,116,0.0297,42.5,55.6,24,17.2,27.9,35.7,33,41.6,41.9,55.6,29.2,37,30.4,36.5,64.6,-22.7


In [2]:
# Anything in RK that is not a number becomes NaN; in the loaded data the
# rows directly beneath each team (seed / per-stat ranks) have a blank RK.
df['RK'] = pd.to_numeric(df['RK'], errors='coerce')

# Keep only the rows that carry a rank and renumber the index from zero.
df = df.dropna(subset=['RK']).reset_index(drop=True)

# With the NaNs gone the rank can be stored as an integer.
df['RK'] = df['RK'].astype(int)

df

Unnamed: 0,Season,RK,TEAM,CONF,G,REC,ADJOE,ADJDE,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,DRB,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,ADJ T.,WAB
0,2012,1,Kentucky,SEC,34,32–2,119.7,88.5,0.9702,53.4,41.6,17.2,18.1,38.4,31,40,25.6,52.5,38.8,37.1,32.1,27.3,30,66.1,11.3
1,2012,2,Ohio St.,B10,34,27–7,115.5,85.5,0.9695,52.5,46.3,17.4,22.5,35.7,24.9,37,28.6,53.8,45.3,32.6,32.2,26.1,35.5,68.1,7.7
2,2012,3,Kansas,B12,33,27–6,114.7,88.1,0.9542,54,43.9,19.6,20.7,34.9,28.6,41.1,34.3,54.1,40,35.8,34.7,30.5,32.2,67.9,8.2
3,2012,4,Michigan St.,B10,34,27–7,112.7,86.7,0.9532,52.7,43,19.8,19.7,37.2,27.5,39,34.2,51.9,42.6,36.5,29.2,27.8,35.6,66.1,8.6
4,2012,5,North Carolina,ACC,34,29–5,115.8,89.6,0.9502,50,45.1,16.2,18.5,40.5,27.7,38.1,22,49.6,43.5,34,31.9,22.9,35.9,72.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,2024,358,Stonehill,NEC,30,4-27,90.4,114.2,0.0638,46.7,52.7,19.5,16.6,22.5,31,22.6,29.4,47.9,51.7,30.1,35.9,47,46.2,68.1,-22
4226,2024,359,St. Francis PA,NEC,28,8-22,93.1,118,0.062,47.2,53,21.2,17.1,32.9,31.3,32.6,35.4,45.7,52.9,33.2,35.4,35.1,37.1,65.5,-18.6
4227,2024,360,IUPUI,Horz,29,6-26,92.1,116.9,0.061,46.5,58.2,21.3,18.5,30,35.5,33.2,33.4,49.5,59,24.7,38,24.3,37.5,67.3,-21.6
4228,2024,361,Coppin St.,MEAC,29,2-27,85,111.2,0.0437,42.1,51.3,22.9,21.8,27,38.6,31.1,38.3,41.9,51,28.3,34.5,34.4,37.6,66.4,-23


In [3]:
# Parse the win-loss record, set column dtypes, and derive WIN% and ADJEM.

# Records arrive with either an en dash or a hyphen; normalise to a hyphen.
df['REC'] = df['REC'].str.replace('–', '-', regex=False)

# Split each "W-L" record once and place WINS and LOSSES just before REC.
# A record without a hyphen yields NaN losses, which the integer cast below
# rejects, so malformed records still fail loudly.
record_parts = df['REC'].str.split('-', regex=False)
df.insert(df.columns.get_loc('REC'), 'WINS', record_parts.str[0])
df.insert(df.columns.get_loc('REC'), 'LOSSES', record_parts.str[1])

# Every column that is neither text nor an integer count is a float stat.
text_columns = ['TEAM', 'CONF', 'REC']
int_columns = ['Season', 'RK', 'G', 'WINS', 'LOSSES']
excluded_columns = set(text_columns + int_columns)
float_columns = [column for column in df.columns if column not in excluded_columns]

# Assign whole columns instead of writing through df.loc[:, ...]: since
# pandas 2.0 a .loc assignment tries to set values in place and may keep the
# previous dtype, whereas column assignment replaces the column.
df[float_columns] = df[float_columns].astype(float)
df[int_columns] = df[int_columns].astype(int)

# Win percentage takes REC's position; the raw record text is then dropped.
decided_games = df['WINS'] + df['LOSSES']
df.insert(df.columns.get_loc('REC'), 'WIN%', df['WINS'] / decided_games)

del df['REC']

# ADJEM = ADJOE - ADJDE, placed right after ADJDE.
df.insert(df.columns.get_loc('ADJDE') + 1, 'ADJEM', df['ADJOE'] - df['ADJDE'])
# Further margin features, currently disabled:
# df.insert(df.columns.get_loc('EFGD%') + 1, 'EFG%M', df['EFG%'] - df['EFGD%'])
# df.insert(df.columns.get_loc('TORD') + 1, 'TORM', df['TOR'] - df['TORD'])
# df.insert(df.columns.get_loc('DRB') + 1, 'RBM', df['ORB'] - df['DRB'])
# df.insert(df.columns.get_loc('FTRD') + 1, 'FTRM', df['FTR'] - df['FTRD'])

df

Unnamed: 0,Season,RK,TEAM,CONF,G,WINS,LOSSES,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,DRB,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,ADJ T.,WAB
0,2012,1,Kentucky,SEC,34,32,2,0.941176,119.7,88.5,31.2,0.9702,53.4,41.6,17.2,18.1,38.4,31.0,40.0,25.6,52.5,38.8,37.1,32.1,27.3,30.0,66.1,11.3
1,2012,2,Ohio St.,B10,34,27,7,0.794118,115.5,85.5,30.0,0.9695,52.5,46.3,17.4,22.5,35.7,24.9,37.0,28.6,53.8,45.3,32.6,32.2,26.1,35.5,68.1,7.7
2,2012,3,Kansas,B12,33,27,6,0.818182,114.7,88.1,26.6,0.9542,54.0,43.9,19.6,20.7,34.9,28.6,41.1,34.3,54.1,40.0,35.8,34.7,30.5,32.2,67.9,8.2
3,2012,4,Michigan St.,B10,34,27,7,0.794118,112.7,86.7,26.0,0.9532,52.7,43.0,19.8,19.7,37.2,27.5,39.0,34.2,51.9,42.6,36.5,29.2,27.8,35.6,66.1,8.6
4,2012,5,North Carolina,ACC,34,29,5,0.852941,115.8,89.6,26.2,0.9502,50.0,45.1,16.2,18.5,40.5,27.7,38.1,22.0,49.6,43.5,34.0,31.9,22.9,35.9,72.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,2024,358,Stonehill,NEC,30,4,27,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,31.0,22.6,29.4,47.9,51.7,30.1,35.9,47.0,46.2,68.1,-22.0
4226,2024,359,St. Francis PA,NEC,28,8,22,0.266667,93.1,118.0,-24.9,0.0620,47.2,53.0,21.2,17.1,32.9,31.3,32.6,35.4,45.7,52.9,33.2,35.4,35.1,37.1,65.5,-18.6
4227,2024,360,IUPUI,Horz,29,6,26,0.187500,92.1,116.9,-24.8,0.0610,46.5,58.2,21.3,18.5,30.0,35.5,33.2,33.4,49.5,59.0,24.7,38.0,24.3,37.5,67.3,-21.6
4228,2024,361,Coppin St.,MEAC,29,2,27,0.068966,85.0,111.2,-26.2,0.0437,42.1,51.3,22.9,21.8,27.0,38.6,31.1,38.3,41.9,51.0,28.3,34.5,34.4,37.6,66.4,-23.0


In [4]:
# Columns removed from the feature set, grouped by kind.
rank_and_record_columns = ['RK', 'CONF', 'G', 'WINS', 'LOSSES']
shooting_split_columns = ['2P%', '2P%D', '3P%', '3P%D']
low_correlation_columns = ['DRB', '3PR', '3PRD']  # low correlation

# Further candidates, currently kept in the frame:
# 'EFG%', 'EFGD%', 'TOR', 'TORD', 'FTR', 'FTRD'
columns_to_drop = rank_and_record_columns + shooting_split_columns + low_correlation_columns

# Drop in place so df keeps referring to the same frame object.
df.drop(columns=columns_to_drop, inplace=True)

df

Unnamed: 0,Season,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2012,Kentucky,0.941176,119.7,88.5,31.2,0.9702,53.4,41.6,17.2,18.1,38.4,40.0,25.6,66.1,11.3
1,2012,Ohio St.,0.794118,115.5,85.5,30.0,0.9695,52.5,46.3,17.4,22.5,35.7,37.0,28.6,68.1,7.7
2,2012,Kansas,0.818182,114.7,88.1,26.6,0.9542,54.0,43.9,19.6,20.7,34.9,41.1,34.3,67.9,8.2
3,2012,Michigan St.,0.794118,112.7,86.7,26.0,0.9532,52.7,43.0,19.8,19.7,37.2,39.0,34.2,66.1,8.6
4,2012,North Carolina,0.852941,115.8,89.6,26.2,0.9502,50.0,45.1,16.2,18.5,40.5,38.1,22.0,72.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,2024,Stonehill,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0
4226,2024,St. Francis PA,0.266667,93.1,118.0,-24.9,0.0620,47.2,53.0,21.2,17.1,32.9,32.6,35.4,65.5,-18.6
4227,2024,IUPUI,0.187500,92.1,116.9,-24.8,0.0610,46.5,58.2,21.3,18.5,30.0,33.2,33.4,67.3,-21.6
4228,2024,Coppin St.,0.068966,85.0,111.2,-26.2,0.0437,42.1,51.3,22.9,21.8,27.0,31.1,38.3,66.4,-23.0


In [5]:
# Print each column's dtype and non-null count as a check on the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4230 entries, 0 to 4229
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   4230 non-null   int32  
 1   TEAM     4230 non-null   object 
 2   WIN%     4230 non-null   float64
 3   ADJOE    4230 non-null   float64
 4   ADJDE    4230 non-null   float64
 5   ADJEM    4230 non-null   float64
 6   BARTHAG  4230 non-null   float64
 7   EFG%     4230 non-null   float64
 8   EFGD%    4230 non-null   float64
 9   TOR      4230 non-null   float64
 10  TORD     4230 non-null   float64
 11  ORB      4230 non-null   float64
 12  FTR      4230 non-null   float64
 13  FTRD     4230 non-null   float64
 14  ADJ T.   4230 non-null   float64
 15  WAB      4230 non-null   float64
dtypes: float64(14), int32(1), object(1)
memory usage: 512.4+ KB


In [6]:
# Write the cleaned ratings to the preprocessed folder, leaving out the index.
# The path has no placeholders, so it is a plain string rather than an f-string.
df.to_csv('../data/preprocessed/barttorvik/barttorvik.csv', index=False)

# Bare expression: displayed as the cell's result once the write has returned.
'Done'

'Done'