In [1]:
# Analyze the Torvik data file.
# The data covers the 2016-24 seasons (2020 is excluded because of COVID).
# Goal: look for correlations between the column fields and the "Elite 8" flag that could help predict it.

In [2]:
# import lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [3]:
# Load the L0 Torvik CSV and take a first look at it.
df_torvik = pd.read_csv('data/barttorvik_analyze_L0.csv')

display(df_torvik.head(5))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df_torvik.info()


Unnamed: 0,Rk,Team,Elite 8,Final 4,Seed,Conf,G,Win,AdjOE,AdjDE,...,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,Adj T.,WAB,bluePower
0,1.0,Louisville,,,4,Amer,34.0,29,119.2,88.2,...,38.2,53.0,43.9,37.1,29.1,35.6,30.7,69.0,5.4,5
1,,"4 seed, Sweet Sixteen",,,Arizo,,,15,7.0,2.0,...,123.0,23.0,28.0,62.0,3.0,97.0,96.0,60.0,15.0,1
2,2.0,Arizona,1.0,,1,P12,34.0,30,115.0,86.7,...,34.5,50.9,40.5,35.6,31.4,26.5,26.2,64.6,9.5,8
3,,"1 seed, Elite Eight",,,Flori,,,15,25.0,1.0,...,57.0,71.0,2.0,113.0,40.0,323.0,13.0,271.0,2.0,1
4,3.0,Florida,1.0,1.0,1,SEC,34.0,32,115.9,88.8,...,32.6,51.3,42.9,36.8,33.3,35.3,31.5,63.4,11.8,8


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7949 entries, 0 to 7948
Data columns (total 28 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rk         4047 non-null   object 
 1   Team       4795 non-null   object 
 2   Elite 8    88 non-null     float64
 3   Final 4    44 non-null     float64
 4   Seed       4417 non-null   object 
 5   Conf       4047 non-null   object 
 6   G          4047 non-null   object 
 7   Win        7585 non-null   object 
 8   AdjOE      7949 non-null   object 
 9   AdjDE      7949 non-null   object 
 10  Barthag    7949 non-null   object 
 11  EFG%       7949 non-null   object 
 12  EFGD%      7949 non-null   object 
 13  TOR        7949 non-null   object 
 14  TORD       7949 non-null   object 
 15  ORB        7949 non-null   object 
 16  DRB        7949 non-null   object 
 17  FTR        7949 non-null   object 
 18  FTRD       7949 non-null   object 
 19  2P%        7949 non-null   object 
 20  2P%D    

None

In [4]:
# Keep only the rows that have a value in "Rk"; rows where it is NaN are discarded.
df_torvik = df_torvik.dropna(subset=["Rk"])


In [5]:
# Round flags ("Elite 8" and "Final 4"): NaN is replaced by 0 and the column is cast to int.
for flag_col in ("Elite 8", "Final 4"):
    df_torvik[flag_col] = df_torvik[flag_col].fillna(0).astype(int)

In [6]:
# Discard rows whose "Rk" cell is the literal text "Rk"
# (presumably header rows repeated inside the data - confirm against the CSV).
is_header_text = df_torvik["Rk"].eq("Rk")
df_torvik = df_torvik[~is_header_text]


In [7]:
# Convert "Seed" to text and trim leading/trailing whitespace from every value.
seed_text = df_torvik["Seed"].astype(str)
df_torvik["Seed"] = seed_text.str.strip()


In [8]:
# Drop the "Rk" and "G" columns ("Team" stays in the frame).
df_torvik = df_torvik.drop(["Rk", "G"], axis="columns")



In [9]:
# Columns that must be numeric. They are parsed with pd.to_numeric;
# any value that cannot be parsed becomes NaN (errors="coerce").
numbers = [
    "Seed", "Win",
    "AdjOE", "AdjDE", "Barthag",
    "EFG%", "EFGD%",
    "TOR", "TORD",
    "ORB", "DRB",
    "FTR", "FTRD",
    "2P%", "2P%D",
    "3P%", "3P%D",
    "3PR", "3PRD",
    "Adj T.", "bluePower",
]

numeric_values = df_torvik[numbers].apply(pd.to_numeric, errors="coerce")
df_torvik[numbers] = numeric_values


In [10]:
# Keep only rows with a non-missing, positive "Seed"; every other row is removed
# (per the original note, teams without a seed did not make the tournament).
has_seed = df_torvik["Seed"].notna()
seed_is_positive = df_torvik["Seed"].gt(0)
df_torvik = df_torvik[has_seed & seed_is_positive]


In [11]:
# Flip the seed scale with new = max - old + 1, so the numerically lowest seed
# ends up with the highest value. ("Rk" was dropped earlier, so only "Seed" is inverted.)
largest_seed = df_torvik["Seed"].max()
df_torvik["Seed"] = largest_seed - df_torvik["Seed"] + 1



In [12]:
# Any "Win" value equal to 0 is overwritten with 12.
zero_win_rows = df_torvik["Win"].eq(0)
df_torvik.loc[zero_win_rows, "Win"] = 12

In [13]:
# Normalise the "Conf" label: a cell that is exactly "ind" (in any letter case) becomes "Ind".
conf_labels = df_torvik["Conf"]
df_torvik["Conf"] = conf_labels.str.replace(
    pat=r"^ind$",
    repl="Ind",
    case=False,
    regex=True,
)

In [14]:
# One-hot encode "Conf" into one indicator column per conference.
# drop_first is left at its default (False), so every category keeps its own column.
df_torvik = pd.get_dummies(data=df_torvik, columns=["Conf"])



In [15]:
# Add the binary flag column "AdjOD": 1 when a team has both AdjOE above
# ADJ_OE_MIN and AdjDE below ADJ_DE_MAX, otherwise 0.
ADJ_OE_MIN = 116  # AdjOE must be strictly greater than this
ADJ_DE_MAX = 96   # AdjDE must be strictly less than this

adj_od_flag = np.where(
    (df_torvik["AdjOE"] > ADJ_OE_MIN) & (df_torvik["AdjDE"] < ADJ_DE_MAX),
    1,
    0,
)

# Insert the flag immediately before "AdjDE" (i.e. right after "AdjOE"). Looking the
# position up by name gives the same slot as the previous hard-coded index 6 and
# stays correct if the columns in front of it ever change.
df_torvik.insert(df_torvik.columns.get_loc("AdjDE"), "AdjOD", adj_od_flag)

# Optional check: list the rows where the flag is set to compare them with "Elite 8".
#df_adj_od_1 = df_torvik[df_torvik["AdjOD"] == 1]
#display(df_adj_od_1.head(25))


In [16]:
# Remove the "WAB" column from the frame.
df_torvik = df_torvik.drop("WAB", axis="columns")


In [17]:
# Convert the conference indicator columns from True/False to 1/0.

# Collect the name of every column whose label begins with "Conf".
CONF = list(filter(lambda label: label.startswith("Conf"), df_torvik.columns))

# Show which columns were picked up.
print(CONF)

conf_as_int = df_torvik[CONF].astype(int)
df_torvik[CONF] = conf_as_int


['Conf_A10', 'Conf_ACC', 'Conf_AE', 'Conf_ASun', 'Conf_Amer', 'Conf_B10', 'Conf_B12', 'Conf_BE', 'Conf_BSky', 'Conf_BSth', 'Conf_BW', 'Conf_CAA', 'Conf_CUSA', 'Conf_Horz', 'Conf_Ivy', 'Conf_MAAC', 'Conf_MAC', 'Conf_MEAC', 'Conf_MVC', 'Conf_MWC', 'Conf_NEC', 'Conf_OVC', 'Conf_P12', 'Conf_Pat', 'Conf_SB', 'Conf_SC', 'Conf_SEC', 'Conf_SWAC', 'Conf_Slnd', 'Conf_Sum', 'Conf_WAC', 'Conf_WCC']


In [18]:
# Log-transform every column listed in `numbers`, using log(x + 1).
df_torvik[numbers] = np.log(df_torvik[numbers] + 1)

display(df_torvik.head(5))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df_torvik.info()

Unnamed: 0,Team,Elite 8,Final 4,Seed,Win,AdjOE,AdjOD,AdjDE,Barthag,EFG%,...,Conf_P12,Conf_Pat,Conf_SB,Conf_SC,Conf_SEC,Conf_SWAC,Conf_Slnd,Conf_Sum,Conf_WAC,Conf_WCC
0,Louisville,0,0,2.639057,3.401197,4.789157,1,4.490881,0.677881,4.007333,...,0,0,0,0,0,0,0,0,0,0
2,Arizona,1,0,2.833213,3.433987,4.75359,0,4.473922,0.674168,3.962716,...,1,0,0,0,0,0,0,0,0,0
4,Florida,1,1,2.833213,3.496508,4.761319,0,4.497585,0.670646,3.983413,...,0,0,0,0,1,0,0,0,0,0
6,Duke,0,0,2.70805,3.295837,4.844974,0,4.593098,0.667214,4.007333,...,0,0,0,0,0,0,0,0,0,0
8,Virginia,0,0,2.833213,3.367296,4.737951,0,4.493121,0.665416,3.943522,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
Index: 748 entries, 0 to 7820
Data columns (total 57 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Team       748 non-null    object 
 1   Elite 8    748 non-null    int64  
 2   Final 4    748 non-null    int64  
 3   Seed       748 non-null    float64
 4   Win        748 non-null    float64
 5   AdjOE      748 non-null    float64
 6   AdjOD      748 non-null    int64  
 7   AdjDE      748 non-null    float64
 8   Barthag    748 non-null    float64
 9   EFG%       748 non-null    float64
 10  EFGD%      748 non-null    float64
 11  TOR        748 non-null    float64
 12  TORD       748 non-null    float64
 13  ORB        748 non-null    float64
 14  DRB        748 non-null    float64
 15  FTR        748 non-null    float64
 16  FTRD       748 non-null    float64
 17  2P%        748 non-null    float64
 18  2P%D       748 non-null    float64
 19  3P%        748 non-null    float64
 20  3P%D       748

None

In [19]:
# Interaction features: "bluePower" multiplied by "Barthag" and by "Seed".
# Note: all three inputs are in the `numbers` list, so they are already
# log-transformed by the time these products are computed.
for partner in ("Barthag", "Seed"):
    df_torvik[f"bluePower_{partner}"] = df_torvik["bluePower"] * df_torvik[partner]

# Show each new feature next to the two columns it was built from.
for partner in ("Barthag", "Seed"):
    display(df_torvik[["Team", "bluePower", partner, f"bluePower_{partner}"]].head())

Unnamed: 0,Team,bluePower,Barthag,bluePower_Barthag
0,Louisville,1.791759,0.677881,1.2146
2,Arizona,2.197225,0.674168,1.481299
4,Florida,2.197225,0.670646,1.47356
6,Duke,2.197225,0.667214,1.466019
8,Virginia,2.197225,0.665416,1.462069


Unnamed: 0,Team,bluePower,Seed,bluePower_Seed
0,Louisville,1.791759,2.639057,4.728556
2,Arizona,2.197225,2.833213,6.225206
4,Florida,2.197225,2.833213,6.225206
6,Duke,2.197225,2.70805,5.950194
8,Virginia,2.197225,2.833213,6.225206


In [20]:
# Write the processed frame to the L1 data folder as CSV.
# The row index is written too, since to_csv keeps it by default.
l1_csv_path = "../L1/data/barttorvik_analyze_L1.csv"
df_torvik.to_csv(l1_csv_path)