# Feature Engineering
- [ ] Calculating Elo ratings back to 2000
- [x] Calculating blue chip ratios using recruiting rankings


In [2]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Downloading plotly-5.22.0-py3-none-any.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: plotly
Successfully installed plotly-5.22.0


In [1]:
# Get general dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
# Load the team-level and player-level recruiting tables.
team_recruiting_df = pd.read_csv('data/team_recruiting.csv')

# Player rows identify their program in a 'school' column; rename it to 'team'
# so both tables share the same key name.
player_recruiting_df = pd.read_csv('data/player_recruiting.csv')
player_recruiting_df = player_recruiting_df.rename(columns={'school': 'team'})

In [3]:
# Keep only team recruiting rows from 2007 onward, then preview the result.
team_recruiting_df = team_recruiting_df.loc[lambda teams: teams['year'].ge(2007)]
team_recruiting_df.head()

Unnamed: 0,team,year,rank,points
782,Florida,2007,1,313.05
783,USC,2007,2,295.06
784,Texas,2007,3,293.1
785,Tennessee,2007,4,284.42
786,LSU,2007,5,282.25


In [4]:
# Keep only player recruiting rows from 2007 onward, then preview the result.
player_recruiting_df = player_recruiting_df.loc[lambda players: players['year'].ge(2007)]
player_recruiting_df.head()

Unnamed: 0,name,year,star,team,state,ranking,rating
10573,Joe McKnight,2007,5,USC,LA,1.0,0.9997
10574,Jimmy Clausen,2007,5,Notre Dame,CA,2.0,0.9987
10575,Eric Berry,2007,5,Tennessee,GA,3.0,0.9985
10576,Marvin Austin,2007,5,North Carolina,DC,4.0,0.9977
10577,Ryan Mallett,2007,5,Michigan,TX,5.0,0.9976


### Get Blue Chip Ratios for Each Team
- Use recruiting classes from 2007 onward so that a four-class blue chip ratio (BCR) can be computed for 2010 and beyond

In [13]:
# Number of signees rated 4 stars or higher ("blue chips") per (year, team).
# The resulting Series keeps the name 'star' and a (year, team) MultiIndex.
blue_chip_counts = (
    player_recruiting_df
    .loc[lambda players: players['star'].ge(4)]
    .groupby(['year', 'team'])['star']
    .size()
)
blue_chip_counts.head()

year  team         
2007  Alabama          6
      Arizona          1
      Arizona State    1
      Arkansas         1
      Auburn           8
Name: star, dtype: int64

In [14]:
# Number of signees per (year, team), regardless of star rating.
# size() counts rows, so a signee with a missing 'star' value is still included.
total_player_counts = (
    player_recruiting_df
    .groupby(['year', 'team'])['star']
    .size()
)
total_player_counts.head()

year  team         
2007  Air Force         8
      Akron            20
      Alabama          21
      Arizona          10
      Arizona State    18
Name: star, dtype: int64

In [15]:
# Attach the per-class blue-chip and total signee counts to every team ranking row.
# Each count Series is named up front so the merged columns arrive as
# 'blue_chip' and 'total' directly.
blue_chip_df = (
    team_recruiting_df
    .merge(blue_chip_counts.rename('blue_chip'), how='left', on=['year', 'team'])
    .merge(total_player_counts.rename('total'), how='left', on=['year', 'team'])
)

# The left joins leave NaN where a (year, team) has no matching signees; treat those as 0.
blue_chip_df[['blue_chip', 'total']] = blue_chip_df[['blue_chip', 'total']].fillna(0)
blue_chip_df

Unnamed: 0,team,year,rank,points,blue_chip,total
0,Florida,2007,1,313.05,18.0,23.0
1,USC,2007,2,295.06,14.0,18.0
2,Texas,2007,3,293.10,16.0,24.0
3,Tennessee,2007,4,284.42,10.0,27.0
4,LSU,2007,5,282.25,14.0,24.0
...,...,...,...,...,...,...
3338,Harvard,2024,196,13.44,0.0,0.0
3339,Villanova,2024,197,12.50,0.0,0.0
3340,Grand Valley State,2024,198,11.70,0.0,0.0
3341,Northern Colorado,2024,200,11.06,0.0,0.0


In [16]:
# Calculate running sums for blue chip ratio.
# For each team, sum blue-chip and total signees over that team's four most recent
# rows (the current class plus the three before it), then divide.
#
# The frame is sorted by (team, year) before rolling so the window is always
# chronological, independent of the row order of the input CSV. The rolled result
# keeps the original row index, so assigning it back realigns to blue_chip_df's
# existing row order.
#
# NOTE(review): the window counts rows, not calendar years. If a team has no row for
# some year, its 4-row window spans more than four seasons -- confirm whether such
# teams should be excluded or gap years treated as empty classes.
rolling_sums = (
    blue_chip_df
    .sort_values(['team', 'year'])
    .groupby('team')[['blue_chip', 'total']]
    .rolling(window=4, min_periods=4)
    .sum()
    .reset_index(level=0, drop=True)  # drop the 'team' level, keep the original row index
)
blue_chip_df['blue_sums'] = rolling_sums['blue_chip']
blue_chip_df['total_sums'] = rolling_sums['total']
blue_chip_df['blue_chip_ratio'] = blue_chip_df['blue_sums'] / blue_chip_df['total_sums']

# Filter out until 2010 and before 2024 to get relevant results
blue_chip_2010_df = blue_chip_df[(blue_chip_df.year >= 2010) & (blue_chip_df.year < 2024)].copy()
# The ratio is NaN when a team has fewer than four rows in its window (min_periods=4)
# or when the summed total is 0 (0/0); both cases are reported as a ratio of 0.
# 'blue_sums' and 'total_sums' themselves are left as NaN in those rows.
blue_chip_2010_df['blue_chip_ratio'] = blue_chip_2010_df['blue_chip_ratio'].fillna(0)

blue_chip_2010_df

Unnamed: 0,team,year,rank,points,blue_chip,total,blue_sums,total_sums,blue_chip_ratio
352,Florida,2010,1,324.62,22.0,28.0,64.0,83.0,0.771084
353,Texas,2010,2,312.07,21.0,24.0,67.0,88.0,0.761364
354,USC,2010,3,294.73,15.0,17.0,57.0,70.0,0.814286
355,Alabama,2010,4,284.20,14.0,22.0,61.0,100.0,0.610000
356,Oklahoma,2010,5,283.58,17.0,30.0,46.0,88.0,0.522727
...,...,...,...,...,...,...,...,...,...
3144,Kennesaw State,2023,175,12.37,0.0,1.0,0.0,15.0,0.000000
3145,East Tennessee State,2023,176,12.00,0.0,1.0,0.0,8.0,0.000000
3146,Chattanooga,2023,177,11.70,0.0,1.0,0.0,8.0,0.000000
3147,Northern Iowa,2023,178,9.00,0.0,1.0,0.0,14.0,0.000000


In [17]:
# Spot-check the rolling sums and ratios for a single, well-known team
blue_chip_2010_df.loc[blue_chip_2010_df['team'].eq('Ohio State')]

Unnamed: 0,team,year,rank,points,blue_chip,total,blue_sums,total_sums,blue_chip_ratio
369,Ohio State,2010,18,237.38,10.0,16.0,44.0,61.0,0.721311
481,Ohio State,2011,6,278.47,13.0,25.0,52.0,80.0,0.65
617,Ohio State,2012,5,286.13,16.0,23.0,55.0,89.0,0.617978
797,Ohio State,2013,2,303.35,19.0,24.0,58.0,88.0,0.659091
1008,Ohio State,2014,3,296.08,16.0,23.0,64.0,95.0,0.673684
1243,Ohio State,2015,7,279.6,15.0,26.0,66.0,96.0,0.6875
1471,Ohio State,2016,4,289.12,17.0,25.0,67.0,98.0,0.683673
1707,Ohio State,2017,2,312.14,19.0,22.0,67.0,96.0,0.697917
1940,Ohio State,2018,2,317.06,22.0,26.0,73.0,99.0,0.737374
2181,Ohio State,2019,14,261.18,12.0,17.0,70.0,90.0,0.777778


In [18]:
# Write the blue chip ratio table to disk; index=False leaves the row index out of the file.
blue_chip_2010_df.to_csv(
    'data/team_recruiting_w_blue_chip_ratios.csv',
    index=False,
)