# Exploratory Data Analysis (2014 - Present)
In this EDA we'll look at all the data we have from 2014 onwards. This will include:

1. Recruiting class information, including blue-chip ratio
2. Previous success information, including team ELO ratings and number of wins
3. Returning talent metrics from previous season

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Get Recruiting with Season Stats Data

#### Team Records

In [2]:
# Load the per-year team records.
records_df = pd.read_csv('./data/team_records_by_year.csv')

# Keep only the 2014-and-later seasons so these rows line up with the other
# 2014+ tables when they are merged on year/team.
records_2014_df = records_df[records_df.year >= 2014].copy()

# Drop the home/away split columns, which are not needed here.
# `columns=` already identifies the axis, so no `axis=` argument is passed.
records_2014_df = records_2014_df.drop(
    columns=['home_wins', 'home_losses', 'away_wins', 'away_losses']
)
records_2014_df.head()

Unnamed: 0,team,team_id,year,conference,games_played,expected_wins,wins,losses
1657,Air Force,2005,2014,Mountain West,13,9.7,10,3
1658,Akron,2006,2014,Mid-American,12,5.4,5,7
1659,Alabama,333,2014,SEC,14,11.7,12,2
1660,Appalachian State,2026,2014,Sun Belt,12,8.1,7,5
1661,Arizona,12,2014,Pac-12,14,8.8,10,4


#### Recruiting

In [7]:
# Load the team recruiting table (includes the blue-chip ratio columns).
recruiting_df = pd.read_csv('./data/team_recruiting_w_blue_chip_ratios.csv')

# Restrict to 2014 onward, discard the intermediate count columns, and give
# `rank` a name that stays unambiguous after merging with other tables.
recruiting_2014_df = (
    recruiting_df
    .loc[recruiting_df['year'].ge(2014)]
    .drop(columns=['total', 'blue_sums', 'total_sums'])
    .rename(columns={'rank': 'recruiting_rank'})
)
recruiting_2014_df.head()

Unnamed: 0,team,year,recruiting_rank,points,blue_chip,blue_chip_ratio
654,Alabama,2014,1,319.71,20.0,0.755319
655,LSU,2014,2,299.29,16.0,0.659341
656,Ohio State,2014,3,296.08,16.0,0.673684
657,Florida State,2014,4,286.79,15.0,0.586957
658,Texas A&M,2014,5,278.08,12.0,0.45977


#### Returning Talent

In [5]:
# Load the returning-usage table. No year filter is applied here; judging by
# the file name it presumably already starts at 2014 (TODO confirm).
returning_df = pd.read_csv('data/returning_players_2014.csv')
returning_df.head(n=5)

Unnamed: 0,year,team,conference,passing_usage,rushing_usage,usages
0,2014,Air Force,Mountain West,0.751,0.652,0.684
1,2014,Akron,Mid-American,1.0,0.989,0.952
2,2014,Alabama,SEC,0.078,0.911,0.589
3,2014,Arizona,Pac-12,0.0,0.048,0.142
4,2014,Arizona State,Pac-12,0.99,0.562,0.713


#### ELO/FPI Ratings

In [6]:
# Load the team/conference ratings table (elo, fpi, conference_rating).
ratings_df = pd.read_csv('data/team_conference_ratings.csv')

# Keep the 2014-and-later seasons as an independent copy.
ratings_2014_df = ratings_df.loc[ratings_df['year'].ge(2014)].copy()
ratings_2014_df.head()

Unnamed: 0,team,year,conference,elo,fpi,conference_rating
978,Air Force,2014,Mountain West,1432.0,-2.584,-5.466667
979,Akron,2014,Mid-American,1221.0,-10.663,-9.507692
980,Alabama,2014,SEC,2130.0,26.766,17.957143
981,Appalachian State,2014,Sun Belt,1475.0,-5.62,-9.990909
982,Arizona,2014,Pac-12,1723.0,14.145,9.541667


### Combine Features
1. records_2014_df
2. recruiting_2014_df
3. returning_df
4. ratings_2014_df

In [16]:
# Join the four tables with (default) inner merges.
# Ratings are matched on conference as well as year/team; recruiting and
# returning-usage rows are matched on year/team only. In the last merge both
# sides carry a `conference` column that is not a join key, so pandas' default
# suffixes produce conference_x (left side) and conference_y (returning_df).
# NOTE(review): matching ratings on conference drops any team-season where the
# records and ratings tables disagree on conference -- confirm this is intended.
df = (
    records_2014_df
    .merge(ratings_2014_df, on=['year', 'team', 'conference'])
    .merge(recruiting_2014_df, on=['year', 'team'])
    .merge(returning_df, on=['year', 'team'])
)
df.head()

Unnamed: 0,team,team_id,year,conference_x,games_played,expected_wins,wins,losses,elo,fpi,conference_rating,recruiting_rank,points,blue_chip,blue_chip_ratio,conference_y,passing_usage,rushing_usage,usages
0,Air Force,2005,2014,Mountain West,13,9.7,10,3,1432.0,-2.584,-5.466667,112,97.78,0.0,0.0,Mountain West,0.751,0.652,0.684
1,Akron,2006,2014,Mid-American,12,5.4,5,7,1221.0,-10.663,-9.507692,113,97.33,0.0,0.0,Mid-American,1.0,0.989,0.952
2,Alabama,333,2014,SEC,14,11.7,12,2,2130.0,26.766,17.957143,1,319.71,20.0,0.755319,SEC,0.078,0.911,0.589
3,Arizona,12,2014,Pac-12,14,8.8,10,4,1723.0,14.145,9.541667,31,213.29,5.0,0.082474,Pac-12,0.0,0.048,0.142
4,Arizona State,9,2014,Pac-12,13,9.7,10,3,1775.0,14.153,9.541667,23,223.19,7.0,0.149254,Pac-12,0.99,0.562,0.713


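The two conference columns (`conference_x` and `conference_y`) disagree for some team-seasons, i.e. the source tables list different conferences for the same team in the same year. This needs investigating.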

In [10]:
# The merge with returning_df was keyed on year/team only, so its conference
# column survives as conference_y next to conference_x. Count how many rows
# have matching (True) versus differing (False) conference values.
df['conference_x'].eq(df['conference_y']).value_counts()

True     1181
False      23
Name: count, dtype: int64

In [19]:
# List the team-seasons whose two conference columns disagree.
df.loc[
    df['conference_x'].ne(df['conference_y']),
    ['team', 'year', 'conference_x', 'conference_y'],
]

Unnamed: 0,team,year,conference_x,conference_y
43,Louisville,2014,ACC,American Athletic
46,Maryland,2014,Big Ten,ACC
59,New Mexico State,2014,Sun Belt,FBS Independents
76,Rutgers,2014,Big Ten,American Athletic
106,Western Kentucky,2014,Conference USA,Sun Belt
173,Navy,2015,American Athletic,FBS Independents
345,UMass,2016,FBS Independents,Mid-American
558,New Mexico State,2018,FBS Independents,Sun Belt
1003,Marshall,2022,Sun Belt,Conference USA
1027,Old Dominion,2022,Sun Belt,Conference USA
