# Data Collection

In [40]:
# Get general dependencies
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
# Import college football api
!pip install --q cfbd python-dotenv

In [2]:
# Import the College Football Data (CFBD) API client and the .env loader
import cfbd
from dotenv import dotenv_values

# Load key/value pairs from the local .env file so credentials stay out of the notebook
env_vars = dotenv_values('.env')

# Fill the client's 'Authorization' API key and key prefix from the .env values.
# NOTE(review): .get() presumably yields None when a variable is absent from .env,
# in which case requests would go out unauthenticated — confirm the file defines both.
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = env_vars.get('CFBD_API_KEY')
configuration.api_key_prefix['Authorization'] = env_vars.get('CFBD_API_KEY_PREFIX')

# API client built from the configuration; handed to the endpoint-specific API classes
api_config = cfbd.ApiClient(configuration)

In [8]:
# Create a teams API instance and a games API instance on top of the shared client
teams_api = cfbd.TeamsApi(api_config)
games_api = cfbd.GamesApi(api_config)
# Players API instance is currently disabled
#players_api = cfbd.PlayersApi(api_config)

In [12]:
# Get team info
def get_team_info(**kwargs):
  """Return the FBS teams reported by the supplied teams API.

  Keyword Args:
    api: object exposing ``get_fbs_teams()``.

  Returns:
    Whatever ``api.get_fbs_teams()`` returns, unmodified.

  Raises:
    KeyError: if ``api`` is not passed.
  """
  teams_client = kwargs['api']
  return teams_client.get_fbs_teams()

# Parse the team objects into a dataframe
def team_info_to_df(teams):
  """Flatten team objects into a DataFrame with one row per team.

  Teams whose ``id`` is None are skipped. A team without a location gets a
  missing ``stadium_capacity``; a team without logos gets a missing ``logo``
  (otherwise the first logo is used).

  Args:
    teams: iterable of objects exposing ``school``, ``abbreviation``, ``id``,
      ``conference``, ``location`` (with ``capacity``) and ``logos``.

  Returns:
    pandas.DataFrame with columns team, abbreviation, team_id, conference,
    stadium_capacity and logo. The columns are present even when no team
    qualifies, so downstream code and the CSV header stay stable.
  """
  columns = ['team', 'abbreviation', 'team_id', 'conference', 'stadium_capacity', 'logo']
  rows = []
  for t in teams:
    if t.id is None:
      continue
    location = t.location
    logos = t.logos
    rows.append({
      'team': t.school,
      'abbreviation': t.abbreviation,
      'team_id': t.id,
      'conference': t.conference,
      # location may be absent for a team; record a missing capacity instead of crashing
      'stadium_capacity': location.capacity if location is not None else None,
      # logos may be None or an empty list
      'logo': logos[0] if logos else None,
    })
  return pd.DataFrame(rows, columns=columns)

In [14]:
# Fetch the FBS teams through the teams API, flatten them into a DataFrame,
# and preview the first rows
team_info_df = team_info_to_df(
    get_team_info(api=teams_api)
)
team_info_df.head()

Unnamed: 0,team,abbreviation,team_id,conference,stadium_capacity,logo
0,Air Force,AFA,2005,Mountain West,46692.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2005...
1,Akron,AKR,2006,Mid-American,30000.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2006...
2,Alabama,ALA,333,SEC,101821.0,http://a.espncdn.com/i/teamlogos/ncaa/500/333.png
3,Appalachian State,APP,2026,Sun Belt,30000.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2026...
4,Arizona,ARIZ,12,Pac-12,50782.0,http://a.espncdn.com/i/teamlogos/ncaa/500/12.png


In [42]:
# Save the team info table to CSV without the DataFrame index
team_info_df.to_csv('data/team_info.csv', index=False)

In [6]:
# Get team records
def get_records(**kwargs):
  """Collect team records for every (year, team) combination.

  Keyword Args:
    api: object exposing ``get_team_records(year=..., team=...)`` that returns
      a list of record objects.
    years: iterable of seasons. Each value is coerced with ``int()`` because
      the CFBD client does not handle numpy integer types.
    teams: iterable of team names queried for each year.
    delay: optional pause in seconds after each year's batch of requests
      (default 1).

  Returns:
    A flat list of every record returned, ordered by year and then by team.

  Raises:
    KeyError: if ``api``, ``years`` or ``teams`` is not passed.
  """
  api = kwargs['api']
  # Materialise once so a one-shot iterable is still available for every year
  teams = list(kwargs['teams'])
  delay = kwargs.get('delay', 1)

  records = []
  for year in kwargs['years']:
    year = int(year)  # plain Python int, even when the caller used np.arange
    for team in teams:
      records.extend(api.get_team_records(year=year, team=team))
    # Pause between years to go easy on the API
    time.sleep(delay)
  return records

# Parse the team records into a dataframe
def records_to_df(records):
  """Flatten team record objects into a DataFrame with one row per record.

  Records whose ``team`` is None are skipped.

  Args:
    records: iterable of objects exposing ``team``, ``team_id``, ``year``,
      ``conference``, ``expected_wins`` and the nested ``total``,
      ``home_games`` and ``away_games`` tallies.

  Returns:
    pandas.DataFrame built from the flattened rows.
  """
  rows = []
  for rec in records:
    if rec.team is None:
      continue
    overall, home, away = rec.total, rec.home_games, rec.away_games
    rows.append({
      'team': rec.team,
      'team_id': rec.team_id,
      'year': rec.year,
      'conference': rec.conference,
      'games_played': overall.games,
      'expected_wins': rec.expected_wins,
      'wins': overall.wins,
      'losses': overall.losses,
      'home_wins': home.wins,
      'home_losses': home.losses,
      'away_wins': away.wins,
      'away_losses': away.losses,
    })
  return pd.DataFrame(rows)

Note: The CFBD client doesn't handle `numpy.int64` values. Build the list of years with `range()` rather than `list(np.arange())`, because `range()` yields regular Python ints while the elements of an `np.arange()` list are `numpy.int64`.

In [38]:
# Show the element type of a list built from np.arange
type(list(np.arange(2000,2002))[0])

numpy.int64

In [39]:
# Seasons 2000 through 2023 inclusive; range() yields plain Python ints
years = range(2000, 2024)
# Query every team listed in the team info table
teams = team_info_df['team'].tolist()

records_df = records_to_df(get_records(api=games_api, years=years, teams=teams))
records_df.head()

Unnamed: 0,team,team_id,year,conference,games_played,expected_wins,wins,losses,home_wins,home_losses,away_wins,away_losses
0,Air Force,2005,2000,Mountain West,12,0.0,9,3,5,1,3,2
1,Akron,2006,2000,Mid-American,11,0.0,6,5,3,3,3,2
2,Alabama,333,2000,SEC,11,0.0,3,8,3,3,0,5
3,Arizona,12,2000,Pac-10,11,0.0,5,6,2,4,3,2
4,Arizona State,9,2000,Pac-10,12,0.0,6,6,3,3,3,2


In [41]:
# Save the per-year team records to CSV without the DataFrame index
records_df.to_csv('data/team_records_by_year.csv', index=False)

In [27]:
# Inspect the raw API response for a single team and season
games_api.get_team_records(year=2023, team='South Carolina')

[{'away_games': {'games': 4, 'losses': 4, 'ties': 0, 'wins': 0},
  'conference': 'SEC',
  'conference_games': {'games': 8, 'losses': 5, 'ties': 0, 'wins': 3},
  'division': 'East',
  'expected_wins': 5.6,
  'home_games': {'games': 7, 'losses': 2, 'ties': 0, 'wins': 5},
  'team': 'South Carolina',
  'team_id': 2579,
  'total': {'games': 12, 'losses': 7, 'ties': 0, 'wins': 5},
  'year': 2023}]

## On3 NIL Valuations
Background on On3's NIL valuation and per-post value: https://www.on3.com/nil/news/about-on3-nil-valuation-per-post-value/

In [95]:
# Function to format follower counts / dollar valuations into integers
def values_to_int(string):
    """Convert an abbreviated count or dollar amount to an integer.

    Accepts an optional leading '$', thousands separators, surrounding
    whitespace, and an optional trailing 'K' (thousand) or 'M' (million)
    suffix in either case, e.g. '$4.6M' -> 4600000 and '301K' -> 301000.
    Values without a suffix are truncated toward zero ('12.9' -> 12).

    Args:
        string: text such as '2.5M', '$1.9M', '301K' or '950'.

    Returns:
        int: the expanded value.

    Raises:
        ValueError: if the text is empty or is not a number.
    """
    # Local import keeps this cell self-contained
    from decimal import Decimal, InvalidOperation

    multipliers = {'K': 1_000, 'M': 1_000_000}

    text = string.strip().replace(',', '')
    if text.startswith('$'):
        text = text[1:]

    factor = 1
    if text and text[-1].upper() in multipliers:
        factor = multipliers[text[-1].upper()]
        text = text[:-1]

    try:
        # Decimal keeps the scaling exact; binary floats can land just below
        # the intended integer, and int() would then truncate it
        return int(Decimal(text) * factor)
    except InvalidOperation:
        raise ValueError(f"could not parse a number from {string!r}") from None

In [128]:
# Download the On3 college football NIL rankings page and collect one element per ranked player.
# (The overall NIL 100 ranking lives at https://www.on3.com/nil/rankings/player/nil-100/)
url = 'https://www.on3.com/nil/rankings/player/college/football/'
# A timeout stops the cell from hanging indefinitely if the server stalls
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an empty table
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('div', {'class': 'NilPlayerRankingItem_itemContainer___Uo0_'})

In [164]:
# Column-wise accumulators: one entry per player row in the scraped table
names, ranks, schools, positions, followers, valuations = [], [], [], [], [], []
for row in table:

    names.append(row.find('div', {'class': 'NilPlayerRankingItem_nameYearContainer__kVMqH'}).text)

    ranks.append(row.find('span', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_playerRank__NQmBq css-z52hnt"}).text)

    # The school name is read from the title of the image inside the status item;
    # a row without a status item, image, or title gets None
    school = row.find('div', {'class': 'NilPlayerRankingItem_statusItem__gikz_'})
    school_logo = school.find('img') if school is not None else None
    schools.append(school_logo.get('title') if school_logo is not None else None)

    positions.append(row.find('span', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_position__WIvtI css-z52hnt"}).text)

    followers.append(values_to_int(row.find('p', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_followersNumber__xG05J css-z52hnt"}).text))

    # A row may lack the valuation container or the valuation itself; record None in both cases
    valuation_container = row.find('div', {'class': 'NilPlayerRankingItem_valuationContainer__nV9Sj'})
    valuation = None
    if valuation_container is not None:
        valuation = valuation_container.find('p',{'class': 'MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_valuationCurrency___Pa_U css-z52hnt'})
    if valuation is not None:
        valuations.append(values_to_int(valuation.text))
    else:
        valuations.append(None)

# NOTE(review): the 'follwers' column name is misspelled, but it is written to
# data/nil_data.csv; rename it only together with every consumer of that file.
nil_df = pd.DataFrame({'name': names, 'rank': ranks, 'school': schools,
                       'position': positions, 'follwers': followers, 'valuation': valuations})
nil_df.head()


Unnamed: 0,name,rank,school,position,follwers,valuation
0,Shedeur Sanders,1,colorado buffaloes,QB,2500000,4600000.0
1,Travis Hunter,2,colorado buffaloes,CB,2300000,2700000.0
2,Arch Manning,3,texas longhorns,QB,301000,2400000.0
3,Quinn Ewers,4,texas longhorns,QB,265000,1900000.0
4,Jalen Milroe,5,alabama crimson tide,QB,223000,1600000.0


In [167]:
# Save the scraped NIL rankings to CSV without the DataFrame index
nil_df.to_csv('data/nil_data.csv', index=False)