# More Data Collection

### Strength of Schedule Data
- https://www.teamrankings.com/college-football/ranking/schedule-strength-by-other

In [1]:
# Get general dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from bs4 import BeautifulSoup
import requests
import re

In [24]:
# Build one snapshot date per season (May 1st of every year) for the strength-of-schedule lookups
from datetime import date, timedelta

start_date = date(2005, 5, 1)
end_date = date(2024, 5, 1)

# Both bounds share the same month/day, so stepping the year from the first
# season through the last one (inclusive) yields every snapshot date.
date_list = [
    start_date.replace(year=season)
    for season in range(start_date.year, end_date.year + 1)
]

# ISO format is yyyy-mm-dd, which is what the url's date parameter uses
date_strings = [snapshot.isoformat() for snapshot in date_list]


In [25]:
# Build one request url per snapshot date by appending it to the `date=` query parameter
base_url = 'https://www.teamrankings.com/college-football/ranking/schedule-strength-by-other?date='
urls = [f'{base_url}{date_string}' for date_string in date_strings]
# Peek at the first two urls as a sanity check
urls[:2]

['https://www.teamrankings.com/college-football/ranking/schedule-strength-by-other?date=2005-05-01',
 'https://www.teamrankings.com/college-football/ranking/schedule-strength-by-other?date=2006-05-01']

In [26]:
# Scrape the first table on every snapshot page into one DataFrame with
# columns year / rank / team / rating (all kept as the raw page text).

# Browser-like User-Agent sent with every request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


def _clean_team_name(cell_text):
    """Return the team name from a team cell, without its trailing "(...)" group.

    Team cells are assumed to look like "Name (record)" -- TODO confirm against
    the live page. Only the LAST parenthesised group is removed, so a name that
    itself contains parentheses (e.g. "Miami (OH)", which exists in
    data/team_info.csv) is not truncated at its first "(". A cell without any
    parentheses is returned stripped instead of raising.
    """
    return re.sub(r'\s*\([^()]*\)\s*$', '', cell_text.strip()).strip()


# One dict per table row; building the frame once from complete records means
# the columns can never end up with different lengths.
records = []
for url in urls:
    # Fail loudly on network / HTTP problems instead of parsing an error page
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f'No <table> found at {url}')

    # The url ends in "date=yyyy-mm-dd"; the first four characters are the year
    year = url.rsplit('date=', 1)[-1][:4]

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Header rows (no <td>) and malformed rows carry no rank/team/rating
        if len(cells) < 3:
            continue
        records.append({
            'year': year,
            'rank': cells[0].text,
            'team': _clean_team_name(cells[1].text),
            'rating': cells[2].text,
        })

schedule_strengths_df = pd.DataFrame(records, columns=['year', 'rank', 'team', 'rating'])
schedule_strengths_df.head()

Unnamed: 0,year,rank,team,rating
0,2005,1,USC,8.4
1,2005,2,Miami,6.8
2,2005,3,Oregon St,6.7
3,2005,4,Oklahoma,6.4
4,2005,5,Arizona St,6.4


In [27]:
# Display the last five rows of the scraped table
schedule_strengths_df.tail()  

Unnamed: 0,year,rank,team,rating
2494,2024,129,Florida Intl,-11.3
2495,2024,130,N Illinois,-11.4
2496,2024,131,E Michigan,-13.3
2497,2024,132,Akron,-13.3
2498,2024,133,Kent St,-13.9


In [28]:
# Write the scraped table to disk without the index column; it is read back
# from this path in the name-cleaning section below.
schedule_strengths_df.to_csv('data/schedule_strengths.csv', index=False)

### Editing the Team Names with Fuzzy Matching

In [9]:
# Install fuzzy to do name matching
!pip install --q fuzzywuzzy

In [10]:
import pandas as pd
from fuzzywuzzy import process



In [2]:
# Reload the scraped strength-of-schedule table from disk and preview the first rows
schedule_strengths_df = pd.read_csv('data/schedule_strengths.csv')
schedule_strengths_df.head()

Unnamed: 0,year,rank,team,rating
0,2005,1,USC,8.4
1,2005,2,Miami,6.8
2,2005,3,Oregon St,6.7
3,2005,4,Oklahoma,6.4
4,2005,5,Arizona St,6.4


In [64]:
# Get list of FBS team names
team_info_df = pd.read_csv('data/team_info.csv')
team_info_df.team.unique()

array(['Air Force', 'Akron', 'Alabama', 'Appalachian State', 'Arizona',
       'Arizona State', 'Arkansas', 'Arkansas State', 'Army', 'Auburn',
       'Ball State', 'Baylor', 'Boise State', 'Boston College',
       'Bowling Green', 'Buffalo', 'BYU', 'California',
       'Central Michigan', 'Charlotte', 'Cincinnati', 'Clemson',
       'Coastal Carolina', 'Colorado', 'Colorado State', 'Connecticut',
       'Duke', 'East Carolina', 'Eastern Michigan', 'Florida',
       'Florida Atlantic', 'Florida International', 'Florida State',
       'Fresno State', 'Georgia', 'Georgia Southern', 'Georgia State',
       'Georgia Tech', "Hawai'i", 'Houston', 'Illinois', 'Indiana',
       'Iowa', 'Iowa State', 'Jacksonville State', 'James Madison',
       'Kansas', 'Kansas State', 'Kent State', 'Kentucky', 'Liberty',
       'Louisiana', 'Louisiana Monroe', 'Louisiana Tech', 'Louisville',
       'LSU', 'Marshall', 'Maryland', 'Memphis', 'Miami', 'Miami (OH)',
       'Michigan', 'Michigan State', 'Middle T

In [3]:
# Distinct team spellings as scraped, for comparison with the names in team_info_df
schedule_strengths_df['team'].unique()

array(['USC', 'Miami', 'Oregon St', 'Oklahoma', 'Arizona St',
       'California', 'VA Tech', 'Texas', 'Texas Tech', 'Texas A&M',
       'N Carolina', 'Iowa', 'Stanford', 'Louisville', 'Oklahoma St',
       'Virginia', 'Notre Dame', 'LSU', 'Ohio St', 'Michigan', 'Georgia',
       'Florida St', 'Auburn', 'Tennessee', 'GA Tech', 'UCLA', 'Arkansas',
       'Purdue', 'Florida', 'BYU', 'Northwestern', 'Colorado', 'Clemson',
       'Arizona', 'NC State', 'Kansas', 'Oregon', 'Boise St', 'Maryland',
       'Washington', 'Wisconsin', 'Wash State', 'Penn St', 'Kansas St',
       'Iowa St', 'S Carolina', 'New Mexico', 'Michigan St', 'Minnesota',
       'Alabama', 'Duke', 'Mississippi', 'Indiana', 'Syracuse',
       'Fresno St', 'Wake Forest', 'Kentucky', 'LA Tech', 'Utah',
       'Colorado St', 'Nebraska', 'Cincinnati', 'San Diego St',
       'Missouri', 'Illinois', 'TX El Paso', 'Wyoming', 'W Virginia',
       'Houston', 'Baylor', 'UNLV', 'Bowling Grn', 'Marshall',
       'S Mississippi', 'Bosto

There are many team names we should edit before doing any fuzzy matching. For example, we want "Florida St" to become "Florida State" and "Mississippi" to become "Ole Miss". Some of these have to be done by hand, because fuzzy matching would map "Florida St" to "Florida", for example.

Ultimately, it ended up being easier to just do the names by hand.

In [7]:
# Hand-written mapping from the scraped team spellings to the full names used
# for the rest of the analysis. Names that are not keys here are left unchanged.
replacements = {
    'Oregon St': 'Oregon State',
    'Arizona St': 'Arizona State',
    'Mississippi St': 'Mississippi State',
    'VA Tech': "Virginia Tech",
    'N Carolina': "North Carolina",
    'Oklahoma St': 'Oklahoma State',
    'Ohio St': 'Ohio State',
    'Florida St': "Florida State",
    'GA Tech': 'Georgia Tech',
    'Boise St': 'Boise State',
    # The scraped spelling shown by schedule_strengths_df['team'].unique() is
    # 'Wash State'; the older 'Wash St' key is kept so nothing that relied on
    # it changes.
    'Wash State': 'Washington State',
    'Wash St': 'Washington State',
    'Penn St': 'Penn State',
    'Kansas St': 'Kansas State',
    'Iowa St': 'Iowa State',
    'S Carolina': 'South Carolina',
    'Michigan St': 'Michigan State',
    'Mississippi': "Ole Miss",
    'Fresno St': 'Fresno State',
    'LA Tech': 'Louisiana Tech',
    'Colorado St': 'Colorado State',
    'San Diego St': 'San Diego State',
    'TX El Paso': 'UTEP',
    'W Virginia': 'West Virginia',
    'Bowling Grn': 'Bowling Green',
    'S Mississippi': 'Southern Mississippi',
    'Boston Col': "Boston College",
    'Miss State': "Mississippi State",
    'TX Christian': "TCU",
    'E Carolina': 'East Carolina',
    'S Methodist': "SMU",
    'S Florida': 'South Florida',
    'Utah St': 'Utah State',
    'San Jose St': 'San Jose State',
    'N Illinois': 'Northern Illinois',
    'N Mex State': "New Mexico State",
    'Arkansas St': 'Arkansas State',
    'Ball St': 'Ball State',
    'Kent St': 'Kent State',
    'W Michigan': 'Western Michigan',
    'Central Mich': "Central Michigan",
    'Middle Tenn': 'Middle Tennessee',
    'UL Monroe': 'Louisiana Monroe',
    'E Michigan': 'Eastern Michigan',
    'Fla Atlantic': 'Florida Atlantic',
    'Florida Intl': 'Florida International',
    'W Kentucky': 'Western Kentucky',
    'Texas St': 'Texas State',
    'U Mass': 'UMass',
    'S Alabama': 'South Alabama',
    'Georgia St': 'Georgia State',
    'GA Southern': 'Georgia Southern',
    'App St': 'Appalachian State',
    'Coastal Car': 'Coastal Carolina',
    'James Mad': 'James Madison',
    'Jksnville St': 'Jacksonville State',
    'Sam Hous St': 'Sam Houston State'
}

# Replace the values in the 'team' column.
# Series.replace with a dict matches whole cell values only, so e.g. the
# 'Mississippi' key does not touch 'Mississippi St'.
schedule_strengths_df['team'] = schedule_strengths_df['team'].replace(replacements)
schedule_strengths_df.head()

Unnamed: 0,year,rank,team,rating
0,2005,1,USC,8.4
1,2005,2,Miami,6.8
2,2005,3,Oregon State,6.7
3,2005,4,Oklahoma,6.4
4,2005,5,Arizona State,6.4


In [5]:
# Abandoned fuzzy-matching attempt, kept for reference only. Wrapping it in a
# triple-quoted string disables the code; because the string is the cell's last
# expression, its repr is echoed as the cell output.
'''
# Use fuzzy to match team names
def match_and_replace(bad_name, correct_names):
    # Get fuzzy matches and scores. Keep when beats threshold of 80
    match, score = process.extractOne(bad_name, correct_names)
    return match if score > 80 else bad_name

# Apply to the rows of our schedule strengths
schedule_strengths_df['team'] = schedule_strengths_df['team'].apply(lambda x: match_and_replace(x, team_info_df['team'].tolist()))

schedule_strengths_df['team']
'''

"\n# Use fuzzy to match team names\ndef match_and_replace(bad_name, correct_names):\n    # Get fuzzy matches and scores. Keep when beats threshold of 80\n    match, score = process.extractOne(bad_name, correct_names)\n    return match if score > 80 else bad_name\n\n# Apply to the rows of our schedule strengths\nschedule_strengths_df['team'] = schedule_strengths_df['team'].apply(lambda x: match_and_replace(x, team_info_df['team'].tolist()))\n\nschedule_strengths_df['team']\n"

In [8]:
# Save the table with corrected team names to its own file (index column omitted),
# leaving the originally scraped CSV untouched.
schedule_strengths_df.to_csv('data/schedule_strengths_fixed.csv', index=False)