# NCAA Website 

*Webscraping and finding out what I need...*

I used the library BeautifulSoup to programatically get the results of each game in the 2019 college season. I used the <a href="https://www.ncaa.com/scoreboard/football/fbs/2019/01">NCAA's website</a> to find HTML tags holding the winner and loser's score and rank for each game, which I write into a DataFrame. I use the same process to collect data on which schools belong to which conference from <a href="https://www.ncaa.com/standings/football/fbs">this page</a>.

In [1]:
# Import libraries
import os
import logging
import simplejson
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import cssutils

from datetime import datetime

import pandas as pd
import numpy as np

import networkx as nx

cssutils.log.setLevel(logging.CRITICAL)

%load_ext nb_black

<IPython.core.display.Javascript object>

## Collect Data on Games

### 1 Sample

#### Scoreboard (Home) Page

In [2]:
# Read page
url = "https://www.ncaa.com/scoreboard/football/fbs/2019/01/all-conf"
response = requests.get(url)
soup = BeautifulSoup(response.content)

<IPython.core.display.Javascript object>

In [3]:
# Get date sections
date_sections = soup.find_all("div", attrs={"class": "gamePod_content-division"})

# Get 1 date section
date_section = date_sections[0]

# Get date from date section
date = date_section.find_all("h6")[0].contents[0]
print(date)

# Get games within date section
games = date_section.find_all(
    "div", attrs={"class": "gamePod gamePod-type-game status-final"}
)

# Get 1 game from games
game = games[0]

Saturday, August 24, 2019


<IPython.core.display.Javascript object>

In [4]:
# Get results
winner = game.find("li", attrs={"class": "winner"})
loser = game.find("li", attrs={"class": ""})

# Get states for winner and loser
sides = {"winner": winner, "loser": loser}
for side_name in sides:
    side = sides.get(side_name)
    team = side.find("span", attrs={"class": "gamePod-game-team-name"}).contents[0]
    score_pre = side.find("span", attrs={"class": "gamePod-game-team-score"}).contents
    score = score_pre[0] if len(score_pre) > 0 else np.nan
    rank_pre = side.find("span", attrs={"class": "gamePod-game-team-rank"}).contents
    rank = rank_pre[0] if len(rank_pre) > 0 else np.nan
    print(side_name, team, score, rank)

winner Florida 24 8
loser Miami (FL) 20 nan


<IPython.core.display.Javascript object>

In [5]:
# Get image of school logo (winner)
# This can be saved in some image folder
image_link = winner.find("img", attrs={"class": "gamePod-game-team-logo"})["src"]

# urlretrieve(image_link, "test.svg")
print(image_link)

https://i.turner.ncaa.com/sites/default/files/images/logos/schools/bgl/florida.svg


<IPython.core.display.Javascript object>

In [6]:
# Find link to dive into a specific game
link = "https://www.ncaa.com" + game.find("a", attrs={"class": "gamePod-link"})["href"]
link

'https://www.ncaa.com/game/3785117'

<IPython.core.display.Javascript object>

#### Game (Deep-Dive) Page

In [7]:
# Read page
url = link
response = requests.get(url)
soup = BeautifulSoup(response.content)

<IPython.core.display.Javascript object>

In [8]:
# Get top summary banner
banner = soup.find("div", attrs={"class", "gamecenter-game-banner"})

# See which team is winner (away or home)
winner_list = banner.select("span.winner")[0]
winner_team = "away" if "away" in winner_list["class"] else "home"
winner_score = int(winner_list.contents[0].replace("\n", "").strip())
print(winner_team)
print(winner_score)

# Get css
css = soup.select("style", type="text/css")

home
24


<IPython.core.display.Javascript object>

In [9]:
# Parse css for colors
selectors = {}
for styletag in soup.select("style"):
    css = cssutils.parseString(styletag.encode_contents())
    for rule in css:
        if rule.type == rule.STYLE_RULE:
            style = rule.selectorText
            selectors[style] = {}
            for item in rule.style:
                propertyname = item.name
                value = item.value
                selectors[style][propertyname] = value


selectors
print(selectors.get(".homeTeam-fill-primary_color").get("fill", "#fff"))
print(selectors.get(".homeTeam-fill-secondary_color").get("fill", "#fff"))
print(selectors.get(".awayTeam-fill-primary_color").get("fill", "#fff"))
print(selectors.get(".awayTeam-fill-secondary_color").get("fill", "#fff"))

#003087
#00266c
#154734
#103829


<IPython.core.display.Javascript object>

In [18]:
# Get images
soup.find("div", attrs={"class", "away"}).img["src"]
# soup.find("div", attrs={"class", "home"}).img

'//i.turner.ncaa.com/sites/default/files/images/logos/schools/bgd/miami-fl.svg'

<IPython.core.display.Javascript object>

We can build the following summary from the NCAA website:

**Source: Scoreboard**
* Week Number
* Date
* Winner Team
* Winner Score
* Winner Rank
* Winner logo
* Loser Team
* Loser Score
* Loser Rank
* Loser logo

**Source: Game**
* Winner primary color
* Winner secondary color
* Loser primary color
* Loser secondary color
* Away team
* Home team


#### Navigation Bar

In [10]:
# Read page
url = "https://www.ncaa.com/scoreboard/football/fbs/2019/01/all-conf"
response = requests.get(url)
soup = BeautifulSoup(response.content)

# Get navigation bar
navbar = soup.find_all("div", attrs={"class": ["scoreboardDateNav-date", "hasGames"]})

# Get 1 week
week = navbar[0]

<IPython.core.display.Javascript object>

In [11]:
week_link = "https://www.ncaa.com" + week.select("a")[0]["href"]
print(week_link)

week_num = week.find("span", attrs={"class": "scoreboardDateNav-dayNumber"}).contents[0]
print(week_num)

https://www.ncaa.com/scoreboard/football/fbs/2019/01
1


<IPython.core.display.Javascript object>

### Automate All Samples

In [22]:
# Set parameter
year = 2019
folder_name = "ncaa_team_logo"

<IPython.core.display.Javascript object>

In [23]:
# Set variables
df = []

url_home = f"https://www.ncaa.com/scoreboard/football/fbs/{year}/01/all-conf"

# Create folder to store images
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

<IPython.core.display.Javascript object>

In [31]:
##################
# Navigation Bar #
##################

response = requests.get(url_home)
soup = BeautifulSoup(response.content)

# Get navigation bar
navbar = soup.find_all("div", attrs={"class": ["scoreboardDateNav-date", "hasGames"]})
weeks = {}

for index, week in enumerate(navbar):
    original_link = "https://www.ncaa.com" + week.select("a")[0]["href"]
    # Not sure why, but BeautifulSoup isn't parsing as expected
    if index == len(navbar) - 1:
        tail = "P"
    else:
        tail = ("0" + str(index + 1))[-2:]
    # Add {week_number : link} to dictionary
    weeks[index + 1] = (
        original_link[: -len(original_link.split("/")[-1])] + tail + "/all-conf"
    )

###################
# Scoreboard Home #
###################

for week_num in weeks:

    # Read page
    url = weeks[week_num]
    response = requests.get(url)
    soup = BeautifulSoup(response.content)

    # Get date sections
    date_sections = soup.find_all("div", attrs={"class": "gamePod_content-division"})

    for date_section in date_sections:

        # Get date from date section
        date = date_section.find_all("h6")[0].contents[0]

        # Get games within date section
        games = date_section.find_all(
            "div", attrs={"class": "gamePod gamePod-type-game status-final"}
        )

        for game in games:

            # Get results
            winner = game.find("li", attrs={"class": "winner"})
            loser = game.find("li", attrs={"class": ""})

            # Get states for winner and loser
            sides = {"winner": winner, "loser": loser}
            stats = {}
            for side_name in sides:

                side = sides.get(side_name)

                team = side.find(
                    "span", attrs={"class": "gamePod-game-team-name"}
                ).contents[0]
                score_pre = side.find(
                    "span", attrs={"class": "gamePod-game-team-score"}
                ).contents
                score = score_pre[0] if len(score_pre) > 0 else np.nan
                rank_pre = side.find(
                    "span", attrs={"class": "gamePod-game-team-rank"}
                ).contents
                rank = rank_pre[0] if len(rank_pre) > 0 else np.nan

                stats[side_name] = {
                    "team": team,
                    "score": score,
                    "rank": rank,
                }

            #                 ##############
            #                 # Save Image #
            #                 ##############

            #                 if not os.path.exists(f"{folder_name}/{team}.svg"):
            #                     image_link = side.find(
            #                         "img", attrs={"class": "gamePod-game-team-logo"}
            #                     )["src"]
            #                     urlretrieve(image_link, f"{folder_name}/{team}.svg")

            ##############
            # Get Colors #
            ##############

            # Find link to dive into a specific game
            link = (
                "https://www.ncaa.com"
                + game.find("a", attrs={"class": "gamePod-link"})["href"]
            )

            # Read page
            url = link
            response = requests.get(url)
            soup = BeautifulSoup(response.content)

            # Get top summary banner
            banner = soup.find("div", attrs={"class", "gamecenter-game-banner"})

            # See which team is winner (away or home)
            winner_list = banner.select("span.winner")[0]
            winner_team = "away" if "away" in winner_list["class"] else "home"
            loser_team = "home" if "away" in winner_list["class"] else "away"
            #             winner_score = int(winner_list.contents[0].replace("\n", "").strip())

            # Get css
            css = soup.select("style", type="text/css")

            # Parse css for colors
            selectors = {}
            for styletag in soup.select("style"):
                css = cssutils.parseString(styletag.encode_contents())
                for rule in css:
                    if rule.type == rule.STYLE_RULE:
                        style = rule.selectorText
                        selectors[style] = {}
                        for item in rule.style:
                            propertyname = item.name
                            value = item.value
                            selectors[style][propertyname] = value

            home_pc = selectors.get(".homeTeam-fill-primary_color").get("fill", "#fff")
            home_sc = selectors.get(".homeTeam-fill-secondary_color").get(
                "fill", "#fff"
            )
            away_pc = selectors.get(".awayTeam-fill-primary_color").get("fill", "#fff")
            away_sc = selectors.get(".awayTeam-fill-secondary_color").get(
                "fill", "#fff"
            )

            ##############
            # Save Image #
            ##############

            sides = {
                "home": stats.get("winner").get("team")
                if winner_team == "home"
                else stats.get("loser").get("team"),
                "away": stats.get("winner").get("team")
                if winner_team == "away"
                else stats.get("loser").get("team"),
            }

            for side in sides:
                team = sides.get(side)
                if not os.path.exists(f"{folder_name}/{team}.svg"):
                    image_link = (
                        f"https:{banner.find('div', attrs={'class': side}).img['src']}"
                    )
                    urlretrieve(image_link, f"{folder_name}/{team}.svg")

            # Add to list
            df.append(
                {
                    "week": week_num,
                    "date": date,
                    "winner_team": stats.get("winner").get("team"),
                    "winner_score": stats.get("winner").get("score"),
                    "winner_rank": stats.get("winner").get("rank"),
                    "winner_side": winner_team,
                    "winner_primary_color": home_pc
                    if winner_team == "home"
                    else away_pc,
                    "winner_secondary_color": home_sc
                    if winner_team == "home"
                    else away_sc,
                    "loser_team": stats.get("loser").get("team"),
                    "loser_score": stats.get("loser").get("score"),
                    "loser_rank": stats.get("loser").get("rank"),
                    "loser_side": loser_team,
                    "loser_primary_color": away_pc
                    if winner_team == "home"
                    else home_pc,
                    "loser_secondary_color": away_sc
                    if winner_team == "home"
                    else home_sc,
                }
            )

df = pd.DataFrame(df)
df.head()

Unnamed: 0,week,date,winner_team,winner_score,winner_rank,winner_side,winner_primary_color,winner_secondary_color,loser_team,loser_score,loser_rank,loser_side,loser_primary_color,loser_secondary_color
0,1,"Saturday, August 24, 2019",Florida,24,8.0,home,#003087,#00266c,Miami (FL),20,,away,#154734,#103829
1,1,"Saturday, August 24, 2019",Hawaii,45,,home,#154734,#103829,Arizona,38,,away,#00205B,#001948
2,1,"Thursday, August 29, 2019",Buffalo,38,,home,#0057B8,#004593,Robert Morris,10,,away,#041E42,#031834
3,1,"Thursday, August 29, 2019",Cincinnati,24,,home,#C8102E,#a00c24,UCLA,14,,away,#0072CE,#005ba4
4,1,"Thursday, August 29, 2019",Bowling Green,46,,home,#FE5000,#cb4000,Morgan St.,3,,away,#FC4C02,#c93c01


<IPython.core.display.Javascript object>

In [32]:
df.to_csv("ncaa_data.csv", index=False)

<IPython.core.display.Javascript object>

## Collect Data on Conferences

In [33]:
# Read page content
url = "https://www.ncaa.com/standings/football/fbs"
response = requests.get(url)
soup = BeautifulSoup(response.content)

# Get list of conferences
conferences = soup.find_all("figure", attrs={"class": "standings-conference"})
all_teams = soup.find_all("div", attrs={"class": "table-wrap"})
conference_df = []

for i in range(len(conferences)):

    # Get name of conference
    conference = conferences[i].contents[1]

    # Get all teams
    teams = all_teams[i].find_all("td", attrs={"class": "standings-team"})
    for j in range(len(teams)):
        team = teams[j].contents[0]
        conference_df.append(
            {"conference": conference, "team": team,}
        )

conference_df = pd.DataFrame(conference_df)
conference_df.head(2)

Unnamed: 0,conference,team
0,Atlantic Coast,Notre Dame
1,Atlantic Coast,Clemson


<IPython.core.display.Javascript object>

In [34]:
conference_df.to_csv("ncaa_data_conference.csv", index=False)

<IPython.core.display.Javascript object>

## Combine datasets

In [2]:
# Reload datasets
games_df = pd.read_csv("ncaa_data.csv")
conferences_df = pd.read_csv("ncaa_data_conference.csv")

<IPython.core.display.Javascript object>

In [5]:
# See that there are many teams that are not matched due to inconsistent names
unmatched_teams = conferences_df.merge(
    games_df, how="left", left_on="team", right_on="winner_team"
)[
    conferences_df.merge(
        games_df, how="left", left_on="team", right_on="winner_team"
    ).isnull()["winner_team"]
][
    ["conference", "team"]
]

unmatched_teams

Unnamed: 0,conference,team
36,Atlantic Coast,North Carolina State
85,Atlantic Coast,Florida State
175,American Athletic Conference,South Florida
176,Big 12,Iowa State
190,Big 12,Oklahoma State
208,Big 12,Kansas State
235,Big Ten,Ohio State
236,Big Ten,Penn State
251,Big Ten,Michigan State
309,Conference USA,Florida Atlantic


<IPython.core.display.Javascript object>

In [10]:
# Clean names
def replace_all(text):
    text = text.replace("St.", "State")
    text = text.replace("Mich.", "Michigan")
    text = text.replace("Tenn.", "Tennessee")
    text = text.replace("Ky.", "Kentucky")
    text = text.replace("Ga.", "Georgia")
    text = text.replace("Ill.", "Illinois")
    text = text.replace("NC", "North Carolina")
    text = text.replace("South Fla.", "South Florida")
    text = text.replace("Fla. Atlantic", "Florida Atlantic")
    text = text.replace("Miss.", "Miss")
    if (
        text == "Louisiana"
    ):  # If text.replace() is used then "Lousiana Tech" will be unintentionally changed
        text = "Louisiana-Lafayette"
    text = text.replace("La.-Monroe", "Louisiana-Monroe")
    text = text.replace("BYU", "Brigham Young")
    text = text.replace("Army West Point", "Army")
    text = text.replace("(Fla.)", "(FL)").replace("(Ohio)", "(OH)")
    text = text.replace("FIU", "Florida International")
    text = text.replace("UConn", "Connecticut")
    text = text.replace("Southern California", "USC")
    return text.strip()


# Replace names in original table
games_df["winner_team_clean"] = games_df["winner_team"].apply(lambda x: replace_all(x))
games_df["loser_team_clean"] = games_df["loser_team"].apply(lambda x: replace_all(x))

# Check that there are many teams that are not matched due to inconsistent names
unmatched_teams1 = conferences_df.merge(
    games_df, how="left", left_on="team", right_on="winner_team_clean"
)[
    conferences_df.merge(
        games_df, how="left", left_on="team", right_on="winner_team_clean"
    ).isnull()["winner_team_clean"]
][
    ["conference", "team"]
]

unmatched_teams2 = conferences_df.merge(
    games_df, how="left", left_on="team", right_on="loser_team_clean"
)[
    conferences_df.merge(
        games_df, how="left", left_on="team", right_on="loser_team_clean"
    ).isnull()["loser_team_clean"]
][
    ["conference", "team"]
]

pd.merge(unmatched_teams1, unmatched_teams2)

Unnamed: 0,conference,team


<IPython.core.display.Javascript object>

In [11]:
# Join all columns
df = games_df.merge(
    conferences_df,
    how="left",
    left_on="winner_team_clean",
    right_on="team",
    suffixes=("", "_winner"),
).merge(
    right=conferences_df,
    how="left",
    left_on="loser_team_clean",
    right_on="team",
    suffixes=("", "_loser"),
)

# Drop unnecessary columns
df.drop(["team", "team_loser"], axis=1, inplace=True)

# Rename columns
df.rename(
    columns={"conference": "winner_conference", "conference_loser": "loser_conference"},
    inplace=True,
)

df.head()

Unnamed: 0,week,date,winner_team,winner_score,winner_rank,winner_side,winner_primary_color,winner_secondary_color,loser_team,loser_score,loser_rank,loser_side,loser_primary_color,loser_secondary_color,winner_team_clean,loser_team_clean,winner_conference,loser_conference
0,1,"Saturday, August 24, 2019",Florida,24,8.0,home,#003087,#00266c,Miami (FL),20,,away,#154734,#103829,Florida,Miami (FL),Southeastern,Atlantic Coast
1,1,"Saturday, August 24, 2019",Hawaii,45,,home,#154734,#103829,Arizona,38,,away,#00205B,#001948,Hawaii,Arizona,Mountain West,Pac-12
2,1,"Thursday, August 29, 2019",Buffalo,38,,home,#0057B8,#004593,Robert Morris,10,,away,#041E42,#031834,Buffalo,Robert Morris,Mid-American,
3,1,"Thursday, August 29, 2019",Cincinnati,24,,home,#C8102E,#a00c24,UCLA,14,,away,#0072CE,#005ba4,Cincinnati,UCLA,American Athletic Conference,Pac-12
4,1,"Thursday, August 29, 2019",Bowling Green,46,,home,#FE5000,#cb4000,Morgan St.,3,,away,#FC4C02,#c93c01,Bowling Green,Morgan State,Mid-American,


<IPython.core.display.Javascript object>

In [12]:
df.to_csv("ncaa_data_transformed.csv", index=False)

<IPython.core.display.Javascript object>

## Explore Networkx

Determine placement of nodes (teams) by placing teams that have played each other closer together. Detect whether each conference has a full cycle.

In [21]:
# Reload datasets
df = pd.read_csv("ncaa_data_transformed.csv")
conferences_df = pd.read_csv("ncaa_data_conference.csv")

<IPython.core.display.Javascript object>

In [22]:
# Add index for each team belonging to a conference

df["winner_conference_index"] = np.zeros(df.shape[0])
df["loser_conference_index"] = np.zeros(df.shape[0])

conference_counts = conferences_df.groupby("conference").count()["team"]
conferences = list(df[~df.winner_conference.isna()].winner_conference.unique())

for conference in conferences:

    G = nx.from_pandas_edgelist(
        df[(df.winner_conference == conference) & (df.loser_conference == conference)],
        source="loser_team_clean",
        target="winner_team_clean",
        create_using=nx.DiGraph(),
    )

    cycles = list(nx.simple_cycles(G))

    # If no cycles are found
    if len(cycles) == 0:
        
        # Create empty dict
        mapping = {}
        
    # Else if there are cycles, then get the biggest cycle first
    else:
        
        # Add longest cycle to mapping
        cycles.sort(key=len)
        cycle = cycles[-1]
        mapping = {name: index for index, name in enumerate(cycle)}
        
        # Remove nodes we used
        G.remove_nodes_from(cycle)
    
    # Clean up remaining based on longest path
    while G.number_of_nodes() > 0:
        
        # Add longest path to mapping
        increment = len(mapping)
        path = nx.dag_longest_path(G)
        mapping.update({name: index + increment for index, name in enumerate(path)})
        
        # Remove nodes we used
        G.remove_nodes_from(path)
    
    # Update index
    df.loc[df["winner_team_clean"].isin(mapping.keys()), "winner_conference_index"] = df[df["winner_team_clean"].isin(mapping.keys())]["winner_team_clean"].apply(lambda x: mapping.get(x, 0))
    df.loc[df["loser_team_clean"].isin(mapping.keys()), "loser_conference_index"] = df[df["loser_team_clean"].isin(mapping.keys())]["loser_team_clean"].apply(lambda x: mapping.get(x, 0))


<IPython.core.display.Javascript object>

In [54]:
# Final cleaning

# Clean up date
df["date_mdy"] = df.date.apply(
    lambda x: datetime.strptime(x, "%A, %B %d, %Y").strftime("%m-%d-%y")
)

# Change NaN to "Other" for conference
df["winner_conference"].fillna("Other", inplace=True)
df["loser_conference"].fillna("Other", inplace=True)

# Add conference size
conferences = conferences_df.groupby("conference", as_index=False).count()
conferences = conferences.append({"conference": "Other", "team": 0}, ignore_index=True)
df["winner_conference_size"] == df["winner_conference"].apply(
    lambda x: conferences.loc[x].team
)
df["loser_conference_size"] == df["loser_conference"].apply(
    lambda x: conferences.loc[x].team, axis=1
)
# df["winner_conference_size"] = pd.merge(
#     df,
#     conferences_df.groupby("conference", as_index=False).count(),
#     left_on="winner_conference",
#     right_on="conference",
# )["team"]
# df["loser_conference_size"] = pd.merge(
#     df,
#     conferences_df.groupby("conference", as_index=False).count(),
#     left_on="loser_conference",
#     right_on="conference",
# )["team"]

TypeError: <lambda>() got an unexpected keyword argument 'axis'

<IPython.core.display.Javascript object>

In [None]:
pd.merge()

In [47]:
conferences_df.groupby("conference", as_index=False).count()

Unnamed: 0,conference,team
0,American Athletic Conference,11
1,Atlantic Coast,15
2,Big 12,10
3,Big Ten,14
4,Conference USA,13
5,Independents(FBS),4
6,Mid-American,12
7,Mountain West,12
8,Pac-12,12
9,Southeastern,14


<IPython.core.display.Javascript object>

AttributeError: 'list' object has no attribute 'loc'

<IPython.core.display.Javascript object>

In [45]:
df[df.loser_conference == "Pac-12"]

Unnamed: 0,week,date,winner_team,winner_score,winner_rank,winner_side,winner_primary_color,winner_secondary_color,loser_team,loser_score,...,loser_secondary_color,winner_team_clean,loser_team_clean,winner_conference,loser_conference,winner_conference_index,loser_conference_index,date_mdy,winner_conference_size,loser_conference_size
1,1,"Saturday, August 24, 2019",Hawaii,45,,home,#154734,#103829,Arizona,38,...,#001948,Hawaii,Arizona,Mountain West,Pac-12,5.0,7.0,08-24-19,14.0,15.0
3,1,"Thursday, August 29, 2019",Cincinnati,24,,home,#C8102E,#a00c24,UCLA,14,...,#005ba4,Cincinnati,UCLA,American Athletic Conference,Pac-12,10.0,9.0,08-29-19,14.0,15.0
25,1,"Friday, August 30, 2019",Oklahoma St.,52,,away,#E35205,#b54104,Oregon St.,36,...,#b03604,Oklahoma State,Oregon State,Big 12,Pac-12,9.0,10.0,08-30-19,14.0,15.0
73,1,"Saturday, August 31, 2019",Auburn,27,16.0,away,#0C2340,#091c33,Oregon,21,...,#12322a,Auburn,Oregon,Southeastern,Pac-12,8.0,4.0,08-31-19,14.0,15.0
121,2,"Saturday, September 07, 2019",San Diego St.,23,,away,#A6192E,#841424,UCLA,14,...,#005ba4,San Diego State,UCLA,Mountain West,Pac-12,4.0,9.0,09-07-19,12.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,15,"Friday, December 06, 2019",Oregon,37,13.0,home,#173F35,#12322a,Utah,15,...,#940925,Oregon,Utah,Pac-12,Pac-12,4.0,0.0,12-06-19,10.0,
851,17,"Friday, December 18, 2020",Oregon,31,,away,#173F35,#12322a,Southern California,24,...,#7d1b2a,Oregon,USC,Pac-12,Pac-12,4.0,1.0,12-18-20,10.0,
855,17,"Saturday, December 19, 2020",Utah,45,,home,#BA0C2F,#940925,Washington St.,28,...,#7d1b2a,Utah,Washington State,Pac-12,Pac-12,0.0,11.0,12-19-20,10.0,
863,17,"Saturday, December 19, 2020",Stanford,48,,away,#9D2235,#7d1b2a,UCLA,47,...,#005ba4,Stanford,UCLA,Pac-12,Pac-12,8.0,9.0,12-19-20,10.0,


<IPython.core.display.Javascript object>

In [36]:
df.merge(
    conferences_df.groupby("conference", as_index=False).count(),
    left_on="loser_conference",
    right_on="conference",
)[["loser_conference", "team"]]

Unnamed: 0,loser_conference,team
0,Atlantic Coast,15
1,Atlantic Coast,15
2,Atlantic Coast,15
3,Atlantic Coast,15
4,Atlantic Coast,15
...,...,...
730,Big 12,10
731,Big 12,10
732,Big 12,10
733,Big 12,10


<IPython.core.display.Javascript object>

In [16]:
df.to_csv("ncaa_final.csv", index=False)

<IPython.core.display.Javascript object>

## Create JSON Dataset for d3.js

In [17]:
df = pd.read_csv("ncaa_final.csv")

<IPython.core.display.Javascript object>

1      15.0
3      15.0
25     15.0
73     15.0
121    12.0
       ... 
837     NaN
851     NaN
855     NaN
863     NaN
866     NaN
Name: loser_conference_size, Length: 71, dtype: float64

<IPython.core.display.Javascript object>

In [4]:
# Break dataframe into an exhaustive list of teams
columns = [
    "team",
    "primary_color",
    "secondary_color",
    "img_name",
    "conference",
    "conference_index",
]
teams = pd.DataFrame(columns=columns)

for side in ["winner", "loser"]:

    df_side = df[
        [
            f"{side}_team_clean",
            f"{side}_primary_color",
            f"{side}_secondary_color",
            f"{side}_team",
            f"{side}_conference",
            f"{side}_conference_index",
        ]
    ]

    df_side.columns = columns

    teams = teams.append(df_side)

teams.drop_duplicates(inplace=True)

teams.head()

Unnamed: 0,team,primary_color,secondary_color,img_name,conference,conference_index
0,Florida,#003087,#00266c,Florida,Southeastern,8.0
1,Hawaii,#154734,#103829,Hawaii,Mountain West,1.0
2,Buffalo,#0057B8,#004593,Buffalo,Mid-American,4.0
3,Cincinnati,#C8102E,#a00c24,Cincinnati,American Athletic Conference,10.0
4,Bowling Green,#FE5000,#cb4000,Bowling Green,Mid-American,9.0


<IPython.core.display.Javascript object>

In [5]:
# conferences = list(nodes[nodes.conference != 'Other'].conference.unique()) + list(nodes[nodes.conference == "Other"].team.unique())

conferences = list(teams[teams.conference != "Other"].conference.unique())
nonconferences = list(teams[teams.conference == "Other"].team.unique())

nodes = []
for conference in conferences:

    members = teams[teams.conference == conference]

    nodes.append(
        {
            "conference": conference,
            "size": int(members.team.drop_duplicates().count()),
            "teams": members.to_dict(orient="records"),
        }
    )

for team in nonconferences:

    members = teams[teams.team == team]

    nodes.append(
        {
            "conference": team,
            "size": members.shape[0],
            "teams": members.to_dict(orient="records"),
        }
    )

<IPython.core.display.Javascript object>

In [6]:
# Links
links = df[
    [
        "week",
        "date",
        "date_mdy",
        "winner_team",
        "winner_team_clean",
        "winner_score",
        "winner_rank",
        "winner_side",
        "winner_primary_color",
        "winner_secondary_color",
        "winner_conference",
        "winner_conference_index",
        "winner_conference_size",
        "loser_team",
        "loser_team_clean",
        "loser_score",
        "loser_rank",
        "loser_side",
        "loser_primary_color",
        "loser_secondary_color",
        "loser_conference",
        "loser_conference_index",
        "loser_conference_size",
    ]
]
links.head()

Unnamed: 0,week,date,date_mdy,winner_team,winner_team_clean,winner_score,winner_rank,winner_side,winner_primary_color,winner_secondary_color,...,loser_team,loser_team_clean,loser_score,loser_rank,loser_side,loser_primary_color,loser_secondary_color,loser_conference,loser_conference_index,loser_conference_size
0,1,"Saturday, August 24, 2019",08-24-19,Florida,Florida,24,8.0,home,#003087,#00266c,...,Miami (FL),Miami (FL),20,,away,#154734,#103829,Atlantic Coast,1.0,15.0
1,1,"Saturday, August 24, 2019",08-24-19,Hawaii,Hawaii,45,,home,#154734,#103829,...,Arizona,Arizona,38,,away,#00205B,#001948,Pac-12,7.0,15.0
2,1,"Thursday, August 29, 2019",08-29-19,Buffalo,Buffalo,38,,home,#0057B8,#004593,...,Robert Morris,Robert Morris,10,,away,#041E42,#031834,Other,0.0,15.0
3,1,"Thursday, August 29, 2019",08-29-19,Cincinnati,Cincinnati,24,,home,#C8102E,#a00c24,...,UCLA,UCLA,14,,away,#0072CE,#005ba4,Pac-12,9.0,15.0
4,1,"Thursday, August 29, 2019",08-29-19,Bowling Green,Bowling Green,46,,home,#FE5000,#cb4000,...,Morgan St.,Morgan State,3,,away,#FC4C02,#c93c01,Other,0.0,15.0


<IPython.core.display.Javascript object>

In [7]:
# Save JSON
final = {
    "nodes": nodes,
    "links": links.to_dict(orient="records"),
}

with open("ncaa.json", "w") as file:
    simplejson.dump(final, file, ignore_nan=True)

<IPython.core.display.Javascript object>

In [8]:
final

{'nodes': [{'conference': 'Southeastern',
   'size': 14,
   'teams': [{'team': 'Florida',
     'primary_color': '#003087',
     'secondary_color': '#00266c',
     'img_name': 'Florida',
     'conference': 'Southeastern',
     'conference_index': 8.0},
    {'team': 'Texas A&M',
     'primary_color': '#500000',
     'secondary_color': '#400000',
     'img_name': 'Texas A&M',
     'conference': 'Southeastern',
     'conference_index': 5.0},
    {'team': 'Mississippi State',
     'primary_color': '#6F2C3F',
     'secondary_color': '#582332',
     'img_name': 'Mississippi St.',
     'conference': 'Southeastern',
     'conference_index': 3.0},
    {'team': 'Kentucky',
     'primary_color': '#0032A0',
     'secondary_color': '#002880',
     'img_name': 'Kentucky',
     'conference': 'Southeastern',
     'conference_index': 13.0},
    {'team': 'Alabama',
     'primary_color': '#9D2235',
     'secondary_color': '#7d1b2a',
     'img_name': 'Alabama',
     'conference': 'Southeastern',
     'conf

<IPython.core.display.Javascript object>