### Import packages

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib
import csv
import re
from bs4 import NavigableString

import json
import geopandas as gpd
import descartes
import shapely
import matplotlib.pyplot as plt
from shapely.geometry import MultiPoint

### Import user-defined functions from GitHub

In [2]:
from scrape_object import ncaa_scrape
from geog_object import ncaa_geog
from utils import extract_as_df

### Load supporting files

* Dedupe mapping: Consolidates differing school names. For example, UNC, N. Carolina, and North Carolina will all appear as North Carolina.

* ID mapping: Maps (cleaned) school names to the ID field in the government geography data.

* Lat long: Contains the ID, latitude, and longitude of each school. From the National Center for Education Statistics (NCES) Integrated Postsecondary Education Data System (IPEDS) hd2020.csv report (link to data here). NCES is part of the Department of Education.

In [3]:
# dedupe_mapping: user-defined lookup used to collapse duplicate school names
# (built in "Save dedupe_mapping.ipynb").
with open('dedupe_mapping.json') as mapping_file:
    dedupe_mapping = json.load(mapping_file)

# id_mapping: manually maintained table that matches NCAA Tournament schools
# to their IDs in the NCES data.
id_mapping = pd.read_csv('to_join_v1.csv')

# lat_long: NCES extract reduced to the ID and geographic fields. The NCES
# key column 'UNITID' is renamed to 'ID' as part of the load.
lat_long = pd.read_csv('lat_long.csv', encoding='latin-1').rename(columns={'UNITID': 'ID'})

### Iterate through years to create final output

For each year, scrape the Wikipedia page using **ncaa_scrape** object. Then, join the geographic data using **ncaa_geog** object. Finally, use the **extract_as_df** function to format the data to be saved as a CSV. Loop through all years (skipping 2020 because there was no tournament) and union all the data.

In [4]:
# Build the combined dataset: for each tournament year, scrape the bracket,
# attach geography, and append one set of rows per round to `final_df`.
#
# Processing is best-effort: a year that fails is reported and skipped so the
# remaining years still run. Failures are collected in `failed_years` together
# with the exception that caused them, instead of being silently swallowed.

# Round keys passed to extract_as_df, in the order they are appended.
ROUNDS = ('all', 's16', 'e8', 'f4')

final_df = None     # accumulated output; seeded by the first year that succeeds
failed_years = []   # (year, exception) for every year that could not be processed

for yr in range(1985, 2022):
    if yr == 2020:
        # No tournament was held in 2020.
        continue
    try:
        scrape = ncaa_scrape(yr, dedupe_mapping)
        scrape.scrape_all()
        # Count validation is currently disabled:
        #scrape.validate_counts(raise_error = True, verbose = False)
        geog = ncaa_geog(scrape, id_mapping, lat_long)
        geog.handle_data()

        # Work on a candidate frame and commit it only after every round has
        # been extracted, so a failure part-way through a year does not leave
        # that year half-appended in final_df.
        year_df = final_df
        for rnd in ROUNDS:
            if year_df is None:
                # Nothing accumulated yet: let extract_as_df start a new frame.
                year_df = extract_as_df(geog, rnd)
            else:
                year_df = extract_as_df(geog, rnd, year_df)
        final_df = year_df

    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the loop, and surface the reason along with the year.
        failed_years.append((yr, err))
        print(yr, repr(err))

In [5]:
# Spot-check the end of the combined dataset (last 25 rows).
final_df.tail(n=25)

Unnamed: 0,SCHOOL,ID,LONGITUD,LATITUDE,Year,Round,Is_Center
3481,UCLA,110662,-118.442179,34.07178,2021,Sweet 16,N
3482,Alabama,100751,-87.545978,33.211875,2021,Sweet 16,N
3483,Baylor,223232,-97.121041,31.546872,2021,Sweet 16,N
3484,Villanova,216597,-75.345457,40.039388,2021,Sweet 16,N
3485,Arkansas,106397,-94.176981,36.070009,2021,Sweet 16,N
3486,Oral Roberts,207582,-95.952547,36.049129,2021,Sweet 16,N
3487,Loyola-Chicago,146719,-87.656872,42.000765,2021,Sweet 16,N
3488,Oregon State,209542,-123.274723,44.56395,2021,Sweet 16,N
3489,Houston,225511,-95.343537,29.720393,2021,Sweet 16,N
3490,Syracuse,196413,-76.136975,43.040176,2021,Sweet 16,N


In [6]:
# Write the combined dataset to disk as CSV. With pandas' default settings the
# DataFrame index is written as the first column.
final_df.to_csv(path_or_buf='final_dataset_v02_09_22.csv')