In [1]:
# Import the requests module
import requests
# We don't need everything from bs4, just BeautifulSoup
from bs4 import BeautifulSoup
# Import pandas and assign the standard shorthand name `pd` to it
import pandas as pd
# Import the standard module for regular expressions (`re`)
import re
# Import the standard module `time`
import time

# Start the timer
start_time = time.time()

# URL of the SEC Wikipedia page
url = 'https://en.wikipedia.org/wiki/Southeastern_Conference'

# Use `requests` to 'get' the URL; the timeout keeps a stalled connection
# from hanging the run indefinitely
response = requests.get(url, timeout=30)

# Stop right away with a clear HTTP error if the request failed: nothing
# below can work without the conference page
response.raise_for_status()

# Make the "soup"
soup = BeautifulSoup(response.content, 'html.parser')

# Find the heading with id="Members"
members_heading = soup.find('h3', id='Members')
if members_heading is None:
    raise RuntimeError(f'Could not find the "Members" heading on {url}')

# Find the next table element after this heading
target_table = members_heading.find_next('table')
if target_table is None:
    raise RuntimeError(f'Could not find a table after the "Members" heading on {url}')

# Initialize a Python list to store university data
universities = []

# Skip the header row and process each remaining row in the table
rows = target_table.find_all('tr')[1:]

for row in rows:
    cells = row.find_all('th')
    if not cells:
        continue

    # The first header cell of the row holds the university name and link
    univ_cell = cells[0]
    link = univ_cell.find('a')

    # Ignore rows whose first cell has no usable link
    if link is None or not link.get('href'):
        continue

    # Use the `get_text` method to extract the text between the tags
    univ_name = link.get_text(strip=True)
    # Use the `get` method to extract the value of the `href` attribute
    relative_url = link.get('href')
    # Combine the base URL with the relative URL
    univ_url = "https://en.wikipedia.org" + relative_url
    # Make a dictionary of the university name and URL and add it to the list
    universities.append({
        'University': univ_name,
        'Wikipedia URL': univ_url,
    })

# Create a dataframe with the universities and their URLs; naming the columns
# explicitly keeps them present even if no university was found
df = pd.DataFrame(universities, columns=['University', 'Wikipedia URL'])

# Make a Python dictionary to store individual university dataframes
university_dataframes = {}
    
# Helper that tidies text taken from infobox cells
def clean_text(text):
    """Return `text` without numeric citation markers and with tidy spacing.

    Markers such as "[1]" or "[23]" are removed first. Every run of
    whitespace (spaces, tabs, line breaks) is then collapsed to a single
    space, and leading/trailing whitespace is dropped.
    """
    without_citations = re.sub(r'\[\d+\]', '', text)
    # Splitting on whitespace and re-joining collapses each run to one space
    # and trims both ends in a single step
    return ' '.join(without_citations.split())
    
# Request each university's page and extract its infobox table
for _, row in df.iterrows():
    univ_name = row['University']
    univ_url = row['Wikipedia URL']

    # Create a simplified name to use as the key for this university's dataframe
    simple_name = re.sub(r'[^a-zA-Z0-9]', '_', univ_name).lower()

    print(f"Processing {univ_name}...")

    # Send a GET request to the university's Wikipedia page; the timeout keeps
    # one stalled connection from hanging the whole run
    try:
        univ_response = requests.get(univ_url, timeout=30)
    except requests.RequestException as exc:
        print(f"  Skipping {univ_name}: request failed ({exc})")
        continue
    finally:
        # Add a short delay to play nicely with Wikipedia's servers
        # (runs after every request attempt, successful or not)
        time.sleep(1)

    if univ_response.status_code != 200:
        print(f"  Skipping {univ_name}: HTTP status {univ_response.status_code}")
        continue

    # Make the univ_soup
    univ_soup = BeautifulSoup(univ_response.content, 'html.parser')

    # Find the infobox table. A multi-word `class_` string only matches that
    # exact attribute value, so fall back to a CSS selector that accepts the
    # two classes in any order and alongside additional classes.
    infobox = univ_soup.find('table', class_='infobox vcard')
    if infobox is None:
        infobox = univ_soup.select_one('table.infobox.vcard')
    if infobox is None:
        print(f"  Skipping {univ_name}: no infobox table found")
        continue

    # Collect one {'Property', 'Value'} record per labelled infobox row
    infobox_data = []
    for tr in infobox.find_all('tr'):
        # Only rows with both a th (label) and a td (data) are kept
        th = tr.find('th')
        td = tr.find('td')
        if th is None or td is None:
            continue

        # Clean the text with the `clean_text` function
        infobox_data.append({
            'Property': clean_text(th.get_text()),
            'Value': clean_text(td.get_text()),
        })

    # Create a dataframe for this university (only when rows were found)
    if infobox_data:
        univ_df = pd.DataFrame(infobox_data)
        university_dataframes[simple_name] = univ_df

# Collect the set of property labels found in each university's infobox
university_properties = {
    name: set(frame['Property'])
    for name, frame in university_dataframes.items()
}

# Find the properties shared by every university. With no universities at all
# the result is an empty set, so the names below are always defined.
all_universities = list(university_properties)
if all_universities:
    common_properties = set.intersection(*university_properties.values())
else:
    common_properties = set()

# Sort the common properties so later column order is stable
common_properties_list = sorted(common_properties)

# Map each simplified dataframe key back to the name scraped from the members
# table. Rebuilding a display name from the key is lossy ("Texas A&M
# University" would become "Texas A M University"), so the scraped name is
# preferred. The key expression must mirror how the keys of
# `university_dataframes` were built.
display_names = {}
if 'University' in df.columns:
    display_names = {
        re.sub(r'[^a-zA-Z0-9]', '_', original_name).lower(): original_name
        for original_name in df['University']
    }

# Create a new dataframe with universities as rows and common properties as columns
consolidated_data = []

for univ_name, univ_df in university_dataframes.items():
    # First value seen for each property label in this university's infobox
    first_values = {}
    for prop, value in zip(univ_df['Property'], univ_df['Value']):
        first_values.setdefault(prop, value)

    # Fall back to a prettified key only if the scraped name is unavailable
    fallback_name = ' '.join(
        word.capitalize() for word in univ_name.replace('_', ' ').split()
    )

    # Start with the university name
    univ_data = {'University': display_names.get(univ_name, fallback_name)}

    # Add each common property value (None if the property is missing)
    for prop in common_properties_list:
        univ_data[prop] = first_values.get(prop)

    consolidated_data.append(univ_data)

# Create the consolidated dataframe; keep the 'University' column even when
# nothing was scraped so later steps still find it
if consolidated_data:
    consolidated_df = pd.DataFrame(consolidated_data)
else:
    consolidated_df = pd.DataFrame(columns=['University'])

# Write the consolidated table to an Excel workbook, leaving out the index column
output_file = 'sec_universities.xlsx'
consolidated_df.to_excel(output_file, index=False)

# Stop the timer and report how many universities were processed and how long it took
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Finished processing {len(consolidated_df)} universities in {elapsed_time:.2f} seconds")

Processing University of Alabama...
Processing University of Arkansas...
Processing Auburn University...
Processing University of Florida...
Processing University of Georgia...
Processing University of Kentucky...
Processing Louisiana State University...
Processing University of Mississippi...
Processing Mississippi State University...
Processing University of Missouri...
Processing University of Oklahoma...
Processing University of South Carolina...
Processing University of Tennessee...
Processing University of Texas at Austin...
Processing Texas A&M University...
Processing Vanderbilt University...
Finished processing 16 universities in 27.33 seconds
