In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# URL of the SEC Wikipedia page
url = 'https://en.wikipedia.org/wiki/Southeastern_Conference'

# Send a GET request to the URL
response = requests.get(url)

# Check if request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the first table (which should be the member schools table)
    table = soup.find('table', class_='wikitable')
    
    # Initialize a list to store university data
    universities = []
    
    # Skip the header row and process each row in the table body
    rows = table.find_all('tr')[1:]  # Skip the header row
    
    for row in rows:
        cells = row.find_all('th')
        if cells:
            # Look for the university name cell with a link
            univ_cell = cells[0]  # First column contains university name
            link = univ_cell.find('a')
            
            if link:
                univ_name = link.get_text(strip=True)
                # Get the relative URL and convert to absolute URL
                univ_url = "https://en.wikipedia.org" + link.get('href')
                universities.append({
                    'University': univ_name,
                    'Wikipedia URL': univ_url
                })
    
    # Create a DataFrame with the universities and their URLs
    df = pd.DataFrame(universities)
    print("SEC Universities DataFrame:")
    print(df)
    print("\n")
    
    # Dictionary to store individual university dataframes
    university_dataframes = {}
    
    # Function to clean up text in infobox tables
    def clean_text(text):
        # Remove citations [1], [2], etc.
        text = re.sub(r'\[\d+\]', '', text)
        # Remove newlines and extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    # Visit each university's page and extract infobox table
    for index, row in df.iterrows():
        univ_name = row['University']
        univ_url = row['Wikipedia URL']
        
        # Create a simplified name for the dataframe variable
        simple_name = re.sub(r'[^a-zA-Z0-9]', '_', univ_name).lower()
        
        print(f"Processing {univ_name}...")
        
        # Send a GET request to the university's Wikipedia page
        univ_response = requests.get(univ_url)
        
        # Add a small delay to avoid overwhelming Wikipedia servers
        time.sleep(1)
        
        if univ_response.status_code == 200:
            univ_soup = BeautifulSoup(univ_response.content, 'html.parser')
            
            # Find the infobox table
            infobox = univ_soup.find('table', class_='infobox vcard')
            
            if infobox:
                # Extract rows from the infobox
                infobox_data = []
                
                # Process each row in the infobox
                for tr in infobox.find_all('tr'):
                    # Check if row has th (header/label) and td (data)
                    th = tr.find('th')
                    td = tr.find('td')
                    
                    if th and td:
                        label = clean_text(th.get_text())
                        value = clean_text(td.get_text())
                        
                        infobox_data.append({
                            'Property': label,
                            'Value': value
                        })
                
                # Create a DataFrame for this university
                if infobox_data:
                    univ_df = pd.DataFrame(infobox_data)
                    university_dataframes[simple_name] = univ_df
                    print(f"Created DataFrame for {univ_name} with {len(univ_df)} entries")
                else:
                    print(f"No infobox data found for {univ_name}")
            else:
                print(f"No infobox found for {univ_name}")
        else:
            print(f"Failed to retrieve {univ_name} page. Status code: {univ_response.status_code}")
    
    # Display the first few entries of each university dataframe
    for univ_name, univ_df in university_dataframes.items():
        print(f"\n{univ_name} DataFrame:")
        print(univ_df.head())
        
else:
    print(f"Failed to retrieve the SEC page. Status code: {response.status_code}")

SEC Universities DataFrame:
                       University  \
0           University of Alabama   
1          University of Arkansas   
2               Auburn University   
3           University of Florida   
4           University of Georgia   
5          University of Kentucky   
6      Louisiana State University   
7       University of Mississippi   
8    Mississippi State University   
9          University of Missouri   
10         University of Oklahoma   
11   University of South Carolina   
12        University of Tennessee   
13  University of Texas at Austin   
14           Texas A&M University   
15          Vanderbilt University   

                                        Wikipedia URL  
0   https://en.wikipedia.org/wiki/University_of_Al...  
1   https://en.wikipedia.org/wiki/University_of_Ar...  
2     https://en.wikipedia.org/wiki/Auburn_University  
3   https://en.wikipedia.org/wiki/University_of_Fl...  
4   https://en.wikipedia.org/wiki/University_of_Ge...  
5   ht

In [10]:
# Print all available university keys in the dictionary
print("Available universities:")
for key in university_dataframes.keys():
    print(f"- {key}")

# Example: Access and display the University of Alabama's DataFrame
# Replace 'university_of_alabama' with any key from the list above
alabama_df = university_dataframes['university_of_alabama']
print("\nUniversity of Alabama DataFrame:")
print(alabama_df)

# To see the entire DataFrame (not just the first few rows)
print("\nFull DataFrame:")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(alabama_df)

# To check the DataFrame's shape (rows, columns)
print(f"\nDataFrame dimensions: {alabama_df.shape}")

# To get a statistical summary of the DataFrame
print("\nDataFrame info:")
print(alabama_df.info())

# To search for specific information in the DataFrame
print("\nSearching for 'Motto' in properties:")
motto_row = alabama_df[alabama_df['Property'].str.contains('Motto', case=False, na=False)]
print(motto_row)

Available universities:
- university_of_alabama
- university_of_arkansas
- auburn_university
- university_of_florida
- university_of_georgia
- university_of_kentucky
- louisiana_state_university
- university_of_mississippi
- mississippi_state_university
- university_of_missouri
- university_of_oklahoma
- university_of_south_carolina
- university_of_tennessee
- university_of_texas_at_austin
- texas_a_m_university
- vanderbilt_university

University of Alabama DataFrame:
                 Property                                              Value
0                    Type                         Public research university
1             Established      December 18, 1820; 204 years ago (1820-12-18)
2      Parent institution                       University of Alabama System
3           Accreditation                                               SACS
4   Academic affiliations                        ORAUURASea-grantSpace-grant
5               Endowment  $1.22 billion (2023)(UA only)$2.09 bi

In [11]:
university_of_oklahoma = university_dataframes['university_of_oklahoma']
university_of_oklahoma

Unnamed: 0,Property,Value
0,Former name,Norman Territorial University (1890–1907)
1,Motto,Latin: Civi et Reipublicae
2,Motto in English,"""For the benefit of the Citizen and the State"""
3,Type,Public research university
4,Established,"December 19, 1890; 134 years ago (December 19,..."
5,Parent institution,Oklahoma State System of Higher Education - Re...
6,Accreditation,HLC
7,Academic affiliations,ORAUURASpace-grant
8,Endowment,$1.81 billion (FY2024)
9,President,Joseph Harroz Jr.


In [12]:
# Get a list of all properties for each university
university_properties = {}
for univ_name, univ_df in university_dataframes.items():
    university_properties[univ_name] = set(univ_df['Property'].tolist())

# Find the intersection of all sets of properties
all_universities = list(university_properties.keys())
if all_universities:
    common_properties = university_properties[all_universities[0]].copy()
    
    for univ_name in all_universities[1:]:
        common_properties = common_properties.intersection(university_properties[univ_name])
    
    # Convert to a sorted list and display
    common_properties_list = sorted(list(common_properties))
    
    print(f"Properties common to all {len(all_universities)} universities:")
    for prop in common_properties_list:
        print(f"- {prop}")
    print(f"\nTotal common properties: {len(common_properties_list)}")
else:
    print("No university data found.")

# To see which properties are most common (but not necessarily in all universities)
all_properties = {}
for univ_name, props in university_properties.items():
    for prop in props:
        if prop in all_properties:
            all_properties[prop] += 1
        else:
            all_properties[prop] = 1

# Sort properties by frequency
most_common_props = sorted(all_properties.items(), key=lambda x: x[1], reverse=True)

print("\nMost common properties (with count):")
for prop, count in most_common_props[:15]:  # Show top 15
    percentage = (count / len(all_universities)) * 100
    print(f"- {prop}: {count}/{len(all_universities)} universities ({percentage:.1f}%)")

Properties common to all 16 universities:
- Campus
- Endowment
- Established
- Location
- Nickname
- Sporting affiliations
- Students
- Type
- Website

Total common properties: 9

Most common properties (with count):
- Campus: 16/16 universities (100.0%)
- Website: 16/16 universities (100.0%)
- Sporting affiliations: 16/16 universities (100.0%)
- Type: 16/16 universities (100.0%)
- Established: 16/16 universities (100.0%)
- Nickname: 16/16 universities (100.0%)
- Students: 16/16 universities (100.0%)
- Endowment: 16/16 universities (100.0%)
- Location: 16/16 universities (100.0%)
- Mascot: 15/16 universities (93.8%)
- Colors: 15/16 universities (93.8%)
- Accreditation: 15/16 universities (93.8%)
- Newspaper: 14/16 universities (87.5%)
- Academic staff: 14/16 universities (87.5%)
- Provost: 14/16 universities (87.5%)


In [13]:
# First, identify the common properties again (or use the ones previously found)
university_properties = {}
for univ_name, univ_df in university_dataframes.items():
    university_properties[univ_name] = set(univ_df['Property'].tolist())

all_universities = list(university_properties.keys())
if all_universities:
    common_properties = university_properties[all_universities[0]].copy()
    
    for univ_name in all_universities[1:]:
        common_properties = common_properties.intersection(university_properties[univ_name])
    
    common_properties_list = sorted(list(common_properties))
else:
    common_properties_list = []
    print("No university data found.")

# Create a new DataFrame with universities as rows and common properties as columns
consolidated_data = []

for univ_name, univ_df in university_dataframes.items():
    # Start with the university name
    univ_data = {'University': univ_name}
    
    # Add each common property value
    for prop in common_properties_list:
        property_row = univ_df[univ_df['Property'] == prop]
        if not property_row.empty:
            univ_data[prop] = property_row['Value'].iloc[0]
        else:
            univ_data[prop] = None
    
    consolidated_data.append(univ_data)

# Create the consolidated DataFrame
consolidated_df = pd.DataFrame(consolidated_data)

# Clean up university names for display (remove underscores, capitalize properly)
consolidated_df['University'] = consolidated_df['University'].apply(
    lambda x: ' '.join(word.capitalize() for word in x.replace('_', ' ').split())
)

# Display the consolidated DataFrame
print("Consolidated University DataFrame:")
print(consolidated_df)

# Save the DataFrame to a CSV file (optional)
# consolidated_df.to_csv('sec_universities.csv', index=False)

# Basic statistics and information about the DataFrame
print(f"\nDataFrame shape: {consolidated_df.shape}")
print(f"Number of universities: {len(consolidated_df)}")
print(f"Number of common properties: {len(common_properties_list)}")

# Display a transposed view for easier reading if there are many columns
if len(common_properties_list) > 10:
    print("\nTransposed view of first few universities (for easier reading):")
    print(consolidated_df.set_index('University').head(3).T)

Consolidated University DataFrame:
                       University  \
0           University Of Alabama   
1          University Of Arkansas   
2               Auburn University   
3           University Of Florida   
4           University Of Georgia   
5          University Of Kentucky   
6      Louisiana State University   
7       University Of Mississippi   
8    Mississippi State University   
9          University Of Missouri   
10         University Of Oklahoma   
11   University Of South Carolina   
12        University Of Tennessee   
13  University Of Texas At Austin   
14           Texas A M University   
15          Vanderbilt University   

                                               Campus  \
0                   Small city, 1,970 acres (8.0 km2)   
1                    Small city, 412 acres (1.67 km2)   
2                  Small City, 1,841 acres (7.45 km2)   
3                  Midsize city, 2,000 acres (810 ha)   
4   Midsize city / College town, 762 acres (3.08 k

In [14]:
consolidated_df

Unnamed: 0,University,Campus,Endowment,Established,Location,Nickname,Sporting affiliations,Students,Type,Website
0,University Of Alabama,"Small city, 1,970 acres (8.0 km2)",$1.22 billion (2023)(UA only)$2.09 billion (20...,"December 18, 1820; 204 years ago (1820-12-18)","Tuscaloosa, Alabama, United States33°12′39″N 8...",Crimson Tide,NCAA Division I FBS – SEC,"39,622 (fall 2023)",Public research university,www.ua.edu
1,University Of Arkansas,"Small city, 412 acres (1.67 km2)",$1.7 billion (FY 2021),"March 27, 1871; 153 years ago (March 27, 1871)","Fayetteville, Arkansas, United States36°04′07″...",Razorbacks,NCAA Division I FBS – SEC,"32,140 (fall 2023)",Public land-grant research university,uark.edu
2,Auburn University,"Small City, 1,841 acres (7.45 km2)",$1.25 billion (2024),"February 7, 1856; 169 years ago (February 7, 1...","Auburn, Alabama, United States",Tigers,NCAA Division I FBS – SEC,34195,Public land-grant research university,auburn.edu
3,University Of Florida,"Midsize city, 2,000 acres (810 ha)",$2.337 billion (2023),"January 6, 1853;172 years ago (1853-01-06)[not...","Gainesville, Florida, United States29°38′51″N ...",Gators,NCAA Division I FBS – SECBig 12,"54,814 (fall 2023)",Public land-grant research university,ufl.edu
4,University Of Georgia,"Midsize city / College town, 762 acres (3.08 k...",$1.82 billion (2023),"January 27, 1785; 240 years ago (1785-01-27)","Athens, Georgia, United States33°57′21″N 83°22...",Bulldogs,NCAA Division I FBS – SEC,"40,607 (fall 2022)",Public flagship land-grant research university,uga.edu
5,University Of Kentucky,"Large City, 784 acres (3.17 km2)",$2.13 billion (2023),"February 22, 1865; 160 years ago (February 22,...","Lexington, Kentucky, United States38°01′57″N 8...",Wildcats,NCAA Division I FBS – SECC-USAGARC,"35,952 (fall 2024)",Public land-grant research university,uky.edu
6,Louisiana State University,"Midsize city, 4,925 acres (1,993 ha)",$664.20 million (2023)(LSU only)$1.06 billion ...,"January 2, 1860; 165 years ago (January 2, 1860)","Baton Rouge, Louisiana, United States30°24′52″...",Tigers and Lady Tigers,NCAA Division I FBS – SECCCSA,"37,354 (fall 2022)",Public land-grant research university,lsu.edu
7,University Of Mississippi,"Remote town, 3,497 acres (14.15 km2)",$962 million (2024),"February 24, 1844; 181 years ago (February 24,...","University, Mississippi, 38677",Rebels,NCAA Division I FBS – SECPRC,"24,710 (for 2023-2024 year)",Public research university,olemiss.edu
8,Mississippi State University,"Remote town, 4,200 acres (17 km2)",$894.5 million (2024),"February 28, 1878; 147 years ago (February 28,...","Mississippi State, Mississippi, United States3...",Bulldogs,NCAA Division I FBS – SEC,"23,150 (fall 2024)",Public land-grant research university,msstate.edu
9,University Of Missouri,"Midsize city, 1,262 acres (511 ha)Total, 19,26...",$1.42 billion (2023)(MU only)$2.24 billion (20...,"February 11, 1839; 186 years ago (1839-02-11)","Columbia, Missouri, United States38°56′43″N 92...",Tigers,NCAA Division I FBS – SECBig 12,"31,543 (fall 2024)",Public land-grant research university,missouri.edu
