# Phase III: First ML Proof of Concept
## College Admissions Exploration 

- Team
- John Rotondo, Spring Yan, Anne Hu, Evan Li

Each **project group** will submit a single **jupyter notebook** which contains:

1. (3%) The implementation (using NumPy) of your first ML model as a function call to the cleaned data
2. (2%) A discussion of the preliminary results:
   - This may include checking of assumptions, generated plots/tables, measures of fit, or other attributes of the analysis
   - It does not have to be fully correct, but as a proof of concept must demonstrate that the group is close to completing the analysis

In [37]:
from secret import key
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict

import time

## Fetching & Cleaning Data 

In [38]:
def get_pages_data(key, student_size):
    """Gets the api response and applies filter on student size, making sure to go through all the pages of data 
    Params:
    - key = api key 
    - student_size = minimum threshold (int)
    Returns:
    A list with the api response stored """
    # Base URL for the College Scorecard API
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    
    # Parameters for the API call
    params = {
        'api_key': key,
        'per_page': 100,  # Number of records per page (max is usually 100)
        'page': 0,  # Start at the first page
        f'student.size__range': f'{student_size}..'  # Filter for schools with more than 50,000 students
    }
    
    # List to store all school data
    all_schools = []
    
    # Loop through pages until there are no more results
    while True:
        # Increment the page number
        params['page'] += 1
        
        # Make the API request
        response = requests.get(base_url, params=params)
        
        # Check the response status
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
    
        # Get the JSON data
        data = response.json()
        
        # Check if there are results
        if 'results' in data and len(data['results']) > 0:
            # Append the results to the all_schools list
            all_schools.extend(data['results'])
            print(f"Retrieved {len(data['results'])} records from page {params['page']}")
        else:
            # If there are no more results, break the loop
            print(f"No more records found on page {params['page']}. Stopping.")
            break

        # ***** just added this!
        # Optional: delay to avoid API rate limits
        time.sleep(1)
    
    # Print the total number of schools retrieved
    print(f"Total schools retrieved: {len(all_schools)}")
    return all_schools

In [40]:
# load in the data 
data_dct = get_pages_data(key, 20000)

Retrieved 26 records from page 1
No more records found on page 2. Stopping.
Total schools retrieved: 26


In [41]:
def build_df(data_dct):
    """ Given the json response, gather the data in the colleges_dct and conduct cleaning/filling in missing values for 
        test scores
    """

    # initialize dictionary to store college data
    colleges_dct = defaultdict(list)

    # # gets the average SAT and ACT scores across all schools in the data set for missing values
    # overall_avg_sat = get_all_sat_scores(data_dct)
    # overall_avg_act = get_all_act_scores(data_dct)

    # loop over each "school" in the data_dct
    for i in range(len(data_dct)):
        school_data = data_dct[i]['latest']['school']
        student_data = data_dct[i]['latest']['student']
        admin_data = data_dct[i]['latest']['admissions']

        # append school data
        colleges_dct['name'].append(school_data.get('name'))
        colleges_dct['city'].append(school_data.get('city'))
        colleges_dct['state'].append(school_data.get('state'))

        # append student data
        colleges_dct['size'].append(student_data.get('size'))

        # append admissions data
        colleges_dct['admin_rate'].append(admin_data.get('admission_rate', {}).get('overall'))

        # append test data 
        colleges_dct['avg_sat'].append(admin_data['sat_scores']['average']['overall'])
        colleges_dct['midpoint_act'].append(admin_data['act_scores']['midpoint']['cumulative'])

        # appends test requirement status based on integers given (0-5)
        test_requirement = admin_data.get('test_requirements')
        if test_requirement == 0:
            colleges_dct['test_requirement'].append('Not Required')
        elif test_requirement == 1:
            colleges_dct['test_requirement'].append('Required')
        elif test_requirement == 2:
            colleges_dct['test_requirement'].append('Recommended')
        elif test_requirement == 3:
            colleges_dct['test_requirement'].append('Niether Rec. or Req.')
        elif test_requirement == 4:
            colleges_dct['test_requirement'].append('Not Known')
        else:
            colleges_dct['test_requirement'].append('Considered but not Req.')

        # if a school does not require test scores appends average of data set ACT test scores
        midpoint_act = admin_data.get('act_scores', {}).get('midpoint', {}).get('cumulative')
        # if midpoint_act is None:
        #     colleges_dct['midpoint_ACT'].append(overall_avg_act)
        # else:
        #     colleges_dct['midpoint_ACT'].append(round(midpoint_act))

        # # if a school does not require test scores appends average of data set SAT test scores
        # average_sat = admin_data.get('sat_scores', {}).get('average', {}).get('overall')
        # if average_sat is None:
        #     colleges_dct['avg_SAT'].append(overall_avg_sat)
        # else:
        #     colleges_dct['avg_SAT'].append(round(average_sat))

    return colleges_dct

In [42]:
df = pd.DataFrame(build_df(data_dct))
df.head()

Unnamed: 0,name,city,state,size,admin_rate,avg_sat,midpoint_act,test_requirement
0,The University of Texas at Dallas,Richardson,TX,21586,0.8474,1304.0,,Considered but not Req.
1,The University of Texas at El Paso,El Paso,TX,20123,0.9992,982.0,,Considered but not Req.
2,The University of Texas at San Antonio,San Antonio,TX,29112,0.869,1111.0,22.0,Considered but not Req.
3,Texas Tech University,Lubbock,TX,32346,0.6734,1198.0,26.0,Considered but not Req.
4,Brigham Young University,Provo,UT,31411,0.6667,1376.0,29.0,Considered but not Req.


In [43]:
# --- analyze NaN value count 
nan_count = df.isna().sum()
nan_count

name                 0
city                 0
state                0
size                 0
admin_rate           8
avg_sat             12
midpoint_act        15
test_requirement     0
dtype: int64

In [44]:
# --- if drop the nan values
df.shape

(26, 8)

In [45]:
# --- find unqiue test requirements
set(df['test_requirement'])

# -- now check if each category has a correpsonding test 
df[['admin_rate', 'avg_sat', 'midpoint_act', 'test_requirement']].groupby(by='test_requirement').mean()

Unnamed: 0_level_0,admin_rate,avg_sat,midpoint_act
test_requirement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Considered but not Req.,0.765408,1217.916667,26.555556
Niether Rec. or Req.,0.735975,,
Not Required,,,
Required,0.60275,1243.5,26.5


In [46]:
df[df['test_requirement'] == "Not Required"]

Unnamed: 0,name,city,state,size,admin_rate,avg_sat,midpoint_act,test_requirement
6,Utah Valley University,Orem,UT,26961,,,,Not Required
10,Northern Virginia Community College,Annandale,VA,32211,,,,Not Required
16,NUC University,Bayamon,PR,25359,,,,Not Required
18,Collin County Community College District,McKinney,TX,25111,,,,Not Required
19,Western Governors University,Salt Lake City,UT,112807,,,,Not Required
20,American Public University System,Charles Town,WV,37569,,,,Not Required
23,University of Phoenix-Arizona,Phoenix,AZ,66792,,,,Not Required
25,Purdue University Global,West Lafayette,IN,33339,,,,Not Required


In [47]:
len(set(df['state']))

df[['admin_rate', 'avg_sat', 'midpoint_act', 'state']].groupby(by='state').mean()

Unnamed: 0_level_0,admin_rate,avg_sat,midpoint_act
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AZ,0.6448,,
GA,0.6785,1133.0,23.0
IL,0.4302,1085.0,
IN,0.527,1354.0,30.0
PR,,,
TX,0.84725,1148.75,24.0
UT,0.831767,1266.333333,26.666667
VA,0.843175,1254.0,27.333333
WA,0.6531,,
WI,0.4906,1374.0,29.0


In [31]:
ca_df = df[df['state'] == 'CA']
# print(list(ca_df['name']))

# ---- confirmed the size is over 10000, but doesn't show up when put on the 10000 filter? 
ca_df[ca_df['name'] == 'University of California-San Diego']

Unnamed: 0,name,city,state,size,admin_rate,avg_sat,midpoint_act,test_requirement
22,University of California-San Diego,La Jolla,CA,33092,0.2371,,,Niether Rec. or Req.


In [32]:
# ---- where is the university of Alabama? 
al_df = df[df['state'] == 'AL']
al_df
# al_df[al_df['name'] == '']

Unnamed: 0,name,city,state,size,admin_rate,avg_sat,midpoint_act,test_requirement
1887,Southern Union State Community College,Wadley,AL,3767,,,,Not Required
1988,Strayer University-Alabama,Birmingham,AL,1237,,,,Not Required
1989,Columbia Southern University,Orange Beach,AL,11208,,,,Not Required


In [33]:
drop_df = df.dropna()
# len(set(df['state']))
len(set(drop_df['state']))

47

# ML Model 

In [None]:
# ---- fetch data & clean data here!

# Dicussion of Results