# Phase III: First ML Proof of Concept
## College Admissions Exploration 

- Team
- John Rotondo, Spring Yan, Anne Hu, Evan Li

Each **project group** will submit a single **jupyter notebook** which contains:

1. (3%) The implementation (using NumPy) of your first ML model as a function call to the cleaned data
2. (2%) A discussion of the preliminary results:
   - This may include checking of assumptions, generated plots/tables, measures of fit, or other attributes of the analysis
   - It does not have to be fully correct, but as a proof of concept must demonstrate that the group is close to completing the analysis

In [4]:
from secret import key
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict

import time

## Fetching & Cleaning Data 

In [27]:
def get_pages_data(key, student_size):
    """ Gets the api response and applies filter on student size, making sure to go through all the pages of data 
    Params:
    - key = api key 
    - student_size = minimum threshold (int)
    Returns:
    A list with the api response stored """
    # Base URL for the College Scorecard API
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    
    # Parameters for the API call
    params = {
        'api_key': key,
        'per_page': 100,  # Number of records per page (max is usually 100)
        'page': 0,  # Start at the first page
        f'student.size__range': f'{student_size}..',  # Filter for schools with more than 50,000 students
    }
    
    # List to store all school data
    all_schools = []
    
    # Loop through pages until there are no more results
    while True:
        # Increment the page number
        params['page'] += 1
        
        # Make the API request
        response = requests.get(base_url, params=params)
        
        # Check the response status
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
    
        # Get the JSON data
        data = response.json()
        
        # Check if there are results
        if 'results' in data and len(data['results']) > 0:
            # Append the results to the all_schools list
            all_schools.extend(data['results'])
            print(f"Retrieved {len(data['results'])} records from page {params['page']}")
        else:
            # If there are no more results, break the loop
            print(f"No more records found on page {params['page']}. Stopping.")
            break

        # ***** just added this!
        # Optional: delay to avoid API rate limits
        time.sleep(1)
    
    # Print the total number of schools retrieved
    print(f"Total schools retrieved: {len(all_schools)}")
    return all_schools

In [28]:
# load in the data 
data_dct = get_pages_data(key, 10000)

Retrieved 100 records from page 1
Retrieved 100 records from page 2
Retrieved 17 records from page 3
No more records found on page 4. Stopping.
Total schools retrieved: 217


In [7]:
def build_df(data_dct):
    """ Given the json response, gather the data in the colleges_dct and conduct cleaning/filling in missing values for 
        test scores
    """

    # initialize dictionary to store college data
    colleges_dct = defaultdict(list)

    # # gets the average SAT and ACT scores across all schools in the data set for missing values
    # overall_avg_sat = get_all_sat_scores(data_dct)
    # overall_avg_act = get_all_act_scores(data_dct)

    # loop over each "school" in the data_dct
    for i in range(len(data_dct)):
        school_data = data_dct[i]['latest']['school']
        student_data = data_dct[i]['latest']['student']
        admin_data = data_dct[i]['latest']['admissions']

        # append school data
        colleges_dct['name'].append(school_data.get('name'))
        colleges_dct['city'].append(school_data.get('city'))
        colleges_dct['state'].append(school_data.get('state'))
        colleges_dct['ownership'].append(school_data.get('ownership'))

        # append student data
        colleges_dct['size'].append(student_data.get('size'))

        # append admissions data
        colleges_dct['admin_rate'].append(admin_data.get('admission_rate', {}).get('overall'))

        # append test data 
        colleges_dct['avg_sat'].append(admin_data['sat_scores']['average']['overall'])
        colleges_dct['midpoint_act'].append(admin_data['act_scores']['midpoint']['cumulative'])

        # appends test requirement status based on integers given (0-5)
        test_requirement = admin_data.get('test_requirements')
        if test_requirement == 0:
            colleges_dct['test_requirement'].append('Not Required')
        elif test_requirement == 1:
            colleges_dct['test_requirement'].append('Required')
        elif test_requirement == 2:
            colleges_dct['test_requirement'].append('Recommended')
        elif test_requirement == 3:
            colleges_dct['test_requirement'].append('Niether Rec. or Req.')
        elif test_requirement == 4:
            colleges_dct['test_requirement'].append('Not Known')
        else:
            colleges_dct['test_requirement'].append('Considered but not Req.')

        # if a school does not require test scores appends average of data set ACT test scores
        midpoint_act = admin_data.get('act_scores', {}).get('midpoint', {}).get('cumulative')
        # if midpoint_act is None:
        #     colleges_dct['midpoint_ACT'].append(overall_avg_act)
        # else:
        #     colleges_dct['midpoint_ACT'].append(round(midpoint_act))

        # # if a school does not require test scores appends average of data set SAT test scores
        # average_sat = admin_data.get('sat_scores', {}).get('average', {}).get('overall')
        # if average_sat is None:
        #     colleges_dct['avg_SAT'].append(overall_avg_sat)
        # else:
        #     colleges_dct['avg_SAT'].append(round(average_sat))

    return colleges_dct

In [8]:
df = pd.DataFrame(build_df(data_dct))
df.head()

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement
0,Indian River State College,Fort Pierce,FL,1,11481,,,,Not Required
1,Keiser University-Ft Lauderdale,Fort Lauderdale,FL,2,16649,0.9653,,,Niether Rec. or Req.
2,Miami Dade College,Miami,FL,1,40538,,,,Not Required
3,University of Miami,Coral Gables,FL,2,12215,0.1894,1409.0,32.0,Considered but not Req.
4,University of North Florida,Jacksonville,FL,1,13866,0.705,1104.0,23.0,Required


In [9]:
# --- analyze NaN value count 
nan_count = df.isna().sum()
nan_count

name                  0
city                  0
state                 0
ownership             0
size                  0
admin_rate           67
avg_sat              94
midpoint_act        107
test_requirement      0
dtype: int64

In [10]:
# --- if drop the nan values
df.shape

(231, 9)

In [11]:
# --- find unqiue test requirements
set(df['test_requirement'])

# -- now check if each category has a correpsonding test 
df[['admin_rate', 'avg_sat', 'midpoint_act', 'test_requirement']].groupby(by='test_requirement').mean()

Unnamed: 0_level_0,admin_rate,avg_sat,midpoint_act
test_requirement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Considered but not Req.,0.726865,1222.330579,26.380952
Niether Rec. or Req.,0.751909,,22.666667
Not Required,,,
Required,0.674512,1202.3125,25.25


In [12]:
df[df['test_requirement'] == "Not Required"]

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement
0,Indian River State College,Fort Pierce,FL,1,11481,,,,Not Required
2,Miami Dade College,Miami,FL,1,40538,,,,Not Required
5,Palm Beach State College,Lake Worth,FL,1,20378,,,,Not Required
6,St Petersburg College,St. Petersburg,FL,1,18984,,,,Not Required
7,Santa Fe College,Gainesville,FL,1,11038,,,,Not Required
...,...,...,...,...,...,...,...,...,...
221,American Public University System,Charles Town,WV,3,37569,,,,Not Required
222,Columbia Southern University,Orange Beach,AL,3,11208,,,,Not Required
227,University of Phoenix-Arizona,Phoenix,AZ,3,66792,,,,Not Required
229,University of the People,Pasadena,CA,2,16253,,,,Not Required


In [13]:
len(set(df['state']))

df[['admin_rate', 'avg_sat', 'midpoint_act', 'state']].groupby(by='state').mean()

Unnamed: 0_level_0,admin_rate,avg_sat,midpoint_act
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,,,
AZ,0.6448,,
CA,0.9519,,
FL,0.60858,1215.5,26.0
GA,0.697033,1168.0,24.714286
HI,0.7277,1113.0,24.0
IA,0.8811,1209.5,25.5
ID,0.89745,1107.0,23.0
IL,0.7046,1232.833333,28.25
IN,0.717375,1256.0,27.0


In [14]:
ca_df = df[df['state'] == 'CA']
# print(list(ca_df['name']))

# ---- confirmed the size is over 10000, but doesn't show up when put on the 10000 filter? 
ca_df[ca_df['name'] == 'University of California-San Diego']

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement


In [15]:
ma_df = df[df['state'] == 'MA']
ma_df

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement
51,Boston University,Boston,MA,2,17668,0.1437,1454.0,33.0,Considered but not Req.
52,University of Massachusetts-Lowell,Lowell,MA,1,11985,0.8595,1255.0,29.0,Considered but not Req.
53,University of Massachusetts-Amherst,Amherst,MA,1,24111,0.6352,1376.0,31.0,Considered but not Req.
54,University of Massachusetts-Boston,Boston,MA,1,11749,0.8076,1198.0,27.0,Considered but not Req.
55,Northeastern University,Boston,MA,2,16172,0.068,1505.0,34.0,Considered but not Req.


In [16]:
df[df['name'] == 'Northeastern University']

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement
55,Northeastern University,Boston,MA,2,16172,0.068,1505.0,34.0,Considered but not Req.


In [17]:
df[df['name'] == 'Harvard University']

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement


In [18]:
# ---- where is the university of Alabama? 
al_df = df[df['state'] == 'AL']
al_df
# al_df[al_df['name'] == '']

Unnamed: 0,name,city,state,ownership,size,admin_rate,avg_sat,midpoint_act,test_requirement
222,Columbia Southern University,Orange Beach,AL,3,11208,,,,Not Required


In [19]:
drop_df = df.dropna()
# len(set(df['state']))
len(set(drop_df['state']))

36

In [29]:
# --- potential to use ownership as a more granular avg. 

# ----- dropped private, for-profit universities (much smaller amount & have a lot less test data)
df[['admin_rate', 'avg_sat', 'midpoint_act', 'ownership']].groupby(by='ownership').mean()

Unnamed: 0_level_0,admin_rate,avg_sat,midpoint_act
ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.747561,1201.352941,25.550459
2,0.568595,1358.411765,30.466667
3,0.722067,1085.0,


# ML Model 

In [20]:
# ---- fetch data & clean data here!

# Dicussion of Results