In [1]:
# modules for webscraping (and cleaning)
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
from time import sleep
from datetime import date, timedelta # for ease of date formatting

In [2]:
# load api_key
with open('gov_apikey.txt', 'r') as file:
    apikey = file.read()


In [3]:
# base url
url = "http://api.data.gov/ed/collegescorecard/v1/schools?" # goverment data on colleges
base_url = f"{url}api_key={apikey}&fields="
base_url

'http://api.data.gov/ed/collegescorecard/v1/schools?api_key=er1qasEqx8IpUUq8RENegOgoMb0lte7Jf2hr5a86&fields='

In [4]:
# initialize payload and save apikey in header
payload = {}
headers= {
  "apikey": apikey
}

In [5]:
# Dictionary all the desired fields
year = "latest"
fields = {
          # School Category
          "School Name": "school.name",
          "School ID": "id",
          "School State": "school.state",
          "School Ownership": "school.ownership",
          "Full-time Faculty Rate (%)": "school.ft_faculty_rate",
          "Faculty's average salary per month": "school.faculty_salary",
          # Student Category
          "Student Enrollment Size": year + ".student.size",
          "Student Enrollment All": year + ".student.enrollment.all",
          "Male Students (%)": year + ".student.demographics.men",
          "Female Students (%)": year + ".student.demographics.women",
          "Retention Rate 4Yr (%)": year + ".student.retention_rate.four_year.full_time",
          #Cost Category
          "Attendance Cost per Academic Year": year + ".cost.attendance.academic_year",
          # Completion Category
          "150% Completion Rate at 4Yr (%)": year + ".completion.completion_rate_4yr_150nt",
          # Admissions Category 
          "Admission Rate (%)": year + ".admissions.admission_rate.overall",
          "SAT Average Overall": year + ".admissions.sat_scores.average.overall",
          "SAT 75th Percentile Critical Math": year + ".admissions.sat_scores.75th_percentile.math",
          "SAT 75th Percentile Critical Reading": year + ".admissions.sat_scores.75th_percentile.critical_reading",
          "SAT 75th Percentile Critical Writing": year + ".admissions.sat_scores.75th_percentile.writing",
        ## Earnings Category
          # 6 Years after Enrollment:
          "Mean Earnings (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.working_not_enrolled.mean_earnings",
          "Mean Male Earnings (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.mean_earnings.male_students",
          "Mean Female Earnings (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.mean_earnings.female_students",
          "Std. Deviation Earning (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.working_not_enrolled.std_dev",
          "Percent of Students Earning >$25K (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.percent_greater_than_25000",
          "Low Income Students (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.working_not_enrolled.income.lowest_tercile",
          "Medium Income Students (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.working_not_enrolled.income.middle_tercile",
          "High Income Students (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.working_not_enrolled.income.highest_tercile",
          "Mean Earnings Low (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.mean_earnings.lowest_tercile",
          "Mean Earnings Medium (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.mean_earnings.middle_tercile",
          "Mean Earnings High (6 Yrs after Entry)": year + ".earnings.6_yrs_after_entry.mean_earnings.highest_tercile",
          # 10 Years after Enrollment:
          "Mean Earnings (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.working_not_enrolled.mean_earnings",
          "Mean Male Earnings (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.mean_earnings.male_students",
          "Mean Female Earnings (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.mean_earnings.female_students",
          "Std. Deviation Earning (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.working_not_enrolled.std_dev",
          "Percent of Students Earning >$25K (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.percent_greater_than_25000",
          "Low Income Students (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.working_not_enrolled.income.lowest_tercile",
          "Medium Income Students (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.working_not_enrolled.income.middle_tercile",
          "High Income Students (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.working_not_enrolled.income.highest_tercile",
          "Mean Earnings Low (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.mean_earnings.lowest_tercile",
          "Mean Earnings Medium (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.mean_earnings.middle_tercile",
          "Mean Earnings High (10 Yrs after Entry)": year + ".earnings.10_yrs_after_entry.mean_earnings.highest_tercile"
         }

In [8]:
# Appending all the fields values to construct the fields_url
fields_url = ""
for key, val in fields.items():
    fields_url = fields_url + val + ","

# To remove the extra "," at the end of fields_url
fields_url = fields_url[:-1]
fields_url

'school.name,id,school.state,school.ownership,school.ft_faculty_rate,school.faculty_salary,latest.student.size,latest.student.enrollment.all,latest.student.demographics.men,latest.student.demographics.women,latest.student.retention_rate.four_year.full_time,latest.cost.attendance.academic_year,latest.completion.completion_rate_4yr_150nt,latest.admissions.admission_rate.overall,latest.admissions.sat_scores.average.overall,latest.admissions.sat_scores.75th_percentile.math,latest.admissions.sat_scores.75th_percentile.critical_reading,latest.admissions.sat_scores.75th_percentile.writing,latest.earnings.6_yrs_after_entry.working_not_enrolled.mean_earnings,latest.earnings.6_yrs_after_entry.mean_earnings.male_students,latest.earnings.6_yrs_after_entry.mean_earnings.female_students,latest.earnings.6_yrs_after_entry.working_not_enrolled.std_dev,latest.earnings.6_yrs_after_entry.percent_greater_than_25000,latest.earnings.6_yrs_after_entry.working_not_enrolled.income.lowest_tercile,latest.earnings

In [9]:
params = {
    "api_key":apikey,
    "school.degrees_awarded.predominant":"3",
    "school.operating":"1"
    
}

In [10]:
r = requests.get(url, params=params, headers=headers, data = payload)
print(r.status_code)
print(r.url)
r.ok

200
https://api.data.gov/ed/collegescorecard/v1/schools?api_key=er1qasEqx8IpUUq8RENegOgoMb0lte7Jf2hr5a86&school.degrees_awarded.predominant=3&school.operating=1


True

In [11]:
r.json().keys()

dict_keys(['metadata', 'results'])

In [12]:
max_page_num = r.json()['metadata']['total']//100 + 1

In [13]:
query_url = f'{r.url}&fields={fields_url}&page=0'
print(requests.get(query_url).ok)
response = requests.get(query_url).json

True


In [15]:
# Construct df

college_df = []
per_page = 100

for page_num in range(0,max_page_num):
    query_url = f'{r.url}&fields={fields_url}&page={page_num}&_per_page={per_page}'
    response = requests.get(query_url).json()
    
    for x in range(len(response["results"])):
        result_row = {}
        
        for key, val in fields.items(): 
            try:
                result_row[key] = response["results"][x][val]
            except KeyError:
                print(f"{key} key not found")
                
        college_df.append(result_row)

        
college_df = pd.DataFrame(college_df)
# college_df

In [16]:
# CLEANING
# Update School Ownership to Named Meaning:
# (1: "Public", 2: "Private NonProfit", 3: "Private ForProfit")
college_df.loc[college_df["School Ownership"] == 1, "School Ownership"] = "Public"
college_df.loc[college_df["School Ownership"] == 2, "School Ownership"] = "Private NonProfit"
college_df.loc[college_df["School Ownership"] == 3, "School Ownership"] = "Private ForProfit"

In [17]:
# Change columns with Percent to Percent Form (*100)
print(college_df.columns)
# 4,8,9,10,12,13,21 are percents

# change these columns to percents
college_df.iloc[:,[4,8,9,10,12,13,22]] = college_df.iloc[:,[4,8,9,10,12,13,22]]*100
college_df.iloc[:,[4,8,9,10,12,13,22]]

Index(['School Name', 'School ID', 'School State', 'School Ownership',
       'Full-time Faculty Rate (%)', 'Faculty's average salary per month',
       'Student Enrollment Size', 'Student Enrollment All',
       'Male Students (%)', 'Female Students (%)', 'Retention Rate 4Yr (%)',
       'Attendance Cost per Academic Year', '150% Completion Rate at 4Yr (%)',
       'Admission Rate (%)', 'SAT Average Overall',
       'SAT 75th Percentile Critical Math',
       'SAT 75th Percentile Critical Reading',
       'SAT 75th Percentile Critical Writing',
       'Mean Earnings (6 Yrs after Entry)',
       'Mean Male Earnings (6 Yrs after Entry)',
       'Mean Female Earnings (6 Yrs after Entry)',
       'Std. Deviation Earning (6 Yrs after Entry)',
       'Percent of Students Earning >$25K (6 Yrs after Entry)',
       'Low Income Students (6 Yrs after Entry)',
       'Medium Income Students (6 Yrs after Entry)',
       'High Income Students (6 Yrs after Entry)',
       'Mean Earnings Low (6 Yrs 

Unnamed: 0,Full-time Faculty Rate (%),Male Students (%),Female Students (%),Retention Rate 4Yr (%),150% Completion Rate at 4Yr (%),Admission Rate (%),Percent of Students Earning >$25K (6 Yrs after Entry)
0,99.60,39.78,60.22,54.03,28.66,89.65,45.3
1,76.19,38.16,61.84,86.40,61.17,80.60,66.9
2,67.02,58.91,41.09,81.80,57.14,77.11,68.5
3,67.97,36.05,63.95,62.02,31.77,98.88,39.3
4,77.07,44.17,55.83,87.23,72.14,80.39,69.5
...,...,...,...,...,...,...,...
1984,,10.59,89.41,,,90.91,
1985,82.80,53.67,46.33,86.91,72.81,78.27,
1986,,46.15,53.85,100.00,,100.00,
1987,,0.00,100.00,,,,


In [18]:
# my dataset!
college_df

Unnamed: 0,School Name,School ID,School State,School Ownership,Full-time Faculty Rate (%),Faculty's average salary per month,Student Enrollment Size,Student Enrollment All,Male Students (%),Female Students (%),...,Mean Male Earnings (10 Yrs after Entry),Mean Female Earnings (10 Yrs after Entry),Std. Deviation Earning (10 Yrs after Entry),Percent of Students Earning >$25K (10 Yrs after Entry),Low Income Students (10 Yrs after Entry),Medium Income Students (10 Yrs after Entry),High Income Students (10 Yrs after Entry),Mean Earnings Low (10 Yrs after Entry),Mean Earnings Medium (10 Yrs after Entry),Mean Earnings High (10 Yrs after Entry)
0,Alabama A & M University,100654,AL,Public,99.60,7599.0,5090.0,,39.78,60.22,...,38500.0,32600.0,25400.0,0.599,459.0,337.0,166.0,33000.0,37300.0,39500.0
1,University of Alabama at Birmingham,100663,AL,Public,76.19,11380.0,13549.0,,38.16,61.84,...,57400.0,43200.0,39200.0,0.747,1304.0,911.0,649.0,47000.0,49500.0,49300.0
2,University of Alabama in Huntsville,100706,AL,Public,67.02,9697.0,7825.0,,58.91,41.09,...,58700.0,46000.0,33600.0,0.779,682.0,454.0,378.0,47000.0,55500.0,55100.0
3,Alabama State University,100724,AL,Public,67.97,7194.0,3603.0,,36.05,63.95,...,33000.0,28300.0,21400.0,0.528,1519.0,548.0,151.0,29000.0,32500.0,34300.0
4,The University of Alabama,100751,AL,Public,77.07,10349.0,30610.0,,44.17,55.83,...,59100.0,45100.0,42500.0,0.786,1424.0,1545.0,2024.0,45500.0,51600.0,55800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984,Arizona College of Nursing-Phoenix,495457,AZ,Private ForProfit,,,85.0,,10.59,89.41,...,,,,,,,,,,
1985,The Pennsylvania State University,495767,PA,Public,82.80,10822.0,73189.0,,53.67,46.33,...,,,,,,,,,,
1986,Pathways College,495916,CA,Private NonProfit,,,13.0,,46.15,53.85,...,,,,,,,,,,
1987,Provo College-Idaho Falls Campus,496283,ID,Private ForProfit,,4583.0,16.0,,0.00,100.00,...,,,,,,,,,,


In [19]:
# write to csv
college_df.to_csv("./college_df.csv")