Patrick Marshall and Patrick Rafferty

Introduction to Data Science

Final Project Notebook

7 March 2024


# Objective

Out of the features collected by the Department of Education in the "College Scorecard" dataset, which are the most influential on predicting student debt?

# Imports

In [1]:
# Imports
import urllib.request
import zipfile

import pandas as pd
from scipy import stats
from scipy.stats import kruskal
from sklearnex import ensemble
from sklearn import model_selection

# Data Loading

In [2]:
# Retrieve Data Zip File
if not zipfile.is_zipfile("data/CollegeScorecard_Raw_Data_09262023.zip"):
    urllib.request.urlretrieve("https://ed-public-download.app.cloud"
                               ".gov/downloads/CollegeScorecard_Raw_Data_09262023.zip",
                               "data/CollegeScorecard_Raw_Data_09262023.zip")

In [3]:
# Using Pandas' built-in zip support does not work since our zip file has more than one file 
# archived inside.
with (zipfile.ZipFile("data/CollegeScorecard_Raw_Data_09262023.zip").open(
        "data/Most-Recent-Cohorts-Institution.csv") as data_csv):
    college_scorecard_institution = pd.read_csv(data_csv, na_values=("NULL", "PrivacySuppressed"),
                                                true_values=["1"], false_values=["0"],
                                                index_col="UNITID", engine="pyarrow").dropna(
        axis="columns", how="all").convert_dtypes()
college_scorecard_institution.head(10)

Unnamed: 0_level_0,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,SCH_DEG,...,OMAWDP8_NOPELL_FIRSTTIME,OMENRUP_NOPELL_FIRSTTIME,OMENRYP_NOPELL_NOTFIRSTTIME,OMENRAP_NOPELL_NOTFIRSTTIME,OMAWDP8_NOPELL_NOTFIRSTTIME,OMENRUP_NOPELL_NOTFIRSTTIME,OMACHT8_NOPELL_ALL,OMACHT8_NOPELL_FIRSTTIME,OMACHT8_NOPELL_NOTFIRSTTIME,ADDR
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,100200,1002,Alabama A & M University,Normal,AL,35762,Southern Association of Colleges and Schools C...,www.aamu.edu/,www.aamu.edu/admissions-aid/tuition-fees/net-p...,3,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,Southern Association of Colleges and Schools C...,https://www.uab.edu/,https://tcc.ruffalonl.com/University of Alabam...,3,...,0.6937,0.066,0.0111,0.2636,0.5136,0.2117,2358,1182,1176,Administration Bldg Suite 1070
100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,Southern Association of Colleges and Schools C...,https://www.amridgeuniversity.edu/,https://www2.amridgeuniversity.edu:9091/,3,...,0.0,0.5,0.0,0.3333,0.4583,0.2083,26,2,24,1200 Taylor Rd
100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,Southern Association of Colleges and Schools C...,www.uah.edu/,finaid.uah.edu/,3,...,0.6471,0.0941,0.0082,0.2647,0.5948,0.1324,1122,510,612,301 Sparkman Dr
100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,Southern Association of Colleges and Schools C...,www.alasu.edu/,www.alasu.edu/cost-aid/tuition-costs/net-price...,3,...,0.4381,0.2167,0.0,0.1444,0.3667,0.4889,510,420,90,915 S Jackson Street
100751,105100,1051,The University of Alabama,Tuscaloosa,AL,35487-0100,Southern Association of Colleges and Schools C...,www.ua.edu/,financialaid.ua.edu/net-price-calculator/,3,...,0.7532,0.0433,0.0096,0.2351,0.6292,0.1261,6774,5315,1459,739 University Blvd
100760,100700,1007,Central Alabama Community College,Alexander City,AL,35010,Southern Association of Colleges and Schools C...,www.cacc.edu/,https://www.cacc.edu/net-price-calculator/,2,...,0.3264,0.5397,0.0,0.1408,0.2958,0.5634,310,239,71,1675 Cherokee Rd
100812,100800,1008,Athens State University,Athens,AL,35611,Southern Association of Colleges and Schools C...,www.athens.edu/,www.athens.edu/financial-aid/net-price-calcula...,3,...,,,0.0133,0.1433,0.6219,0.2216,677,0,677,300 N Beaty St
100830,831000,8310,Auburn University at Montgomery,Montgomery,AL,36117-3596,Southern Association of Colleges and Schools C...,www.aum.edu/,www.aum.edu/current-students/financial-informa...,3,...,0.4058,0.3037,0.0114,0.2576,0.4962,0.2348,646,382,264,7440 East Drive
100858,100900,1009,Auburn University,Auburn,AL,36849,Southern Association of Colleges and Schools C...,www.auburn.edu/,https://www.auburn.edu/admissions/costcalc/fre...,3,...,0.8171,0.044,0.0055,0.1953,0.7347,0.0644,4154,3253,901,"Auburn, Alabama"


In [4]:
# Using Pandas' built-in zip support does not work since our zip file has more than one file 
# archived inside.
with (zipfile.ZipFile("data/CollegeScorecard_Raw_Data_09262023.zip").open(
        "data/Most-Recent-Cohorts-Field-of-Study.csv") as data_csv):
    college_scorecard_field_of_study = pd.read_csv(data_csv,
                                                   na_values=("NULL", "PrivacySuppressed"),
                                                   true_values=["1"], false_values=["0"],
                                                   engine="pyarrow").dropna(axis="columns",
                                                                            how="all").convert_dtypes()
college_scorecard_field_of_study.head(10)

Unnamed: 0,UNITID,OPEID6,INSTNM,CONTROL,MAIN,CIPCODE,CIPDESC,CREDLEV,CREDDESC,IPEDSCOUNT1,...,BBRR4_FED_COMP_N,BBRR4_FED_COMP_DFLT,BBRR4_FED_COMP_DLNQ,BBRR4_FED_COMP_FBR,BBRR4_FED_COMP_DFR,BBRR4_FED_COMP_NOPROG,BBRR4_FED_COMP_MAKEPROG,BBRR4_FED_COMP_PAIDINFULL,BBRR4_FED_COMP_DISCHARGE,DISTANCE
0,100654,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelor's Degree,,...,,,,,,,,,,0
1,100654,1002,Alabama A & M University,Public,1,101,Agricultural Business and Management.,3,Bachelor's Degree,,...,,,,,,,,,,0
2,100654,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelor's Degree,6.0,...,,,,,,,,,,1
3,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelor's Degree,7.0,...,,,,,,,,,,1
4,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,8.0,...,12.0,,,,,,,,,1
5,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,6,Doctoral Degree,2.0,...,,,,,,,,,,1
6,100654,1002,Alabama A & M University,Public,1,111,Plant Sciences.,3,Bachelor's Degree,,...,,,,,,,,,,0
7,100654,1002,Alabama A & M University,Public,1,199,"Agriculture, Agriculture Operations, and Relat...",3,Bachelor's Degree,2.0,...,,,,,,,,,,1
8,100654,1002,Alabama A & M University,Public,1,199,"Agriculture, Agriculture Operations, and Relat...",5,Master's Degree,2.0,...,,,,,,,,,,1
9,100654,1002,Alabama A & M University,Public,1,199,"Agriculture, Agriculture Operations, and Relat...",6,Doctoral Degree,2.0,...,,,,,,,,,,1


In [5]:
joined_college_scorecard = (
    college_scorecard_field_of_study.join(other=college_scorecard_institution, on="UNITID",
                                          how="left", lsuffix="_field_of_study",
                                          rsuffix="_institution", validate="many_to_one"))
del college_scorecard_field_of_study
del college_scorecard_institution
joined_college_scorecard

Unnamed: 0,UNITID,OPEID6_field_of_study,INSTNM_field_of_study,CONTROL_field_of_study,MAIN_field_of_study,CIPCODE,CIPDESC,CREDLEV,CREDDESC,IPEDSCOUNT1,...,OMAWDP8_NOPELL_FIRSTTIME,OMENRUP_NOPELL_FIRSTTIME,OMENRYP_NOPELL_NOTFIRSTTIME,OMENRAP_NOPELL_NOTFIRSTTIME,OMAWDP8_NOPELL_NOTFIRSTTIME,OMENRUP_NOPELL_NOTFIRSTTIME,OMACHT8_NOPELL_ALL,OMACHT8_NOPELL_FIRSTTIME,OMACHT8_NOPELL_NOTFIRSTTIME,ADDR
0,100654,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelor's Degree,,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
1,100654,1002,Alabama A & M University,Public,1,101,Agricultural Business and Management.,3,Bachelor's Degree,,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
2,100654,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelor's Degree,6,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
3,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelor's Degree,7,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
4,100654,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,8,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233974,,42640,California Southern University,"Private, for-profit",1,4228,"Clinical, Counseling and Applied Psychology.",7,First Professional Degree,,...,,,,,,,,,,
233975,,42640,California Southern University,"Private, for-profit",1,5138,"Registered Nursing, Nursing Administration, Nu...",3,Bachelor's Degree,,...,,,,,,,,,,
233976,,42924,Body Wisdom Massage Therapy School,"Private, for-profit",1,5135,Somatic Bodywork and Related Therapeutic Servi...,1,Undergraduate Certificate or Diploma,,...,,,,,,,,,,
233977,,42961,Madera Community College,Public,1,2401,"Liberal Arts and Sciences, General Studies and...",2,Associate's Degree,,...,,,,,,,,,,


We will convert any boolean variables to have a boolean dtype.

In [6]:
for col in joined_college_scorecard.columns:
    unique_cols = joined_college_scorecard[col].unique()
    if len(unique_cols) <= 3 and 0 in unique_cols and 1 in unique_cols:
        #print(f"Converting {col} to boolean")
        joined_college_scorecard[col] = joined_college_scorecard[col].astype("boolean")

joined_college_scorecard

Unnamed: 0,UNITID,OPEID6_field_of_study,INSTNM_field_of_study,CONTROL_field_of_study,MAIN_field_of_study,CIPCODE,CIPDESC,CREDLEV,CREDDESC,IPEDSCOUNT1,...,OMAWDP8_NOPELL_FIRSTTIME,OMENRUP_NOPELL_FIRSTTIME,OMENRYP_NOPELL_NOTFIRSTTIME,OMENRAP_NOPELL_NOTFIRSTTIME,OMAWDP8_NOPELL_NOTFIRSTTIME,OMENRUP_NOPELL_NOTFIRSTTIME,OMACHT8_NOPELL_ALL,OMACHT8_NOPELL_FIRSTTIME,OMACHT8_NOPELL_NOTFIRSTTIME,ADDR
0,100654,1002,Alabama A & M University,Public,True,100,"Agriculture, General.",3,Bachelor's Degree,,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
1,100654,1002,Alabama A & M University,Public,True,101,Agricultural Business and Management.,3,Bachelor's Degree,,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
2,100654,1002,Alabama A & M University,Public,True,109,Animal Sciences.,3,Bachelor's Degree,6,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
3,100654,1002,Alabama A & M University,Public,True,110,Food Science and Technology.,3,Bachelor's Degree,7,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
4,100654,1002,Alabama A & M University,Public,True,110,Food Science and Technology.,5,Master's Degree,8,...,0.3187,0.2709,0.0128,0.2949,0.4744,0.2179,329,251,78,4900 Meridian Street
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233974,,42640,California Southern University,"Private, for-profit",True,4228,"Clinical, Counseling and Applied Psychology.",7,First Professional Degree,,...,,,,,,,,,,
233975,,42640,California Southern University,"Private, for-profit",True,5138,"Registered Nursing, Nursing Administration, Nu...",3,Bachelor's Degree,,...,,,,,,,,,,
233976,,42924,Body Wisdom Massage Therapy School,"Private, for-profit",True,5135,Somatic Bodywork and Related Therapeutic Servi...,1,Undergraduate Certificate or Diploma,,...,,,,,,,,,,
233977,,42961,Madera Community College,Public,True,2401,"Liberal Arts and Sciences, General Studies and...",2,Associate's Degree,,...,,,,,,,,,,


We will pull out the data from MSOE so it can be used for testing later without target leakage.

In [7]:
if "msoe_data" not in globals():
    msoe_data = joined_college_scorecard[
        joined_college_scorecard["INSTNM_field_of_study"] == "Milwaukee School of Engineering"]
joined_college_scorecard.drop(index=joined_college_scorecard[
    joined_college_scorecard["INSTNM_field_of_study"] == "Milwaukee School of Engineering"].index,
                              inplace=True)
msoe_data

Unnamed: 0,UNITID,OPEID6_field_of_study,INSTNM_field_of_study,CONTROL_field_of_study,MAIN_field_of_study,CIPCODE,CIPDESC,CREDLEV,CREDDESC,IPEDSCOUNT1,...,OMAWDP8_NOPELL_FIRSTTIME,OMENRUP_NOPELL_FIRSTTIME,OMENRYP_NOPELL_NOTFIRSTTIME,OMENRAP_NOPELL_NOTFIRSTTIME,OMAWDP8_NOPELL_NOTFIRSTTIME,OMENRUP_NOPELL_NOTFIRSTTIME,OMACHT8_NOPELL_ALL,OMACHT8_NOPELL_FIRSTTIME,OMACHT8_NOPELL_NOTFIRSTTIME,ADDR
192794,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,909,"Public Relations, Advertising, and Applied Com...",3,Bachelor's Degree,1.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192795,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1107,Computer Science.,3,Bachelor's Degree,,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192796,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1108,Computer Software and Media Applications.,1,Undergraduate Certificate or Diploma,1.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192797,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1401,"Engineering, General.",3,Bachelor's Degree,2.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192798,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1401,"Engineering, General.",5,Master's Degree,9.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192799,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1404,Architectural Engineering.,3,Bachelor's Degree,23.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192800,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1404,Architectural Engineering.,5,Master's Degree,8.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192801,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1405,Biomedical/Medical Engineering.,3,Bachelor's Degree,44.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192802,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1407,Chemical Engineering.,3,Bachelor's Degree,18.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway
192803,239318,3868,Milwaukee School of Engineering,"Private, nonprofit",True,1408,Civil Engineering.,3,Bachelor's Degree,16.0,...,0.6599,0.1059,0.0,0.097,0.6667,0.2364,609,444,165,1025 N Broadway


We will also do the same for two neighboring universities.

In [8]:
if "marquette_data" not in globals():
    marquette_data = joined_college_scorecard[
        joined_college_scorecard["INSTNM_field_of_study"] == "Marquette University"]
joined_college_scorecard.drop(index=joined_college_scorecard[
    joined_college_scorecard["INSTNM_field_of_study"] == "Marquette University"].index,
                              inplace=True)

In [9]:
if "uwm_data" not in globals():
    uwm_data = joined_college_scorecard[
        joined_college_scorecard["INSTNM_field_of_study"] == "University of Wisconsin-Milwaukee"]
joined_college_scorecard.drop(index=joined_college_scorecard[
    joined_college_scorecard["INSTNM_field_of_study"] == "University of Wisconsin-Milwaukee"].index,
                              inplace=True)

# Debt Column Key
The dataset has many columns. This section serves as a key to the different ways debt is recorded in the dataset.

Format: `DEBT_[Disaggregation]_[Loan Type]_[Method]_[Metric]`
## Disaggregation
- MALE: Men only
- NOTMALE: Women only
- PELL: People who got Pell grants.
- NOPELL: People who did not receive Pell grants.
- ALL: All graduates, no disaggregation
- STGP: Stafford and Graduate Plus loans
- PP: Parent Plus loans
## Method
- EVAL: Debts originated only at the specified institution.
- ANY: Debts originated at any institution attended by graduates.
## Metric
- N: number of grads with loans
- MDN: Median loan amount dispersed.
- MEAN: Mean loan amount dispersed.

In [10]:
pd.DataFrame(joined_college_scorecard[
                 filter(lambda label: "DEBT" in label, joined_college_scorecard.columns)].columns)

Unnamed: 0,0
0,DEBT_ALL_STGP_ANY_N
1,DEBT_ALL_STGP_ANY_MEAN
2,DEBT_ALL_STGP_ANY_MDN
3,DEBT_ALL_STGP_EVAL_N
4,DEBT_ALL_STGP_EVAL_MEAN
...,...
146,PLUS_DEBT_ALL_STAFFANY_MD
147,PLUS_DEBT_INST_NOSTAFFANY_N
148,PLUS_DEBT_INST_NOSTAFFANY_MD
149,PLUS_DEBT_ALL_NOSTAFFANY_N


# Hypothesis Testing

Before attempting to make a machine learning model, we will check our data to make sure that there is correlations between the various numerical and categorical variables, and the most broad form of student debt recorded.

In [11]:
pd.DataFrame(joined_college_scorecard.dtypes.unique())

Unnamed: 0,0
0,Int64
1,string[python]
2,boolean
3,Float64


Our first test will check for linear relationships between the numerical variables in the dataset and the target debt column we selected.

In [12]:
numeric_variables = list(filter(lambda
                                    col: not "DEBT" in col and not "BRR" in col and not "CDR" in col and not "LP" in col and not "OPEID" in col and not "UNITID" in col and not "LOAN" in col,
                                joined_college_scorecard.select_dtypes(
                                    include=("Int64", "Float64")).columns))

dat = {}

for column in numeric_variables:
    nonna_subset = joined_college_scorecard[[column, "DEBT_ALL_STGP_ANY_MDN"]].dropna()
    if len(nonna_subset >= 2) and nonna_subset[column].nunique() > 1:
        res = stats.pearsonr(nonna_subset["DEBT_ALL_STGP_ANY_MDN"], nonna_subset[column])
        if res[1] < 0.05:
            dat[column] = {"R": res[0], "p-value": res[1]}

df = pd.DataFrame.from_dict(dat, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'index': 'Column', 'R': 'Statistic'}, inplace=True)
df

Unnamed: 0,Column,Statistic,p-value
0,CREDLEV,0.560530,0.000000e+00
1,IPEDSCOUNT1,-0.140444,2.625938e-169
2,IPEDSCOUNT2,-0.129023,4.772835e-140
3,EARN_COUNT_NWNE_HI_1YR,-0.016052,1.424259e-03
4,EARN_CNTOVER150_HI_1YR,0.116071,1.032957e-90
...,...,...,...
1744,OMAWDP8_NOPELL_NOTFIRSTTIME,0.068138,5.791360e-38
1745,OMENRUP_NOPELL_NOTFIRSTTIME,-0.027757,1.588131e-07
1746,OMACHT8_NOPELL_ALL,-0.107071,3.972342e-92
1747,OMACHT8_NOPELL_FIRSTTIME,-0.118420,2.110588e-112


For the categorical variables, we will use ANOVA to compare them with debt.

In [13]:
# Identify categorical variables
categorical_vars = joined_college_scorecard.select_dtypes(include=['category', 'boolean']).columns

# Drop missing values from relevant columns
joined_college_scorecard_clean = joined_college_scorecard.dropna(
    subset=['DEBT_ALL_STGP_ANY_MDN'] + list(categorical_vars))

# Initialize an empty list to store p-values
p_values = []

# Perform Kruskal-Wallis test for each categorical variable
for var in categorical_vars:
    unique_values = joined_college_scorecard_clean[var].unique()
    if len(unique_values) < 2:
        # Skip variables with only one unique value
        continue

    samples_by_group = []
    for val in unique_values:
        mask = joined_college_scorecard_clean[var] == val
        samples_by_group.append(joined_college_scorecard_clean['DEBT_ALL_STGP_ANY_MDN'][mask])

    stat, p = kruskal(*samples_by_group)
    if p < 0.05:
        p_values.append((var, p))

# Convert the list of tuples to a DataFrame for better visualization
p_values_df = pd.DataFrame(p_values, columns=['Variable', 'p-value'])

p_values_df

Unnamed: 0,Variable,p-value
0,HCM2,4.1463859999999996e-20
1,HBCU,9.143728e-218
2,ANNHI,0.005869762
3,AANAPII,2.4784600000000002e-60
4,HSI,4.5601830000000005e-222
5,NANTI,1.397334e-08
6,MENONLY,6.055642e-07
7,WOMENONLY,1.7327589999999999e-19
8,CIP01CERT4,2.726374e-13
9,CIP04CERT1,7.487358e-06


# Machine Learning

This attempts to train a model to see if random forest will work.

In [14]:
temp_subset = joined_college_scorecard.dropna(subset=["DEBT_ALL_STGP_ANY_MDN"])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    temp_subset[[*numeric_variables, *categorical_vars]], temp_subset["DEBT_ALL_STGP_ANY_MDN"],
    random_state=42)

model = ensemble.RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6613164386754449

In [15]:
feature_importances = pd.Series(model.feature_importances_,
                                index=[*numeric_variables, *categorical_vars]).sort_values(
    ascending=False)
feature_importances

CREDLEV                            0.302561
AGE_ENTRY                          0.114891
HI_INC_RPY_1YR_RT                  0.047421
TUITIONFEE_IN                      0.036420
EARN_NOMALE_WNE_MDN_1YR            0.036115
                                     ...   
POOLYRS200                         0.000000
FEMALE_UNKN_4YR_TRANS_YR6_RT       0.000000
CIP16CERT2                         0.000000
FEMALE_UNKN_ORIG_YR4_RT            0.000000
NOT1STGEN_ENRL_2YR_TRANS_YR6_RT    0.000000
Length: 1935, dtype: float64

Attempts prediction with data from MSOE. Expected values are at https://collegescorecard.ed.gov/school/?239318-Milwaukee-School-of-Engineering

In [16]:
msoe_predictions = model.predict(msoe_data[[*numeric_variables, *categorical_vars]])
msoe_predictions

array([27000., 40375., 15000., 27000., 27000., 27000., 28000., 27000.,
       27000., 27000., 27000., 31000., 27639., 27000., 27639., 26054.,
       30697., 27000., 27000., 28000., 30697., 27000., 27000., 27639.,
       27000., 28000., 30697., 27000., 19338., 28000., 28000., 27250.,
       27250., 28000., 27000., 28000., 27250., 28000., 27250., 27250.])

Attempts prediction with data from Marquette. Expected values are at https://collegescorecard.ed.gov/school/?239105-Marquette-University

In [17]:
marquette_predictions = model.predict(marquette_data[[*numeric_variables, *categorical_vars]])
marquette_predictions

array([25028., 25028., 25028., 26884., 25074., 27000., 26884., 26000.,
       23250., 22903., 26694., 25074., 22903., 22903., 22903., 25028.,
       20000., 25074., 20000., 25074., 26000., 22250., 26664., 22250.,
       28818., 26843., 25074., 25074., 27334., 25074., 25074., 28000.,
       26664., 25074., 25074., 28818., 28818., 22250., 27334., 22250.,
       22250., 27000., 22250., 25074., 28818., 23250., 28818., 20792.,
       22250., 28818., 25074., 25074., 26137., 28818., 26000., 25074.,
       25074., 27000., 27000., 27000., 25074., 22250., 26949., 27000.,
       27000., 22250., 22250., 27000., 22250., 25074., 27000., 22250.,
       22250., 22250., 22250., 27000., 22250., 27000., 25074., 22250.,
       25074., 28281., 22250., 22250., 27000., 22250., 27000., 27000.,
       27000., 27000., 27000., 27000., 26000., 22250., 22250., 22250.,
       27000., 27000., 26500., 27000., 27000., 27000., 27000., 22250.,
       27000., 22250., 27000., 22250., 24250., 27000., 27000., 27000.,
      

Attempts prediction with data from UWM. Expected values are at https://collegescorecard.ed.gov/school/?240453-University-of-Wisconsin-Milwaukee

In [18]:
uwm_predictions = model.predict(uwm_data[[*numeric_variables, *categorical_vars]])
uwm_predictions

array([19125., 22746., 25000., 20500., 22641., 22959., 20500., 22673.,
       22527., 19966., 22527., 22527., 22673., 20500., 22673., 26500.,
       20500., 20500., 20500., 33518., 22527., 20500., 20500., 22527.,
       22527., 22527., 22527., 22959., 20500., 20500., 17922., 25000.,
       25000., 17922., 23667., 24201., 31000., 22527., 22527., 27000.,
       23405., 24701., 25000., 20500., 19966., 17922., 20500., 24617.,
       20500., 20500., 27816., 20500., 17922., 23667., 20500., 22527.,
       25499., 22527., 28537., 27000., 19966., 17922., 27000., 22527.,
       20500., 27000., 25499., 20500., 24617., 20500., 20500., 20500.,
       23667., 17922., 20500., 20500., 24875., 20500., 17922., 22527.,
       22527., 20500., 17032., 19500., 22527., 22851., 17032., 26642.,
       12250., 24938., 22527., 20500., 24250., 22527., 17032., 26000.,
       25000., 20500., 17032., 23183., 20500., 22527., 22527., 20500.,
       22527., 20500., 20500., 17032., 22527., 20500., 22527., 17032.,
      

We finish off by exporting the model to a pickle.

In [19]:
import pickle

with open("data/model.pickle", "wb") as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)