In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

In [6]:
# data obtained from Illinois' Division of Management Information: https://dmi.illinois.edu/stuenr/
fall_enroll_race = pd.read_excel("fall_enroll_by_race.xlsx")
fall_enroll_race.head()

Unnamed: 0,Fall Term,Level,Total,Men,Women,Unknown,Caucasian,Asian American,African American,Hispanic,Native American,Hawaiian/Pacific Isl,Multiracial,International,Unknown.1,All African American,All Native American,All Hawaiian/ Pac Isl,All Asian
0,,,,,,,,,,,,,,,,,,,
1,2023.0,***Campus total***,56403.0,30233.0,26012.0,158.0,20660.0,10582.0,2805.0,6197.0,25.0,25.0,1778.0,12541.0,1790.0,3576.0,590.0,178.0,12121.0
2,2023.0,Undergraduate,35467.0,18715.0,16722.0,30.0,13602.0,7994.0,1954.0,4838.0,10.0,13.0,1361.0,5204.0,491.0,2506.0,407.0,124.0,9215.0
3,2023.0,Graduate,19583.0,11051.0,8441.0,91.0,6263.0,2403.0,803.0,1226.0,14.0,11.0,369.0,7257.0,1237.0,1012.0,172.0,45.0,2680.0
4,2023.0,Professional,1353.0,467.0,849.0,37.0,795.0,185.0,48.0,133.0,1.0,1.0,48.0,80.0,62.0,58.0,11.0,9.0,226.0


In [9]:
# student enrollment for campus
ethnicities = ['Caucasian', 'Asian American', 'African American', 'Hispanic', 'Native American', 'Hawaiian/Pacific Isl']
campus_total = fall_enroll_race[fall_enroll_race['Level'] == '***Campus total*** ']
campus_total.head()

Unnamed: 0,Fall Term,Level,Total,Men,Women,Unknown,Caucasian,Asian American,African American,Hispanic,Native American,Hawaiian/Pacific Isl,Multiracial,International,Unknown.1,All African American,All Native American,All Hawaiian/ Pac Isl,All Asian
1,2023.0,***Campus total***,56403.0,30233.0,26012.0,158.0,20660.0,10582.0,2805.0,6197.0,25.0,25.0,1778.0,12541.0,1790.0,3576.0,590.0,178.0,12121.0
6,2022.0,***Campus total***,56644.0,30705.0,25831.0,108.0,21200.0,10095.0,2943.0,6128.0,25.0,29.0,1640.0,12736.0,1848.0,3686.0,572.0,184.0,11482.0
11,2021.0,***Campus total***,56257.0,30619.0,25537.0,101.0,22040.0,9906.0,3013.0,6275.0,25.0,32.0,1671.0,11474.0,1821.0,3833.0,548.0,227.0,11291.0
16,2020.0,***Campus total***,52331.0,28381.0,23892.0,58.0,21627.0,8579.0,3003.0,5911.0,18.0,25.0,1554.0,9363.0,2251.0,3876.0,567.0,219.0,10986.0
21,2019.0,***Campus total***,51196.0,27582.0,23574.0,40.0,21138.0,7894.0,2815.0,5324.0,19.0,28.0,1464.0,10809.0,1705.0,3569.0,487.0,202.0,9928.0


In [11]:
# Testing Hyperparameter: Linear Regression + Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

mae_trn = []
mae_val = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # evaluation metric
    mae_trn.append(mean_absolute_error(y_train, model.predict(X_train)))
    mae_val.append(mean_absolute_error(y_val, model.predict(X_val)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Training MAE': mae_trn,
              'Validation MAE': mae_val})

Unnamed: 0,Ethnicity,Training MAE,Validation MAE
0,Caucasian,625.398356,1433.089425
1,Asian American,386.320151,1396.755054
2,African American,183.353144,281.454381
3,Hispanic,371.895246,624.047434
4,Native American,13.636192,25.730171
5,Hawaiian/Pacific Isl,5.502837,5.357496


In [16]:
from sklearn.linear_model import Lasso

mae_trn = []
mae_val = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = Lasso()
    model.fit(X_train, y_train)

    # evaluation metric
    mae_trn.append(mean_absolute_error(y_train, model.predict(X_train)))
    mae_val.append(mean_absolute_error(y_val, model.predict(X_val)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Training MAE': mae_trn,
              'Validation MAE': mae_val})

Unnamed: 0,Ethnicity,Training MAE,Validation MAE
0,Caucasian,625.401466,1432.85718
1,Asian American,386.289824,1397.027216
2,African American,183.30882,281.726542
3,Hispanic,371.884359,624.319596
4,Native American,13.603533,25.458009
5,Hawaiian/Pacific Isl,5.466593,5.290774


In [24]:
# Testing Hyperparamater: Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

mae_trn = []
mae_val = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = RandomForestRegressor(max_depth = 9)
    model.fit(X_train, y_train)

    # evaluation metric
    mae_trn.append(mean_absolute_error(y_train, model.predict(X_train)))
    mae_val.append(mean_absolute_error(y_val, model.predict(X_val)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Training MAE': mae_trn,
              'Validation MAE': mae_val})

Unnamed: 0,Ethnicity,Training MAE,Validation MAE
0,Caucasian,135.504286,308.773333
1,Asian American,70.570714,1181.52
2,African American,27.981429,119.426667
3,Hispanic,64.244286,399.826667
4,Native American,2.699286,19.95
5,Hawaiian/Pacific Isl,1.986667,5.4


In [31]:
# Testing Hyperparameter: XGBoost Regressor
import xgboost as xgb

mae_trn = []
mae_val = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = xgb.XGBRegressor(learning_rate = 0.1)
    model.fit(X_train, y_train)

    # evaluation metric
    mae_trn.append(mean_absolute_error(y_train, model.predict(X_train)))
    mae_val.append(mean_absolute_error(y_val, model.predict(X_val)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Training MAE': mae_trn,
              'Validation MAE': mae_val})

Unnamed: 0,Ethnicity,Training MAE,Validation MAE
0,Caucasian,2.609794,324.552083
1,Asian American,2.405797,997.284505
2,African American,0.614328,121.945801
3,Hispanic,2.255162,231.0896
4,Native American,0.068037,24.529076
5,Hawaiian/Pacific Isl,0.027271,7.985353


In [35]:
# Best Hyperparamater: Linear Regression
import warnings
warnings.filterwarnings("ignore")

mae_test = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = LinearRegression()
    model.fit(np.concatenate((X_train, X_val), axis=0), np.concatenate((y_train, y_val), axis=0))

    # evaluation metric
    mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Testing MAE': mae_test})

Unnamed: 0,Ethnicity,Testing MAE
0,Caucasian,1216.477632
1,Asian American,727.570396
2,African American,175.16205
3,Hispanic,292.834586
4,Native American,15.709756
5,Hawaiian/Pacific Isl,7.416206


In [36]:
# Best Hyperparamater: Random Forest Regressor (max depth = 5)
mae_test = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = RandomForestRegressor(max_depth = 5)
    model.fit(np.concatenate((X_train, X_val), axis=0), np.concatenate((y_train, y_val), axis=0))

    # evaluation metric
    mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Testing MAE': mae_test})

Unnamed: 0,Ethnicity,Testing MAE
0,Caucasian,522.775556
1,Asian American,427.53
2,African American,55.752778
3,Hispanic,98.686667
4,Native American,3.72
5,Hawaiian/Pacific Isl,3.84


In [37]:
# Best Hyperparamter: XGBoost Regressor (learning rate = 0.1)
mae_test = []

for ethnicity in ethnicities:
    year = []
    students = []
    for row in campus_total.iterrows():
        year.append(int(row[1]['Fall Term']))
        students.append(row[1][ethnicity])
    if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
        year = year[0:14]
        students = students[0:14]
    enroll = {"Year": year,
              "Number of Students": students}
    df = pd.DataFrame(data = enroll)

    # prepare data
    X = df[['Year']][::-1]
    y = df['Number of Students'][::-1]

    # split into training and test set 
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 100)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 100)

    # train model
    model = xgb.XGBRegressor(learning_rate = 0.1)
    model.fit(np.concatenate((X_train, X_val), axis=0), np.concatenate((y_train, y_val), axis=0))

    # evaluation metric
    mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))

pd.DataFrame({'Ethnicity': ethnicities,
              'Testing MAE': mae_test})

Unnamed: 0,Ethnicity,Testing MAE
0,Caucasian,415.287109
1,Asian American,443.22998
2,African American,93.533447
3,Hispanic,251.226074
4,Native American,5.013077
5,Hawaiian/Pacific Isl,5.659305


In [41]:
# Testing Training Size: Linear Regression
import warnings
warnings.filterwarnings("ignore")

sizes = [0.1, 0.25, 0.5, 1]
mae_test_all = []

for size in sizes:
    mae_test = []
    for ethnicity in ethnicities:
        year = []
        students = []
        for row in campus_total.iterrows():
            year.append(int(row[1]['Fall Term']))
            students.append(row[1][ethnicity])
        if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
            year = year[0:14]
            students = students[0:14]
        enroll = {"Year": year,
                "Number of Students": students}
        df = pd.DataFrame(data = enroll)

        # prepare data
        X = df[['Year']][::-1]
        y = df['Number of Students'][::-1]

        # split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = size, random_state = 100)

        # train model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # evaluation metric
        mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))
    mae_test_all.append(mae_test)

pd.DataFrame({'Ethnicity': ethnicities,
              '10%': mae_test_all[0],
              '25%': mae_test_all[1],
              '50%': mae_test_all[2],
              '100%': mae_test_all[3]})

Unnamed: 0,Ethnicity,10%,25%,50%,100%
0,Caucasian,1064.722222,930.320537,1123.404391,2140.789474
1,Asian American,1258.055556,718.148813,1032.806587,1721.526316
2,African American,505.8,204.658824,266.566467,326.105263
3,Hispanic,957.722222,413.027657,593.365868,1299.631579
4,Native American,39.166667,17.899278,17.828144,34.578947
5,Hawaiian/Pacific Isl,12.846154,7.542424,6.998908,12.846154


In [42]:
# Testing Training Size: Random Forest Regressor (max depth = 5)
sizes = [0.1, 0.25, 0.5, 1]
mae_test_all = []

for size in sizes:
    mae_test = []
    for ethnicity in ethnicities:
        year = []
        students = []
        for row in campus_total.iterrows():
            year.append(int(row[1]['Fall Term']))
            students.append(row[1][ethnicity])
        if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
            year = year[0:14]
            students = students[0:14]
        enroll = {"Year": year,
                "Number of Students": students}
        df = pd.DataFrame(data = enroll)

        # prepare data
        X = df[['Year']][::-1]
        y = df['Number of Students'][::-1]

        # split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = size, random_state = 100)

        # train model
        model = RandomForestRegressor(max_depth = 5)
        model.fit(X_train, y_train)

        # evaluation metric
        mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))
    mae_test_all.append(mae_test)

pd.DataFrame({'Ethnicity': ethnicities,
              '10%': mae_test_all[0],
              '25%': mae_test_all[1],
              '50%': mae_test_all[2],
              '100%': mae_test_all[3]})

Unnamed: 0,Ethnicity,10%,25%,50%,100%
0,Caucasian,1919.85,1025.944667,426.632,2140.789474
1,Asian American,1771.772222,972.032,1000.916,1721.526316
2,African American,245.893333,146.642667,130.477,326.105263
3,Hispanic,1328.168333,612.034667,599.8255,1299.631579
4,Native American,27.511111,13.535333,8.73,34.578947
5,Hawaiian/Pacific Isl,12.846154,8.935455,6.274286,12.846154


In [43]:
# Testing Training Size: XGBoost Regressor (learing rate = 0.1)
sizes = [0.1, 0.25, 0.5, 1]
mae_test_all = []

for size in sizes:
    mae_test = []
    for ethnicity in ethnicities:
        year = []
        students = []
        for row in campus_total.iterrows():
            year.append(int(row[1]['Fall Term']))
            students.append(row[1][ethnicity])
        if (ethnicity == 'Hawaiian/Pacific Isl'): # no data for 2004 - 2009
            year = year[0:14]
            students = students[0:14]
        enroll = {"Year": year,
                "Number of Students": students}
        df = pd.DataFrame(data = enroll)

        # prepare data
        X = df[['Year']][::-1]
        y = df['Number of Students'][::-1]

        # split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = size, random_state = 100)

        # train model
        model = xgb.XGBRegressor(learning_rate = 0.1)
        model.fit(X_train, y_train)

        # evaluation metric
        mae_test.append(mean_absolute_error(y_test, model.predict(X_test)))
    mae_test_all.append(mae_test)

pd.DataFrame({'Ethnicity': ethnicities,
              '10%': mae_test_all[0],
              '25%': mae_test_all[1],
              '50%': mae_test_all[2],
              '100%': mae_test_all[3]})

Unnamed: 0,Ethnicity,10%,25%,50%,100%
0,Caucasian,1562.682292,1201.646615,465.49707,2140.789474
1,Asian American,1665.602214,1084.00485,992.773389,1721.526316
2,African American,291.96091,132.271224,120.66438,326.105263
3,Hispanic,1245.22819,660.095866,524.627515,1299.631579
4,Native American,21.285077,15.01431,13.440965,34.578947
5,Hawaiian/Pacific Isl,12.846154,12.450599,11.555214,12.846154
