In [1]:
from typing import List, Optional

import numpy as np
import pandas as pd

Data Sources:
- https://bmccprodstroac.blob.core.windows.net/uploads/2023/05/Institutional-Research-Fact-Sheet-Fall-2022-01.pdf
- https://bmccprodstroac.blob.core.windows.net/uploads/ported/iresearch/upload/IEReportCardWebJULY2018.pdf
- https://nces.ed.gov/collegenavigator/?q=190521&s=all&id=190521
- Tableau: https://public.tableau.com/app/profile/bmcc.oiea/viz/BMCCDataDashboards/Welcome

In [None]:
def simulate_student_profile(
        freshman: float,  # probability of being a freshman
        first_generation: float,  # probability of being a first-generation college student
        full_time: float,  # probability of full-time enrollment
        cs_degree: float,  # probability of majoring in Computer Science
        pell_grant: float,  # probability of receiving a Pell grant
        employed: float,  # probability of being employed
        age_mean: float,  # mean age
        gpa_mean: float,  # mean GPA
        days_missed: float,  # mean number of school days missed
        borough_p: List[float],  # bx, bk, mn, qu, si, out [0.14, 0.21, 0.12, 0.13, 0.02, 0.38]
        grade: str,  # grade label applied to every simulated student
        n: int,
        seed: Optional[int] = None) -> pd.DataFrame:
    """Simulate ``n`` student records that all share the same ``grade`` label.

    Every column is drawn independently of the others:

    * ``borough`` -- categorical draw over Bronx, Brooklyn, Manhattan, Queens,
      Staten Island and Outside NYC, weighted by ``borough_p`` (in that order).
    * ``age``, ``gpa``, ``days_missed`` -- normal draws centred on ``age_mean``,
      ``gpa_mean`` and ``days_missed`` with fixed standard deviations of
      4, 0.5 and 4 respectively.  The draws are not clipped or rounded, so
      individual values can fall outside realistic ranges (e.g. a negative
      number of days missed).
    * ``major`` -- "Computer Science" with probability ``cs_degree``, otherwise
      "Computer Information Systems".
    * ``full_time``, ``freshman``, ``first_generation``, ``employed``,
      ``pell_grant`` -- 0/1 Bernoulli draws with the given probabilities.
    * ``grade`` -- the constant ``grade`` string.

    Args:
        freshman, first_generation, full_time, cs_degree, pell_grant, employed:
            Probabilities in [0, 1].
        age_mean, gpa_mean, days_missed: Means of the normal distributions.
        borough_p: Six probabilities that sum to 1 (within float tolerance).
        grade: Label written to the ``grade`` column of every row.
        n: Number of rows to simulate.
        seed: Optional seed.  When given, a private ``RandomState`` is used so
            the result is reproducible and the global NumPy random state is
            left untouched.  When ``None`` (default) the global ``np.random``
            state is used, exactly as before.

    Returns:
        A DataFrame with ``n`` rows and the columns listed above, in that order.

    Raises:
        AssertionError: If ``borough_p`` does not sum to 1.
    """
    boroughs = ["Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island", "Outside NYC"]

    # Probabilities are floats, so compare the total against 1 with a small
    # tolerance; an exact `== 1` check can reject valid inputs whose sum is
    # off only by round-off error.
    borough_total = float(sum(borough_p))
    assert np.isclose(borough_total, 1.0, rtol=0.0, atol=1e-8), (
        f"borough_p must sum to 1, got {borough_total!r}")

    # The np.random module and a RandomState instance expose the same
    # choice/normal/binomial API, so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    majors = ["Computer Science", "Computer Information Systems"]
    major_p = [cs_degree, 1 - cs_degree]

    # The dict preserves insertion order, which fixes both the column order
    # of the frame and the order in which random numbers are consumed.
    return pd.DataFrame({
        "borough": rng.choice(boroughs, n, p=borough_p),
        "age": rng.normal(age_mean, 4, n),
        "gpa": rng.normal(gpa_mean, 0.5, n),
        "days_missed": rng.normal(days_missed, 4, n),
        "major": rng.choice(majors, n, p=major_p),
        "full_time": rng.binomial(1, full_time, n),
        "freshman": rng.binomial(1, freshman, n),
        "first_generation": rng.binomial(1, first_generation, n),
        "employed": rng.binomial(1, employed, n),
        "pell_grant": rng.binomial(1, pell_grant, n),
        "grade": grade,
    })


In [12]:
# Overall stats:
# 17k students
# 38.6% DWFI rate in gateway courses
# 55% pell
# 29% TAP
# 22% first time freshmen
# 70% FT
# 63% freshmen, 37% sophomore

# Seed the global NumPy generator so that re-running the notebook
# regenerates exactly the same simulated cohort.
np.random.seed(2023)

students = 17000

# Split the cohort into passing / DWFI groups. The DWFI count is derived
# from the passing count so the two groups always add up to `students`.
n_passing = round(students * 0.62)
n_dwfi = students - n_passing

passing_grade = simulate_student_profile(
    freshman=0.7,
    first_generation=0.62,
    full_time=0.7,
    cs_degree=0.3,
    pell_grant=0.5,
    employed=0.75,
    age_mean=23,
    gpa_mean=3.0,
    days_missed=6,
    borough_p=[0.14, 0.21, 0.12, 0.13, 0.02, 0.38],
    grade="P",
    n=n_passing)

dwfi_grade = simulate_student_profile(
    freshman=0.65,
    first_generation=0.67,
    full_time=0.6,
    cs_degree=0.2,
    pell_grant=0.53,
    employed=0.81,
    age_mean=22,
    gpa_mean=2.8,
    days_missed=14,
    borough_p=[0.12, 0.10, 0.10, 0.11, 0.02, 0.55],
    grade="DWFI",
    n=n_dwfi)

# ignore_index=True gives the combined frame a fresh 0..N-1 index instead of
# repeating each group's own 0-based row labels.
all_students = pd.concat([passing_grade, dwfi_grade], ignore_index=True)

In [13]:
# Persist the simulated cohort as CSV; the DataFrame index is left out of the file.
all_students.to_csv(path_or_buf="students_sim.csv", index=False)