In [1]:
import pandas as pd
import numpy as np
import random
# Let pandas display up to 100 columns before it starts eliding them.
pd.options.display.max_columns = 100

In [2]:
# Seed NumPy's global generator so the np.random draws are reproducible
np.random.seed(42)

# Number of synthetic records to generate
num_records = 3587

# Generate synthetic 5-digit student numbers (10000-99999 inclusive).
# np.random.randint excludes `high`, so the upper bound has to be 100000
# for 99999 to be a possible value.
# NOTE(review): values are drawn with replacement, so numbers can repeat —
# confirm whether student numbers are expected to be unique.
student_studentnumber = np.random.randint(10000, 100000, num_records)
print(student_studentnumber.shape)
student_studentnumber

(3587,)


array([25795, 10860, 86820, ..., 18244, 24743, 56101])

In [3]:
# High school graduation dates: every calendar day from 2015-01-01 through
# 2024-12-31 is a candidate, and one date is drawn per record. No weights are
# passed, so the draw is uniform and with replacement.
grad_dates = pd.date_range("2015-01-01", "2024-12-31", freq="D")
high_school_grad_date = np.random.choice(grad_dates, size=num_records)
print(high_school_grad_date.shape)
high_school_grad_date

(3587,)


array(['2017-11-19T00:00:00.000000000', '2022-07-13T00:00:00.000000000',
       '2021-04-26T00:00:00.000000000', ...,
       '2018-10-22T00:00:00.000000000', '2024-04-12T00:00:00.000000000',
       '2015-05-24T00:00:00.000000000'], dtype='datetime64[ns]')

In [4]:
# College graduation flag: each record gets NaN, 0 or 1, weighted by the
# counts 1700 / 1502 / 385 out of 3587 (the three weights sum to 1).
graduated_college_outcomes = [np.nan, 0, 1]
graduated_college_weights = [1700/3587, 1502/3587, 385/3587]
graduated_college = np.random.choice(
    graduated_college_outcomes,
    size=num_records,
    p=graduated_college_weights,
)
print(graduated_college.shape)
graduated_college

(3587,)


array([ 0.,  0.,  0., ...,  0., nan, nan])

In [5]:
# Cohort distribution: cohort year -> record count used as its sampling weight.
# NOTE(review): cohort is drawn independently of high_school_grad_date, so the
# two columns need not agree for a given record — confirm this is intended.
cohort_counts = {
    2024: 437,
    2020: 402,
    2023: 395,
    2022: 371,
    2021: 365,
    2018: 344,
    2019: 342,
    2016: 320,
    2017: 317,
    2015: 294,
}
cohort_values = list(cohort_counts.keys())
# Despite the name, cohort_probs holds raw counts; they are normalised below.
cohort_probs = list(cohort_counts.values())
cohort_weights = np.array(cohort_probs) / sum(cohort_probs)
cohort = np.random.choice(cohort_values, size=num_records, p=cohort_weights)
print(cohort.shape)
cohort

(3587,)


array([2023, 2024, 2020, ..., 2020, 2024, 2017])