In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 100)

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

# Number of records
num_records = 3587

# Generate synthetic student numbers (5-digit numbers)
student_studentnumber = np.random.randint(10000, 99999, num_records)
print(student_studentnumber.shape)
student_studentnumber

(3587,)


array([25795, 10860, 86820, ..., 18244, 24743, 56101])

In [3]:
# High School Graduation Dates (Sampled from given distribution)
grad_dates = pd.date_range(start="2015-01-01", end="2024-12-31", freq="D")
high_school_grad_date = np.random.choice(grad_dates, num_records)
print(high_school_grad_date.shape)
high_school_grad_date

(3587,)


array(['2017-11-19T00:00:00.000000000', '2022-07-13T00:00:00.000000000',
       '2021-04-26T00:00:00.000000000', ...,
       '2018-10-22T00:00:00.000000000', '2024-04-12T00:00:00.000000000',
       '2015-05-24T00:00:00.000000000'], dtype='datetime64[ns]')

In [4]:
# Graduated College (Using given proportions)
graduated_college = np.random.choice([np.nan, 0, 1], num_records, p=[1700/3587, 1502/3587, 385/3587])
print(graduated_college.shape)
graduated_college

(3587,)


array([ 0.,  0.,  0., ...,  0., nan, nan])

In [5]:
# Cohort Distribution
cohort_values = [2024, 2020, 2023, 2022, 2021, 2018, 2019, 2016, 2017, 2015]
cohort_probs = [437, 402, 395, 371, 365, 344, 342, 320, 317, 294]
cohort = np.random.choice(cohort_values, num_records, p=np.array(cohort_probs) / sum(cohort_probs))
print(cohort.shape)
cohort

(3587,)


array([2023, 2024, 2020, ..., 2020, 2024, 2017])

In [6]:
# CE Student Distribution
ce_student = np.random.choice([0, 1], num_records, p=[2561/3587, 1026/3587])
print(ce_student.shape)
ce_student

(3587,)


array([0, 0, 0, ..., 0, 0, 0])

In [7]:
# School Names
school_names = ["Highland Ridge High School", "Cedar Valley High School", "Summit Career Academy", "Evergreen Virtual Academy", "Rock Creek Youth Center"]
school_probs = [700, 694, 754, 712, 727]
sch_name = np.random.choice(school_names, num_records, p=np.array(school_probs) / sum(school_probs))
print(sch_name.shape)
sch_name

(3587,)


array(['Summit Career Academy', 'Rock Creek Youth Center',
       'Rock Creek Youth Center', ..., 'Evergreen Virtual Academy',
       'Evergreen Virtual Academy', 'Highland Ridge High School'],
      dtype='<U26')

In [8]:
# Gender Distribution
student_gender = np.random.choice([1, 0], num_records, p=[1860/3587, 1727/3587])
print(student_gender.shape)
student_gender

(3587,)


array([1, 1, 1, ..., 0, 0, 0])

In [11]:
# Race/Ethnicity Distribution
race_ethnicity = ["Hispanic", "Black or African American", "White", "Two or More Races", "Asian", "Other"]
race_probs = [1736, 814, 622, 243, 107, 65]
student_raceethnicity = np.random.choice(race_ethnicity, num_records, p=np.array(race_probs) / sum(race_probs))
print(student_raceethnicity.shape)
student_raceethnicity

(3587,)


array(['White', 'White', 'Hispanic', ..., 'White', 'Hispanic', 'Hispanic'],
      dtype='<U25')

In [13]:
# Special Education
activeenrollment_specialedstatus = np.random.choice([0, 1], num_records, p=[3329/3587, 258/3587])
print(activeenrollment_specialedstatus.shape)
activeenrollment_specialedstatus

(3587,)


array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# Language Proficiency
customstudent_clde_languageproficiency = np.random.choice([0, 1], num_records, p=[3025/3587, 562/3587])
print(customstudent_clde_languageproficiency.shape)
customstudent_clde_languageproficiency

(3587,)


array([0, 0, 1, ..., 0, 0, 0])

In [17]:
# Gifted Status
gifted_student = np.random.choice([0, 1], num_records, p=[3398/3587, 189/3587])
print(gifted_student.shape)
gifted_student

(3587,)
[0 0 0 0 0 0 0 0 0 0]


In [20]:
# Generate 20 unique fake college names
prefixes = ["Western", "Northern", "Southern", "Eastern", "Central", "Great Lakes", 
            "Rocky Mountain", "Pacific", "Midwest", "Lakeshore", "Blue Ridge", "Sunset"]
suffixes = ["University", "College", "Institute", "Academy", "State University", 
            "Polytechnic", "Technology Institute", "Conservatory"]

fake_colleges = list(set(f"{random.choice(prefixes)} {random.choice(suffixes)}" for _ in range(20)))

# Assign a random college from the 20 options to each student
college_assignments = np.random.choice(fake_colleges, 3587)

print(college_assignments.shape)
college_assignments

(3587,)


array(['Western Technology Institute', 'Midwest Conservatory',
       'Rocky Mountain College', ..., 'Pacific University',
       'Sunset Academy', 'Central State University'], dtype='<U28')