In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 100)

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

# Number of records
num_records = 3587

# Generate synthetic student numbers (5-digit numbers)
student_studentnumber = np.random.randint(10000, 99999, num_records)
print(student_studentnumber.shape)
student_studentnumber

(3587,)


array([25795, 10860, 86820, ..., 18244, 24743, 56101])

In [3]:
# High School Graduation Dates (Sampled from given distribution)
grad_dates = pd.date_range(start="2015-01-01", end="2024-12-31", freq="D")
high_school_grad_date = np.random.choice(grad_dates, num_records)
print(high_school_grad_date.shape)
high_school_grad_date

(3587,)


array(['2017-11-19T00:00:00.000000000', '2022-07-13T00:00:00.000000000',
       '2021-04-26T00:00:00.000000000', ...,
       '2018-10-22T00:00:00.000000000', '2024-04-12T00:00:00.000000000',
       '2015-05-24T00:00:00.000000000'], dtype='datetime64[ns]')

In [4]:
# Graduated College (Using given proportions)
graduated_college = np.random.choice([np.nan, 0, 1], num_records, p=[1700/3587, 1502/3587, 385/3587])
print(graduated_college.shape)
graduated_college

(3587,)


array([ 0.,  0.,  0., ...,  0., nan, nan])

In [5]:
# Cohort Distribution
cohort_values = [2024, 2020, 2023, 2022, 2021, 2018, 2019, 2016, 2017, 2015]
cohort_probs = [437, 402, 395, 371, 365, 344, 342, 320, 317, 294]
cohort = np.random.choice(cohort_values, num_records, p=np.array(cohort_probs) / sum(cohort_probs))
print(cohort.shape)
cohort

(3587,)


array([2023, 2024, 2020, ..., 2020, 2024, 2017])

In [6]:
# CE Student Distribution
ce_student = np.random.choice([0, 1], num_records, p=[2561/3587, 1026/3587])
print(ce_student.shape)
ce_student

(3587,)


array([0, 0, 0, ..., 0, 0, 0])

In [7]:
# School Names
school_names = ["Highland Ridge High School", "Cedar Valley High School", "Summit Career Academy", "Evergreen Virtual Academy", "Rock Creek Youth Center"]
school_probs = [700, 694, 754, 712, 727]
sch_name = np.random.choice(school_names, num_records, p=np.array(school_probs) / sum(school_probs))
print(sch_name.shape)
sch_name

(3587,)


array(['Summit Career Academy', 'Rock Creek Youth Center',
       'Rock Creek Youth Center', ..., 'Evergreen Virtual Academy',
       'Evergreen Virtual Academy', 'Highland Ridge High School'],
      dtype='<U26')

In [8]:
# Gender Distribution
student_gender = np.random.choice([1, 0], num_records, p=[1860/3587, 1727/3587])
print(student_gender.shape)
student_gender

(3587,)


array([1, 1, 1, ..., 0, 0, 0])

In [9]:
# Race/Ethnicity Distribution
race_ethnicity = ["Hispanic", "Black or African American", "White", "Two or More Races", "Asian", "Other"]
race_probs = [1736, 814, 622, 243, 107, 65]
student_raceethnicity = np.random.choice(race_ethnicity, num_records, p=np.array(race_probs) / sum(race_probs))
print(student_raceethnicity.shape)
student_raceethnicity

(3587,)


array(['Other', 'Other', 'Hispanic', ..., 'Two or More Races', 'Hispanic',
       'Hispanic'], dtype='<U25')

In [10]:
# Special Education
activeenrollment_specialedstatus = np.random.choice([0, 1], num_records, p=[3329/3587, 258/3587])
print(activeenrollment_specialedstatus.shape)
activeenrollment_specialedstatus

(3587,)


array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Language Proficiency
customstudent_clde_languageproficiency = np.random.choice([0, 1], num_records, p=[3025/3587, 562/3587])
print(customstudent_clde_languageproficiency.shape)
customstudent_clde_languageproficiency

(3587,)


array([0, 1, 0, ..., 0, 0, 0])

In [12]:
# Gifted Status
gifted_student = np.random.choice([0, 1], num_records, p=[3398/3587, 189/3587])
print(gifted_student.shape)
gifted_student

(3587,)


array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# Generate 20 unique fake college names
prefixes = ["Western", "Northern", "Southern", "Eastern", "Central", "Great Lakes", 
            "Rocky Mountain", "Pacific", "Midwest", "Lakeshore", "Blue Ridge", "Sunset"]
suffixes = ["University", "College", "Institute", "Academy", "State University", 
            "Polytechnic", "Technology Institute", "Conservatory"]

fake_colleges = list(set(f"{random.choice(prefixes)} {random.choice(suffixes)}" for _ in range(20)))

# Assign a random college from the 20 options to each student
first_college_name = np.random.choice(fake_colleges, 3587)
last_college_name = np.random.choice(fake_colleges, 3587)

print(first_college_name.shape)
print(first_college_name[:10])
print(last_college_name.shape)
print(last_college_name[:10])

(3587,)
['Midwest State University' 'Midwest State University' 'Southern College'
 'Rocky Mountain Conservatory' 'Eastern State University'
 'Eastern State University' 'Lakeshore Technology Institute'
 'Southern College' 'Great Lakes University' 'Great Lakes University']
(3587,)
['Great Lakes State University' 'Western University' 'Northern University'
 'Southern Academy' 'Western University' 'Rocky Mountain Conservatory'
 'Eastern State University' 'Lakeshore Technology Institute'
 'Eastern Institute' 'Midwest State University']


In [14]:
# Define states and their corresponding frequencies
states = ["CO", "AZ", "KS", "TX", "CA", "FL", "NM", "MD", "IA", "IL", "VA"]
state_probs = [1578, 28, 25, 25, 23, 21, 16, 14, 12, 9, 8]

# Normalize probabilities to sum to 1
state_probs = np.array(state_probs) / sum(state_probs)

# Generate random states (No NaN values)
num_records = 3587
first_college_state = np.random.choice(states, num_records, p=state_probs)
last_college_state = np.random.choice(states, num_records, p=state_probs)

# Print results
print(first_college_state.shape)
print(first_college_state[:10])
print(last_college_state.shape)
print(last_college_state[:10])

(3587,)
['CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO']
(3587,)
['IL' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO' 'CO']


In [15]:
# Define year range
years = np.arange(2015, 2025)  # Covers 2015-2024

# Generate possible August and January enrollment dates
august_dates = [pd.Timestamp(f"{year}-08-{day}") for year in years for day in range(10, 26)]  # Aug 10-25
january_dates = [pd.Timestamp(f"{year}-01-{day}") for year in years for day in range(10, 21)]  # Jan 10-20

# Combine all possible enrollment dates
enrollment_dates = august_dates + january_dates

# Ensure reasonable weighting (80% August, 20% January)
weights = [0.8 / len(august_dates)] * len(august_dates) + [0.2 / len(january_dates)] * len(january_dates)

# Generate first enrollment dates
num_records = 3587  # Adjust as needed
first_enrollment_begin = np.random.choice(enrollment_dates, num_records, p=weights)

# Convert to pandas Series if needed
first_enrollment_begin = pd.Series(first_enrollment_begin)

# Preview results
print(first_enrollment_begin.shape)
print(first_enrollment_begin[:5])

(3587,)
0   2019-01-14
1   2017-08-14
2   2020-08-25
3   2015-08-18
4   2015-08-14
dtype: datetime64[ns]


In [16]:
# Generate first enrollment end date correctly
def get_random_may_date(start_date):
    if start_date.month == 1:  # January enrollment
        may_year = start_date.year  # Same year
    else:  # August enrollment
        may_year = start_date.year + 1  # Next year

    may_dates = pd.date_range(start=f"{may_year}-05-01", end=f"{may_year}-05-31")
    return np.random.choice(may_dates)

first_enrollment_end = first_enrollment_begin.apply(get_random_may_date)
print(first_enrollment_end.shape)
print(first_enrollment_end[:5])

(3587,)
0   2019-05-20
1   2018-05-15
2   2021-05-17
3   2016-05-19
4   2016-05-03
dtype: datetime64[ns]
