# Main Document to Load and Traverse Data
# NOTE: THIS FILE IS EXPLORATORY, AND NOT IN USE FROM A SIMULATION PERSPECTIVE

> Expected Data: school admissions preferences, number of students admitted per attribute category, etc.

In [2]:
import pandas as pd
import numpy as np
import os
import random

import uuid
import editdistance
from difflib import SequenceMatcher

from tqdm import tqdm

cwd = os.getcwd()
testing = False  # ******

# TODO: add oneshot function with hyperparemeters controlling the policy of school/student's generated

### Helper Functions

In [3]:
stoi_limit = 2**32 - 1
def seed_stoi(s):
    """ somewhat janky conversion of string to int for numpy random states 
    (since np.random.seed() cant take a string) """
    res = 1
    for c in s:
        res += abs(ord(c))
        if res > stoi_limit:
            res -= 2*abs(ord(c))
    return res


def match_schools(all_schools, lst, use_edit=False):
    """ matches schools based on edit distance """
    result = []
    for name in lst:
        if use_edit:  # Levenshtein distance metric for matching schools
            dists = [(str(schol), editdistance.eval(name, schol.name)) for schol in all_schools.values()]
            result.append(min(dists, key=lambda x: x[1])[0])
        else:
            dists = [(str(schol), SequenceMatcher(None, name, schol.name).ratio()) for schol in all_schools.values()]
            result.append(max(dists, key=lambda x: x[1])[0])  # add the school with the min edit distance
    return result


def add_fake_gpa(students):
    """ adds a gpa attribute based on the students ELA and Math scores """
    students["fake_gpa"] = students["Math_score"] + students["ELA_score"]
    max_score = max(students["fake_gpa"])
    students["fake_gpa"] = students["fake_gpa"].apply(lambda score: round(4*score/max_score, 2))  # gpa-normalized
    return students

def get_distributions():
    """ returns quantile distribution of gpa based on real data 
    used for seat/screen calculations """
    student_df = pd.read_csv(cwd+"/Data/student_info_with_demographics.csv")
    student_df = add_fake_gpa(student_df)
    
    edopt_dist = [student_df["fake_gpa"].quantile(i/3) for i in range(2, 0, -1)]
    screen_dist = [student_df["fake_gpa"].quantile(i/5) for i in range(4, 0, -1)]
    return edopt_dist, screen_dist
# edopt_dist, screen_dist = get_distributions()
def set_place(x, dist):
    """ returns a number representing which placement of a number x based on a distribution dist """
    for i, num in enumerate(dist):
        if x >= num:
            return i+1
    return len(dist)+1

# distributions according to: https://www.schools.nyc.gov/enrollment/enroll-grade-by-grade/high-school
screen_dist = [94, 89.66, 82.75, 76.33]  # https://www.schools.nyc.gov/enrollment/enroll-grade-by-grade/high-school/screened-admissions
edopt_dist = [88.25, 77.5]  #https://www.schools.nyc.gov/enrollment/enroll-grade-by-grade/high-school/educational-option-ed-opt-admissions-method

print("EDOPT DISTRIBUTION: ", edopt_dist)
print("SCREEN DISTRIBUTION: ", screen_dist)


# USED FOR SEAT/SCREEN CALCULATIONS
seated = lambda x: set_place(x, edopt_dist) 
screened = lambda x: set_place(x, screen_dist)

EDOPT DISTRIBUTION:  [88.25, 77.5]
SCREEN DISTRIBUTION:  [94, 89.66, 82.75, 76.33]


## Main Classes

In [4]:
MAX_NUM_SCHOOLS = 12

class Student:
    def __init__(self, seed, lottery="", schools=[], score=-1):
        # necessary info
        self.id = seed
        random.seed(seed) # set the seed
        
        if lottery != "": # signifies that this is a real student whose data we are inputting
            self.name = seed
            self.lottery = lottery
            self.schools = schools  # Note: can ignore given current simulation code
            self.num_schools = len(schools)
            
            self.selection_policy = -1
            self.ranking_policy = -1
        else:
            self.lottery = uuid.UUID(int=random.getrandbits(128), version=4).hex  # note: can do duplicate check if necessary

            seed_nums = random.sample(range(1, 3), 2)  # necessary to get 2 independent numbers
            # integer representation of how likely a student is to select a given school
            self.selection_policy = seed_nums[0]
            # integer representation of how highly a student is to rank a school, given they've already selected it
            self.ranking_policy = seed_nums[1]

            self.num_schools = MAX_NUM_SCHOOLS
            self.schools = []  # to be updated later based on self.selection_policy
            self.name = ""
        
        # calculate buckets for score-based decisions
        self.test_score = score if score != -1 else self.get_rand_score(seed)
        self.seat = seated(self.test_score)
        self.screen = screened(self.test_score)
        
        # extra / potential additions
        self.district = 0
        self.borough = None
        self.location = None
        
    def get_rand_score(self, new_seed):
        """ generates a random gpa based on assumed normal distribution of nyc highschool data """
        random.seed(new_seed)
        num = round(np.random.normal(68, 8.89, 1)[0], 2)
        if num < 0:
            num = 0
        elif num > 100:
            num = 100
        return num
    
    def update_choice(self, schools):
        """ used to update a student object with custom school information """
        self.schools = schools
        self.num_schools = len(schools)
        
    def info(self, simple=False):
        """ shows basic information generated from seed/id """
        basics = "ID: " + self.id + "\nLottery: " + self.lottery
        scores = "\nScores: " + str([self.test_score, self.seat, self.screen])
        policies = "\nPolicies: " + str([self.selection_policy, self.ranking_policy])
        if simple:
            return basics
        else:
            return basics + scores + policies
        
    def to_list(self):
        """ export all data for dataframe conversion """
        return [self.id, self.lottery, self.selection_policy, self.ranking_policy, self.num_schools, 
                self.test_score, self.seat, self.screen,
                self.name, self.district, self.borough, self.location]
        
    def __str__(self):
        """ return unique hashable identifier """
        return self.id
#     sorted(students, key=lambda x: int(x.lottery, 16))
        

# stats on nyc schools
mean_school_cap = 145  # capacity
std_school_cap = 128.5
mean_ens = 2.453505  # ens = expected number of students applied per seat
std_ens = 4.072874
class School:
    def __init__(self, seed, cap=-1, pop=-1, like=-1, name=""):
        # necessary info
        self.dbn = seed  # ID
        random.seed(seed) # set the seed
        
        self.policy = random.randint(1,3)
        self.capacity = self.get_rand_cap(seed) if cap<0 else cap
        
        # NOTE: popularity is how likely a school is to be on a student's list
        if pop == -1:  # weighted by mean & std of popularity / mean capacity
            np.random.seed(seed_stoi(seed))
            new_pop = np.random.normal(mean_ens, std_ens, 1)[0]
            if new_pop < 0:
                new_pop = 0
            self.popularity = round(self.capacity * new_pop, 2)
        else:
            self.popularity = pop
        # NOTE: likeability is how high on a students list a given school should appear
        self.likeability = random.randint(1,MAX_NUM_SCHOOLS) if like == -1 else like    
        
        # extra info
        self.name = name
        self.district = 0
        self.borough = None
        self.location = None
        
    def get_rand_cap(self, new_seed):
        """ generate a random school capacity based on normal distribution of nyc highschool data """
        np.random.seed(seed_stoi(new_seed))
        num = round(np.random.normal(mean_school_cap, std_school_cap, 1)[0])  
        if num < 1:
            num = 1
        return num    
        
    def info(self, simple=False):
        """ display basic info """
        basics = "ID: " + self.dbn
        if self.name != "":
            basics += ", Name: " + self.name
        basics += "\nPolicy: " + str(self.policy)
        additional = "\nCap/Pop/Like: " + str([self.capacity, self.popularity, self.likeability])
        return basics + additional if not simple else ""
    
    def to_list(self):
        """ export all information"""
        return [self.dbn, self.capacity, self.policy, self.popularity, self.likeability,
                self.name, self.district, self.borough, self.location]
        
    def __str__(self):
        """ return unique hashable id """
        return self.dbn
    
""" AUTHOR'S NOTE:
These structures are meant to loosely mirror the relationship between schools and students
in the highschool matching process...
- for reproducability sake, both objects require a seed.__str__() that will determine its random state
"""

# just making sure everything is generated properly
tester = Student("student test")
print(tester.info())
print()
tester2 = School("school test1")
print(tester2.info())

ID: student test
Lottery: 062cb39d480343f5a4ed5796933fca98
Scores: [80.15, 2, 4]
Policies: [1, 2]

ID: school test1
Policy: 2
Cap/Pop/Like: [146, 363.05, 1]


## Generation Functions
> with random seeds

In [5]:
# 2022 default = 71250 students ranked alongside 437 schools
LARGE_NUM = 9999999999  # used to set upperbound for sampling range

def generate_students(seed, size=71250):
    """ generate students based on seeded random sampling """
    random.seed(seed)
    return [Student("Student #"+str(i)) for i in random.sample(range(LARGE_NUM*size), size)]

def generate_schools(seed, size=437):
    """ generate schools based on seeded random sampling """
    random.seed(seed)
    return [School("School #"+str(i)) for i in random.sample(range(LARGE_NUM*size), size)]

def generate_nyc_schools(seed, school_info_dir="Data/schools_info.npy"):
    """ generate schools based on nyc school applicant data """
    school_info = np.load(school_info_dir, allow_pickle=True).item()
    # {name: (dbn, capacity, true applicants, total applicants)}
    new_schools = []
    for name, info in school_info.items():  # NOTE: can add seeded variance in popularity & likeability
        tmp_like = 0 if info[3]==0 else round(info[2]/info[3], 3)  # applicant rate (proxy for true_ar)
        # NOTE: popularity can also be total_applicants/capacity but we want capacity-weighted for this implementaiton
        schol = School(info[0], info[1], info[3], tmp_like, name)
        new_schools.append(schol)
    random.seed(seed)
    random.shuffle(new_schools)  # shuffle because order can affect tiebreakers
    return new_schools
    
""" AUTHOR'S NOTE:
In a previous iteration the random sample was based on size to ensure reproducibilty...
For some reason the random state carried over from previous iterations resulting in non-random matches.
- to mitigate this the functions are now exclusively random
- if you'd like to reproduce a previous generation, simply generate students/schools with the same id/dbn
"""

#test
test_students = generate_students(1, 2)
for stud in test_students:
    print(stud.info())
    print()
test_schools = generate_nyc_schools(1)[:2] # generate_schools(1, 2)
for schol in test_schools:
    print(schol.name)
    print(schol.info())
    print()

ID: Student #17756959221
Lottery: 682842c1ebdd4701b299f9b6226ff07b
Scores: [65.39, 3, 5]
Policies: [2, 1]

ID: Student #3280387012
Lottery: 8153c77963fe44eb942e0c6fce9b322d
Scores: [70.63, 3, 5]
Policies: [2, 1]

Professional Performing Arts High School
ID: 02M408, Name: Professional Performing Arts High School
Policy: 3
Cap/Pop/Like: [124, 1160, 0.34]

New Directions Secondary School
ID: 09X350, Name: New Directions Secondary School
Policy: 2
Cap/Pop/Like: [25, 69, 0.145]



## Main Simulation Code

In [6]:
# SIMULATIONS
def simulate_student_choices(students, schools):
    """ takes in a dict of students and schools and generates choices based on student.selection_policy
     ordered by student.ranking_policy 
     Note: also outputs a school_to_student helper dict such that {school.dbn: set(school_id)}
     """
    choices = dict([[str(stud), []] for stud in students])
    
    # generate lists of schools for each strategy & shuffle
    full_random = list(schools.keys())
    popularity_weights = [schools[schol].popularity for schol in full_random]
    
    school_to_student = dict([[str(schol), set()] for schol in schools])  # helper for school choices
    for stud in tqdm(students.values()):
        # for each student, choose schools depending on strategy based on seed
        ranks = []
        random.seed(str(stud))
        if stud.selection_policy == 1:  # fully randomized
            ranks = random.choices(population=full_random, k=stud.num_schools)
        else:  # popularity weighted randomized
            ranks = random.choices(population=full_random, weights=popularity_weights, k=stud.num_schools)
        
        # order based on student policy
        if stud.ranking_policy != 1:  # rank by likeability
            ranks.sort(key=lambda schol: schools[schol].likeability, reverse=True)
            # NOTE: ties are handled by how the list was previously sorted which is depended on the seed
        choices[str(stud)] = ranks
        
        for schol_id in ranks:  # add ranks to helper
            school_to_student[schol_id].add(str(stud))
    return choices, school_to_student

def simulate_school_choices(school_to_student, students, schools):
    """ takes in a dict of students and schools as well as the student's rankings and generatings 
        the school's choices based on them
    """
    choices = {}
    # sort students by school policy
    for schol_id, student_ids in tqdm(school_to_student.items()):
        ranks = []
        if schools[schol_id].policy == 1:
            ranks = sorted(student_ids, key=lambda stud_id: students[stud_id].lottery)
        elif schools[schol_id].policy == 2:
            ranks = sorted(student_ids, key=lambda stud_id: (students[stud_id].seat, students[stud_id].lottery))
        else:
            ranks = sorted(student_ids, key=lambda stud_id: (students[stud_id].screen, students[stud_id].lottery))
        choices[schol_id] = ranks
    return choices

## *Oneshot Functions

In [7]:
def oneshot(seed, nyc=True, num_schools=437, num_students=71250, verbose=False):
    """ runs a basic simulation that generates schools and students and simulates their preferences:
        - student_preference_profile = {student_id : [school_ids_ranked]}
        - school_preference_profile = {school ids : [student_ids_ranked]} 
        - students = {student_id : [self.id, self.lottery, self.selection_policy, self.ranking_policy,
                                    self.num_schools,  self.test_score, self.seat, self.screen, 
                                    self.name, self.district, self.borough, self.location] }
        - schools = {school_dbn : [self.dbn, self.policy, self.popularity, self.likeability,
                                    self.name, self.district, self.borough, self.location] }
                                
        seed determines the simulation state,
        nyc determines if you want to use real nyc school data from 2023 (overrides num_schools on True)
        num_students & num_schools are self-explanatory
    """
    random.seed(seed)
    student_seed, school_seed = random.sample(range(LARGE_NUM), 2)
    if verbose: print("Base Seed", seed, "created student seed", student_seed, "and school seed", school_seed)
    
    # generate students and schools
    students = dict([[str(stud), stud] for stud in generate_students(student_seed, num_students)])
    if nyc:
        schools = dict([[str(schol), schol] for schol in generate_nyc_schools(school_seed)])
    else:
        schools = dict([[str(schol), schol] for schol in generate_schools(school_seed, num_schools)])
    if verbose: 
        total_seats = sum([schol.capacity for schol in schools.values()])
        print("Generated", len(schools), "schools and", len(students), "students with", total_seats, "seats\n")
    
    if verbose: print("Student choices simulating...")
    student_preference_profile, school_to_student = simulate_student_choices(students, schools)
    
    if verbose: print("School choices simulating...")
    # Note: no random seed since lottery# informs the tiebreaks
    school_preference_profile = simulate_school_choices(school_to_student, students, schools)
    
    return student_preference_profile, school_preference_profile, students, schools


def oneshot_with_input(seed, lottery_num, my_schools, gpa=-1, student_name="injected_student", verbose=False):
    """ runs a simulation based on NYC data with a single student as input
    Note: for student, lottery# & schools required, percentile scale gpa recommended, name possible
    """
    assert len(my_schools) <= MAX_NUM_SCHOOLS, "you can only rank "+str(MAX_NUM_SCHOOLS)+" schools"
    
    # tldr to a normal oneshot() simulation, then add in the student before the school preferences are simulated
    random.seed(seed)
    student_seed, school_seed = random.sample(range(LARGE_NUM), 2)
    if verbose: print("Base Seed", seed, "created student seed", student_seed, "and school seed", school_seed)
    
    students = dict([[str(stud), stud] for stud in generate_students(student_seed)])
    schools = dict([[str(schol), schol] for schol in generate_nyc_schools(school_seed)])
    if verbose: 
        total_seats = sum([schol.capacity for schol in schools.values()])
        print("Generated", len(schools), "schools and", len(students), "students with", total_seats, "seats\n")
    
    if verbose: print("Student choices simulating...")
    student_preference_profile, school_to_student = simulate_student_choices(students, schools)
    
    # inject student into data
    if verbose: print("Injecting new student into data...")
    choices = match_schools(schools, my_schools)  # match choices with schools in present data set
    new_student = Student(student_name, lottery_num, choices, gpa)  # make student obj
    students[str(new_student)] = new_student  # add student to data
    student_preference_profile[str(new_student)] = choices  # add student's list to preference profile
    for schol in choices:  # add student's choices to school dict
        school_to_student[schol].add(str(new_student))  # Note: make sure there is no student with an identical id
    
    if verbose: print("School choices simulating...")
    school_preference_profile = simulate_school_choices(school_to_student, students, schools)
    
    return student_preference_profile, school_preference_profile, students, schools

In [38]:
%%time

# TESTING
simstate = 1
# student_prefs, school_prefs, students, schools = oneshot(simstate, True)
student_prefs, school_prefs, students, schools = oneshot_with_input(simstate, "04628abdceed442c918f54ed2f39055e",
                                                        ["Quest to Learn", "East Side Community",
                                                        "Forsyth Satellite Academy",
                                                        "Lower Manhattan Arts Academyy"], 
                                                        90, "Kora", True)

Base Seed 1 created student seed 9167024629 and school seed 3280387012
Generated 439 schools and 71250 students with 72958 seats

Student choices simulating...


100%|██████████| 71250/71250 [00:02<00:00, 26821.11it/s]


Injecting new student into data...
School choices simulating...


100%|██████████| 439/439 [00:01<00:00, 319.35it/s]

CPU times: user 7.82 s, sys: 1.22 s, total: 9.03 s
Wall time: 10.4 s





### Sanity Checking & Evaluation

In [17]:
# ensuring injected student is in profile and their preferences are stored 
# for schol in student_prefs["Kora"]:
#     print(schools[schol].info())
#     print()

In [18]:
# Sanity Check:
zero_count = 0
avg_count = 0
for schol, studs in school_prefs.items():
    if len(studs) == 0:
        zero_count += 1
    avg_count += len(studs)
avg_count /= len(schools)
print("Simulation resulted in", zero_count, "of", len(schools), "schools with no student applications")
print("On average, each school ranked", round(avg_count), "of", len(students), "students")

Simulation resulted in 0 of 439 schools with no student applications
On average, each school ranked 1905 of 71250 students


In [19]:
# checking that students with policy[1]=2 have schools ranked from largest to smallest likeability
s = list(student_prefs.keys())[0]
print(students[s].info())
print("*\n")
for sch in student_prefs[s][:5]:
    print(schools[sch].info())
    print()

ID: Student #609348707388531
Lottery: 8b0c6057f77b4978b78dfc40c663e2e9
Scores: [77.29, 3, 4]
Policies: [1, 2]
*

ID: 02M545, Name: High School for Dual Language and Asian Studies
Policy: 2
Cap/Pop/Like: [108, 1048, 0.47]

ID: 25Q281, Name: East-West School of International Studies
Policy: 3
Cap/Pop/Like: [89, 763, 0.387]

ID: 19K683, Name: School for Classics High School
Policy: 1
Cap/Pop/Like: [108, 242, 0.302]

ID: 14K685, Name: El Puente Academy for Peace and Justice
Policy: 2
Cap/Pop/Like: [70, 168, 0.286]

ID: 17K543, Name: Science, Technology and Research Early College High School at Erasmus
Policy: 2
Cap/Pop/Like: [130, 810, 0.264]



In [20]:
# checking that schools with policy > 1 have students loosely ranked from largest to smallest test score
s = list(school_prefs.keys())[3]
print(schools[s].info())
print("*\n")
for stud in school_prefs[s][:10]:
    print(students[stud].info())
    print()

ID: 08X432, Name: Bronx Bridges High School
Policy: 3
Cap/Pop/Like: [81, 429, 0.159]
*

ID: Student #489904833490118
Lottery: 1c39e5ab88eb49a69e44a76cee140098
Scores: [95.1, 1, 1]
Policies: [1, 2]

ID: Student #285442825375764
Lottery: 37f37a9929394fea98722e877dd20045
Scores: [90.28, 1, 2]
Policies: [2, 1]

ID: Student #439100751271157
Lottery: 3ecb14c0c0904b06a0d8f9f07580ee00
Scores: [93.03, 1, 2]
Policies: [1, 2]

ID: Student #368538878316195
Lottery: 559f6b2ca0ee48e781bc8344c6cae3e1
Scores: [90.59, 1, 2]
Policies: [1, 2]

ID: Student #665203594395897
Lottery: 8b7459b025f24ca2a4604b7976cc103d
Scores: [91.5, 1, 2]
Policies: [1, 2]

ID: Student #3123722867502
Lottery: a122f232b4a74771bf93ddd59b2b47a1
Scores: [89.89, 1, 2]
Policies: [1, 2]

ID: Student #180893752055294
Lottery: a92e82de50b446239241052c7835e199
Scores: [92.2, 1, 2]
Policies: [1, 2]

ID: Student #84522400398245
Lottery: f5a731dfe42544229407b9cd2162b75a
Scores: [89.66, 1, 2]
Policies: [1, 2]

ID: Student #271256535681864
L

### Save Results

In [21]:
newpath = cwd + "/Data/Generated/simulation_results_rs" + str(simstate)
if not os.path.exists(newpath):
    os.makedirs(newpath)

np.save(newpath+"/student_rankings.npy", student_prefs)
np.save(newpath+"/school_rankings.npy", school_prefs)

list_students = dict([[str(stud), stud.to_list()] for stud in students.values()])
np.save(newpath+"/student_info.npy", list_students)
list_schools = dict([[str(schol), schol.to_list()] for schol in schools.values()])
np.save(newpath+"/school_info.npy", list_schools)
    
print("Simulation state", simstate, "saved at\n", newpath+'/')

Simulation state 1 saved at
 /Users/korahughes/Documents/GitHub/DataLife/BackEnd/Data/Generated/simulation_results_rs1/


### Running & Saving Multiple Simulation States

In [35]:
for auto_simstate in tqdm(range(1,5)):
    student_prefs, school_prefs, students, schools = oneshot(auto_simstate, True)  # run simulation
    
    newpath = cwd + "/Data/Generated/simulation_results_rs" + str(auto_simstate)
    if not os.path.exists(newpath):  # add folder for simulation
        os.makedirs(newpath)
    np.save(newpath+"/student_rankings.npy", student_prefs)  # save student/school preferences
    np.save(newpath+"/school_rankings.npy", school_prefs)

    list_students = dict([[str(stud), stud.to_list()] for stud in students.values()])  # transform & save the student/school information
    np.save(newpath+"/student_info.npy", list_students)
    list_schools = dict([[str(schol), schol.to_list()] for schol in schools.values()])
    np.save(newpath+"/school_info.npy", list_schools)
        
    print("Simulation state", auto_simstate, "saved at", newpath+'/')
    print()

100%|██████████| 71250/71250 [00:02<00:00, 25839.96it/s]
100%|██████████| 439/439 [00:01<00:00, 286.85it/s]
 25%|██▌       | 1/4 [00:09<00:28,  9.55s/it]

Simulation state 1 saved at /Users/korahughes/Documents/GitHub/DataLife/BackEnd/Data/Generated/simulation_results_rs1/



100%|██████████| 71250/71250 [00:02<00:00, 23959.35it/s]
100%|██████████| 439/439 [00:01<00:00, 224.87it/s]
 50%|█████     | 2/4 [00:19<00:19,  9.55s/it]

Simulation state 1 saved at /Users/korahughes/Documents/GitHub/DataLife/BackEnd/Data/Generated/simulation_results_rs2/



100%|██████████| 71250/71250 [00:02<00:00, 26227.58it/s]
100%|██████████| 439/439 [00:02<00:00, 213.09it/s]
 75%|███████▌  | 3/4 [00:28<00:09,  9.67s/it]

Simulation state 1 saved at /Users/korahughes/Documents/GitHub/DataLife/BackEnd/Data/Generated/simulation_results_rs3/



100%|██████████| 71250/71250 [00:02<00:00, 29512.27it/s]
100%|██████████| 439/439 [00:01<00:00, 253.10it/s]
100%|██████████| 4/4 [00:37<00:00,  9.43s/it]

Simulation state 1 saved at /Users/korahughes/Documents/GitHub/DataLife/BackEnd/Data/Generated/simulation_results_rs4/






#  LEGACY CODE (can ignore if you're not a developer)



## Task 1) Load student demographics and preferences

In [59]:
student_df = pd.read_csv(cwd+"/Data/student_info_with_demographics.csv")
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000


In [85]:
student_df.describe()

Unnamed: 0,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,Math_score,ELA_score,fake_gpa,seat,screen
count,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0,71250.0
mean,0.218709,0.753839,0.490021,0.13673,0.713503,0.225937,0.426765,0.011775,0.147804,0.173389,0.011663,0.002667,2.64838,2.976645,2.509633,1.994063,2.995116
std,0.413373,0.430777,0.499904,0.343564,0.188389,0.418201,0.494611,0.107875,0.354908,0.378586,0.107365,0.051571,0.902626,0.869921,0.638444,0.81544,1.411225
min,0.0,0.0,0.0,0.0,0.136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.054795,1.2,1.0,1.0
25%,0.0,1.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.893333,2.125,1.97,1.0,2.0
50%,0.0,1.0,0.0,0.0,0.755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.384615,3.0,2.47,2.0,3.0
75%,0.0,1.0,1.0,0.0,0.869,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.416667,3.875,2.97,3.0,4.0
max,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.5,4.5,4.0,3.0,5.0


In [8]:
ranking_df = pd.read_csv(cwd + "/Data/student_prefs.csv")
ranking_df

Unnamed: 0,Student_Id,School,Rank,Rating
0,student_68963,02M411,0,1.000000
1,student_68963,02M376,1,0.500000
2,student_68963,02M316,2,0.333333
3,student_68963,02M438,3,0.250000
4,student_68963,01M448,4,0.200000
...,...,...,...,...
492363,student_36332,02M414,2,0.333333
492364,student_36332,20K490,3,0.250000
492365,student_36332,18K637,4,0.200000
492366,student_36332,28Q686,5,0.166667


In [13]:
student_counter = dict([[sid, 0] for sid in ranking_df["Student_Id"].unique()])  # get number of ranked schools for each student
for i, row in ranking_df.iterrows():
    student_counter[row["Student_Id"]] += 1

num_rank_student = pd.DataFrame(student_counter.items())

buckets = dict([[place+1, 0] for place in range(12)])  # get the number of students with rank x
for sid, count in student_counter.items():
    buckets[count] += 1
    
num_rank = pd.DataFrame(buckets.items())
num_rank

Unnamed: 0,0,1
0,1,861
1,2,1473
2,3,3023
3,4,4859
4,5,7525
5,6,10329
6,7,12771
7,8,12974
8,9,9938
9,10,5159


In [14]:
import plotly.express as px
fig = px.bar(num_rank)
fig.show()

## Task 2) Generate & Export Student Preferences
> Note: decide how many schools each student should rank and which preference

- Case 1: full random with no weight
- *Case 2: full random, ordered by popularity buckets
- Case 3: popularity_bucket-weighted randomness
- *Case 4: popularity_bucket-weighted randomness, ordered by popularity buckets

In [61]:
student_ids = list(ranking_df["Student_Id"].unique())  # get student ids
if testing:  # subset of data for testing
    student_ids = student_ids[:200]

# GET NUMBER OF SCHOOLS RANKED PER STUDENTS
full_list = True  # **students rank 12 schools, ELSE students rank schools based on real-world distribution
student_school_counts = {}  # student_id: #schools to choose
max_num_schools = 12  # maximum number of rankings a student can have
if full_list:
    student_school_counts = dict([[sid, max_num_schools] for sid in student_ids])
else:
    student_school_counts = dict([[sid, 0] for sid in student_ids])
    for i, row in tqdm(ranking_df.iterrows()):  # for each ranking, add 1 to student
        assert row["Student_Id"] in student_school_counts, row["Student_Id"]
        student_school_counts[row["Student_Id"]] += 1

# show descriptive stats (sanity check)
pd.DataFrame(student_school_counts.values()).describe()

Unnamed: 0,0
count,71250.0
mean,12.0
std,0.0
min,12.0
25%,12.0
50%,12.0
75%,12.0
max,12.0


In [62]:
school_ids = list(ranking_df["School"].unique())

# MAKING STUDENT_RANKINGS STRUCTURE
unknown_val = "nan"  # placeholder for no rankings
student_rankings = dict([ [sid, [unknown_val for i in range(max_num_schools)]] for sid in student_ids])  # initializing dict

for sid in tqdm(student_rankings.keys()):
    # CASE 1: RANDOM IMPLEMENTATION
    student_rankings[sid] = np.random.choice(school_ids, size=student_school_counts[sid], replace=False).tolist()
print("\nExample Student Ranking\n", student_rankings[np.random.choice(student_ids)])

100%|███████████████████████████████████████| 71250/71250 [00:06<00:00, 11153.44it/s]


Example Student Ranking
 ['27Q309', '27Q475', '02M416', '29Q272', '05M670', '17K543', '10X442', '14K610', '11X275', '12X251', '06M348', '23K697']





In [63]:
# stats for sanity check
rank_avg = np.mean([len(rank) for sid, rank in student_rankings.items()])
print("Current average number of schools ranked per student:", rank_avg)

Current average number of schools ranked per student: 12.0


In [64]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage1.npy", student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage1.npy")

saved at /Data/Generated/student_rankings_stage1.npy


### Subtask) Deriving School Popularity via True Applicant Rate

In [65]:
""" Data: https://github.com/KoraSHughes/DataLife/blob/main/BackEnd/Data/school_directory.xlsx
DV-EF = applicants
EG-ER = applicant/seats
MN-MY = total seats

TODO parse csv, used tommaso's structure - derive popularity for stage 2
TODO: CHECK ARE THE MOST POPULAR SCHOOLS GETTING THE MOST APPLICATIONS (num applicants) -- sanity check for simulation results
"""
# placeholder data
school_to_pop = dict([[sid, random.randint(1, 5)] for sid in school_ids])
school_to_pop  # Note: dict is more efficient than dataframe for sorting

{'02M411': 5,
 '02M376': 4,
 '02M316': 3,
 '02M438': 5,
 '01M448': 4,
 '02M298': 4,
 '03M402': 3,
 '01M696': 5,
 '02M412': 4,
 '02M416': 1,
 '02M294': 4,
 '01M292': 4,
 '25Q281': 3,
 '02M543': 5,
 '02M418': 4,
 '02M422': 1,
 '05M362': 3,
 '02M551': 3,
 '02M414': 2,
 '02M439': 4,
 '04M435': 5,
 '02M288': 5,
 '01M539': 2,
 '01M450': 2,
 '03M541': 5,
 '02M605': 2,
 '02M400': 3,
 '02M615': 1,
 '02M600': 1,
 '32K168': 5,
 '03M479': 4,
 '02M407': 1,
 '02M139': 2,
 '02M545': 3,
 '27Q309': 5,
 '02M282': 2,
 '02M308': 2,
 '15K684': 3,
 '02M413': 5,
 '07X625': 4,
 '30Q580': 5,
 '02M580': 2,
 '02M305': 3,
 '24Q560': 5,
 '13K605': 4,
 '02M534': 5,
 '02M135': 5,
 '21K690': 2,
 '06M540': 2,
 '02M489': 4,
 '02M392': 3,
 '01M515': 1,
 '06M346': 3,
 '04M555': 1,
 '27Q260': 3,
 '02M374': 1,
 '02M630': 1,
 '02M546': 5,
 '22K555': 2,
 '04M610': 5,
 '10X439': 1,
 '05M369': 4,
 '24Q299': 1,
 '20K445': 1,
 '02M419': 3,
 '02M399': 1,
 '02M533': 3,
 '02M260': 3,
 '13K439': 2,
 '32K556': 4,
 '13K527': 4,
 '13K3

In [66]:
# save generated popularity
translation = [[sid, pop] for sid, pop in school_to_pop.items()]
np.save(cwd+"/Data/Generated/school_demographics.npy", pd.DataFrame(translation, columns=["Name", "Popularity"]))
print("saved at", "/Data/Generated/school_demographics.npy")

saved at /Data/Generated/school_demographics.npy


In [67]:
# add popularity-ordering to ranking
s2_student_rankings = {}
for sid, ranks in tqdm(student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: school_to_pop[x], reverse=True)  # sort by dict vals
    s2_student_rankings[sid] = new_ranks  # append new rank
s2_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 151916.78it/s]


['17K543',
 '18K567',
 '23K697',
 '03M492',
 '10X524',
 '17K539',
 '06M462',
 '11X509',
 '02M533',
 '02M308',
 '03M859',
 '07X334']

In [68]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage2.npy", s2_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage2.npy")

saved at /Data/Generated/student_rankings_stage2.npy


In [69]:
# add popularity weighted choice
s3_student_rankings = {}
weighted_schools = []
for school, prio in tqdm(school_to_pop.items()):  # generate school list weighted by popularity
    for i in range(prio**2):
        weighted_schools.append(school)
print("Note: number of schools is", len(weighted_schools), "which should be <", len(school_to_pop), "* 5")
        
for sid, ranks in tqdm(student_rankings.items()):  # choose from weighted list
    new_ranks = np.random.choice(weighted_schools, size=student_school_counts[sid], replace=False).tolist()
    s3_student_rankings[sid] = new_ranks
    
s3_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████████| 437/437 [00:00<00:00, 385957.22it/s]


Note: number of schools is 4913 which should be < 437 * 5


100%|█████████████████████████████████████████| 71250/71250 [01:19<00:00, 896.52it/s]


['07X548',
 '18K617',
 '27Q324',
 '05M304',
 '08X282',
 '14K477',
 '11X513',
 '11X299',
 '19K409',
 '15K463',
 '11X514',
 '13K439']

In [70]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage3.npy", s3_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage3.npy")

saved at /Data/Generated/student_rankings_stage3.npy


In [71]:
# add popularity weighted choice AND popularity-ordering to ranking
s4_student_rankings = {}
for sid, ranks in tqdm(s3_student_rankings.items()):
    new_ranks = sorted(ranks, key=lambda x: school_to_pop[x], reverse=True)  # sort by dict vals
    s4_student_rankings[sid] = new_ranks  # append new rank
    
s4_student_rankings[np.random.choice(student_ids)]

100%|██████████████████████████████████████| 71250/71250 [00:00<00:00, 188950.53it/s]


['27Q314',
 '17K408',
 '16K498',
 '02M534',
 '27Q309',
 '31R460',
 '09X327',
 '10X437',
 '27Q400',
 '13K419',
 '12X251',
 '02M308']

In [72]:
# saving student rankings
np.save(cwd+"/Data/Generated/student_rankings_stage4.npy", s4_student_rankings)
print("saved at", "/Data/Generated/student_rankings_stage4.npy")

saved at /Data/Generated/student_rankings_stage4.npy


In [73]:
# adding lotterly number
lnums = [uuid.uuid4().hex for i in range(len(student_df))]
student_df["Lottery"] = lnums
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,White,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,0,0,0,0,30Q127,2.000000,3.125000,4c4ef64abb8044588343460f6eeab8db
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,0,0,0,0,27Q137,2.153846,2.750000,57835b4ba9a44375be70514c28639cee
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,0,0,0,0,24Q061,2.846154,1.753425,359e444aef7344ccb33f871a762360e8
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,1,0,0,0,02M114,4.275862,4.181818,536468d353324cb08da75344b7f0d1e8
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,1,0,0,0,02M312,4.017241,3.125000,c195ec0c32e4452ca62a122327f7bfa6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,0,0,0,0,01M450,2.000000,1.972603,1eeb5307efa84090ba4f2a9eb2dcbb4b
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,0,1,0,0,01M184,3.000000,3.125000,5c810ec67fbe444cb1e9629751879cbc
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,0,0,0,0,01M539,2.615385,4.030303,fd4f0ebe302b47459d04352e35e92b49
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,0,1,0,0,01M188,3.083333,2.000000,9fa7ba94beda4bcfbfafbc2cad29db52


In [74]:
# adding gpa, seat, and screen attributes

def add_fake_gpa(students):
    """ adds a gpa attribute based on the students ELA and Math scores """
    students["fake_gpa"] = students["Math_score"] + students["ELA_score"]
    max_score = max(students["fake_gpa"])
    students["fake_gpa"] = students["fake_gpa"].apply(lambda score: round(4*score/max_score, 2))  # gpa-normalized
    return students
    
def add_edopt(students):
    """ adds gpa percentile in 3rds for ed opt schools """
    lower = students["fake_gpa"].quantile(1/3)
    upper = students["fake_gpa"].quantile(2/3)
    seated = lambda x: 1 if x >= upper else (2 if x >= lower else 3)
    students["seat"] = students["fake_gpa"].apply(seated)
    return students

def add_screen(students):
    """ adds gpa percentile in 5ths for screen schools """
    lower1 = students["fake_gpa"].quantile(1/5)
    lower2 = students["fake_gpa"].quantile(2/5)
    upper1 = students["fake_gpa"].quantile(3/5)
    upper2 = students["fake_gpa"].quantile(4/5)
    def screened(gpa):
        if gpa >= upper2:
            return 1
        elif gpa >= upper1:
            return 2
        elif gpa >= lower2:
            return 3
        elif gpa >= lower1:
            return 4
        else:
            return 5
        
    students["screen"] = students["fake_gpa"].apply(lambda x: screened(x))
    return students

student_df = add_fake_gpa(student_df)
student_df = add_edopt(student_df)
student_df = add_screen(student_df)
student_df

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
0,student_36332,Residential District Unknown,0,0,1,0,0.723,0,1,0,...,0,0,0,30Q127,2.000000,3.125000,4c4ef64abb8044588343460f6eeab8db,2.29,2,3
1,student_36144,Residential District Unknown,0,1,0,0,0.684,0,1,0,...,0,0,0,27Q137,2.153846,2.750000,57835b4ba9a44375be70514c28639cee,2.19,2,4
2,student_37038,Residential District Unknown,1,1,1,0,0.881,0,1,0,...,0,0,0,24Q061,2.846154,1.753425,359e444aef7344ccb33f871a762360e8,2.05,3,4
3,student_614,Residential District Unknown,0,0,1,0,0.191,0,0,0,...,0,0,0,02M114,4.275862,4.181818,536468d353324cb08da75344b7f0d1e8,3.77,1,1
4,student_21981,Residential District Unknown,1,0,1,0,0.304,0,0,0,...,0,0,0,02M312,4.017241,3.125000,c195ec0c32e4452ca62a122327f7bfa6,3.19,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71245,student_22462,Residential District 01,0,0,1,0,0.650,0,1,0,...,0,0,0,01M450,2.000000,1.972603,1eeb5307efa84090ba4f2a9eb2dcbb4b,1.77,3,5
71246,student_30857,Residential District 01,1,0,1,0,0.621,0,0,0,...,1,0,0,01M184,3.000000,3.125000,5c810ec67fbe444cb1e9629751879cbc,2.73,2,2
71247,student_57833,Residential District 01,0,0,0,0,0.256,1,0,0,...,0,0,0,01M539,2.615385,4.030303,fd4f0ebe302b47459d04352e35e92b49,2.96,1,2
71248,student_32259,Residential District 01,0,1,0,0,0.950,0,0,0,...,1,0,0,01M188,3.083333,2.000000,9fa7ba94beda4bcfbfafbc2cad29db52,2.27,2,3


In [75]:
# saving school demographics with updated attributes
np.save(cwd+"/Data/Generated/student_demographics.npy", student_df)
print("saved at", "/Data/Generated/student_demographics.npy")

saved at /Data/Generated/student_demographics.npy


## Step 3.5) Add profiles for school rankings

In [76]:
schools_criterion = ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "fake_gpa", "seat", "screen"]

# BASIC SCHOOL PROFILES (intuition)
# Note: these weights are in powers of 10 so they are closer to discrete ordering than pure ratios
district_school = [3, 1, 0, 0, 0, 0, 0, 0]
balanced_school = [3, 1, 2, 2, 1, 0, 0, 0]
stem_school =     [3, 1, 2, 5, 1, 0, 0, 0]  # Note: most schools weight ELA and math equivalently (less realistic)
lottery_school =  [1, 5, 0, 0, 0, 0, 0, 0]
elite_school =    [0, 1, 4, 4, 0, 0, 0, 0]
basic_school_profiles = [district_school, balanced_school, stem_school, lottery_school, elite_school]

def basic_rand_pref():
    """ generate a random school preference profile """
    return random.choice(basic_school_profiles)

In [77]:
"""
Open schools: exlcusively lottery
Ed Opt schools: 1/3rd seat for high/medium/low gpa (known) + lottery for inter-group tiebreaker
Screen schools: 5 group distribution of gpa but highest to lowest (known) + lottery tie breaker
Audition/Assessment: some sort of examination with ranking (school provides a score) + lottery tiebreaker

Note: most have set-asides & zoning regulations
Note: careful of skew (students with high seats disproportionally apply more to more popular schools)
# Open, Ed Opt, Screened, Audtion/Assessment
"""
#  ["District", "Lottery", "ELA_Score", "Math_Score", "Poverty_Index", "gpa", "seat", "screen"]
open_school = ["Lottery"]  # [0, 1, 0, 0, 0, 0, 0, 0]
edopt_school = ["seat", "Lottery"]  # [0, 1, 0, 0, 0, 0, 2, 0]
screen_school = ["screen", "Lottery"]  # [0, 0, 0, 0, 0, 0, 0, 1]
# audition_school = [0, 1, 0, 0, 0, 0, 0]
complex_school_profiles = [open_school, edopt_school, screen_school]

student_df.sort_values(edopt_school, ascending=True)

Unnamed: 0,Student_Id,Residential_District,swd,poverty,sex,ell,ENI,Black,Hispanic,Multi-Racial,...,Asian,Native American,Missing Race/Ethnicity Data,school,Math_score,ELA_score,Lottery,fake_gpa,seat,screen
32359,student_45916,Residential District 21,0,1,1,0,0.403,0,0,0,...,1,0,0,21K239,3.916667,3.125000,0004a77607bc4e3482b0fe742bd2a1b7,3.14,1,1
16358,student_36399,Residential District 27,0,0,1,0,0.823,0,1,0,...,0,0,0,27Q282,4.379310,2.437500,00089b79bdc048f9bb0208a647356301,3.04,1,2
69101,student_34762,Residential District 02,0,0,0,0,0.265,0,0,1,...,0,0,0,02M167,4.172414,4.121212,000accfcd3e4424486a67561d74c41c5,3.70,1,1
4502,student_15957,Residential District 31,0,1,0,0,0.515,0,0,0,...,1,0,0,31R072,3.333333,3.437500,001a96ce38234040a179dd93e1a57e7c,3.02,1,2
3423,student_34553,Residential District 31,0,1,0,0,0.450,0,0,0,...,0,0,0,31R048,4.068966,4.287879,001b393e98e24bcca34c7d9b22a5a41f,3.73,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59982,student_16145,Residential District 08,1,0,1,0,0.912,0,1,0,...,0,0,0,08X376,1.933333,1.986301,fff745d3ceab4f1d9ced363f33275c01,1.75,3,5
41853,student_67379,Residential District 17,1,1,0,0,0.834,1,0,0,...,0,0,0,17K246,1.413333,1.904110,fff7d4e0921b41a288846c54d8779b7e,1.48,3,5
37918,student_64944,Residential District 20,0,1,1,1,0.792,0,0,0,...,1,0,0,20K227,1.666667,1.821918,fff96412729044f786afa02c3e4885e2,1.56,3,5
59660,student_44321,Residential District 09,0,1,0,1,0.947,0,1,0,...,0,0,0,09X313,1.653333,1.849315,fffc9ef37fff4dd6b03266b40e221006,1.56,3,5


In [78]:
%%time
students_open = student_df.sort_values(open_school, ascending=True)["Student_Id"].tolist()
students_edopt = student_df.sort_values(edopt_school, ascending=True)["Student_Id"].tolist()
students_screen = student_df.sort_values(screen_school, ascending=True)["Student_Id"].tolist()
student_ordering_choices = [students_open, students_edopt, students_screen]

def gen_school_choices(ids, choice_weights):
    assert len(choice_weights) == len(student_ordering_choices), "choices should follow schema:"+str(len(student_ordering_choices))
    weighted_choices = ""  # creates a string representing indicies of the choices
    for i, weight in enumerate(choice_weights):
        weighted_choices += str(i)*weight
            
    school_choices = {}  # generate student choices
    for sid in ids:
        choice_type = int(weighted_choices[np.random.randint(len(weighted_choices))])
        school_choices[sid] = student_ordering_choices[choice_type]  # add corresponding choice
    return school_choices

CPU times: user 235 ms, sys: 15.2 ms, total: 251 ms
Wall time: 257 ms


In [None]:
# TODO: ensure order by discrete --> think SQL 'order_by' (next attribute is only looked at for tie-breakers)
# prioirity students always take presidence
# Note: recall you can affect capacity of school
# TODO: add student preferences proportional to school policy
# TODO: add distance from school's popularity
# TODO: take a better look at student preference generation
# TODO: add school district
# get_district = dict([[sid, random.choice(student_df["Residential_District"].unique())] for sid in school_ids])
# get_district[random.choice(school_ids)]

# Next steps: make one-shot function & seperate python file 

## Task 3: Generate School Rankings from profiles
### Subgoal) Create randomized school profiles
- Case 1: all open school (lottery only)
- Case 2: add screen and or ed opt policies
- Case 3: add policy-weighted randomness for variation between schools

In [79]:
%%time
# generate profile
all_open = gen_school_choices(school_ids, [1,0,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_open.npy", all_open)
print("saved at", "/Data/Generated/school_rankings_open.npy")

saved at /Data/Generated/school_rankings_open.npy
CPU times: user 10.1 ms, sys: 2.21 ms, total: 12.4 ms
Wall time: 11.5 ms


In [80]:
%%time
# generate profile
all_edopt = gen_school_choices(school_ids, [0,1,0])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_edopt.npy", all_edopt)
print("saved at", "/Data/Generated/school_rankings_edopt.npy")

saved at /Data/Generated/school_rankings_edopt.npy
CPU times: user 10 ms, sys: 1.8 ms, total: 11.8 ms
Wall time: 10.4 ms


In [81]:
%%time
# generate profile
all_screen = gen_school_choices(school_ids, [0,0,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_screen.npy", all_screen)
print("saved at", "/Data/Generated/school_rankings_screen.npy")

saved at /Data/Generated/school_rankings_screen.npy
CPU times: user 9.86 ms, sys: 1.73 ms, total: 11.6 ms
Wall time: 10.5 ms


In [82]:
%%time
# generate profile
all_combo = gen_school_choices(school_ids, [1,1,1])
# saving school rankings
np.save(cwd+"/Data/Generated/school_rankings_combo.npy", all_combo)
print("saved at", "/Data/Generated/school_rankings_combo.npy")

# NOTE: EVERYTHING <400ms

saved at /Data/Generated/school_rankings_combo.npy
CPU times: user 17.8 ms, sys: 2.74 ms, total: 20.5 ms
Wall time: 19.4 ms


In [83]:
print(len(student_df), "students ranked alongside", len(all_combo), "schools")

71250 students ranked alongside 437 schools


In [None]:
def insert_student(student_data, df):
    """ insert a student into a ranked category """
    # TODO: find a more efficient way of doing this
    return ...

In [None]:
# TODO collinearity matrix between ranking and student attributes like test scores/lottery number
# TODO: add gale-shapley priority => 'set asside'

# Evaluate Data

## NYC 2023 Schools' Data

In [6]:
# looking at real admissions data
school_info = np.load("Data/schools_info.npy", allow_pickle=True).item()
# {name: (dbn, capacity, true applicants, total applicants)}
temp = []
for name, data in school_info.items():  # transfer to df
    temp.append([name]+[d for d in data])
school_df = pd.DataFrame(temp, columns=["name", "dbn", "capacity", "true_applicants", "total_applicants"])
like = []
pop = []
for i, row in school_df.iterrows():
    if row["capacity"]==0:
        pop.append(0)
    else:
        pop.append(row["total_applicants"] / row["capacity"])
    if row["total_applicants"]==0:
        like.append(0)
    else:
        like.append(row["true_applicants"] / row["total_applicants"])
school_df["likeability"] = like   # probability that this school is the student's first choice/high on their list
school_df["popularity"] = pop  # expected number of applicants per capacity slot
# Note: true applicants not found yet
school_df

Unnamed: 0,name,dbn,capacity,true_applicants,total_applicants,likeability,popularity
0,Orchard Collegiate Academy,01M292,90,95,361,0.263158,4.011111
1,University Neighborhood High School,01M448,105,314,1625,0.193231,15.476190
2,East Side Community School,01M450,95,333,1011,0.329377,10.642105
3,Forsyth Satellite Academy,01M458,28,6,59,0.101695,2.107143
4,Lower East Side Preparatory High School,01M515,60,28,173,0.161850,2.883333
...,...,...,...,...,...,...,...
434,EBC High School for Public Service - Bushwick,32K545,126,107,534,0.200375,4.238095
435,The Brooklyn School for Social Justice,32K549,115,50,357,0.140056,3.104348
436,The Academy of Urban Planning and Engineering,32K552,113,171,922,0.185466,8.159292
437,All City Leadership Secondary School,32K554,62,291,963,0.302181,15.532258


In [7]:
school_df.describe()

Unnamed: 0,capacity,true_applicants,total_applicants,likeability,popularity
count,439.0,439.0,439.0,439.0,439.0
mean,166.191344,368.667426,1119.630979,0.252079,7.220008
std,167.110769,706.236862,1396.186968,0.216636,7.510483
min,10.0,1.0,1.0,0.025641,0.011111
25%,92.5,63.0,375.5,0.157789,3.02
50%,108.0,128.0,652.0,0.212679,5.109589
75%,150.0,308.5,1211.0,0.302605,8.26387
max,1188.0,5863.0,10400.0,4.0,62.528


In [37]:
import plotly.express as px

px.scatter(school_df, x="likeability", y="popularity", color="capacity")
# px.scatter(school_df, x="total_applicants", y="popularity", color="capacity")

In [None]:
# feedback from a while ago:
# true applicant rate
# stage 0: capacity-weighted popularity
# making the mean lottery number values into medians
# making y-scale of graphs of stage 1&3 and 2&4
# revise intro slide description (not true applicant "rate", not changing matching algorithm, "ranking" --> "matching")

""" LONGTERM NOTES:
students in the same district will have similar patterns -- higher preference for ranking schools in their own districts or close to accesisble subway lines
clustering based on applicants per district
"""