In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import subprocess


def get_git_root():
    """Return the top-level directory of the enclosing git repository.

    Runs ``git rev-parse --show-toplevel`` with stderr discarded.

    Returns:
        The repository root as a string, or ``None`` when the command exits
        with a non-zero status or a ``FileNotFoundError`` is raised (for
        example because git is not installed).
    """
    command = ["git", "rev-parse", "--show-toplevel"]
    try:
        raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
    # check_output returns bytes; trim surrounding whitespace before decoding.
    return raw_output.strip().decode("utf-8")


# Switch the working directory to the repository root (when one is found) so
# that relative paths resolve from there regardless of where the notebook
# was opened.
git_root = get_git_root()
if not git_root:
    print("Not in a git repository or git not found")
else:
    os.chdir(git_root)
    print(f"Changed to git root: {os.getcwd()}")

Changed to git root: /Users/sherman/GitHub/CSE-6748


In [3]:
import pickle

import numpy as np
import pandas as pd

import src.features.illuminate as illuminate
import src.features.index as index
import src.features.milestone as milestone
import src.features.rit as rit
import src.features.student as student
from src.feasy.sparkle import Sparkle
from src.utils import create_student_features, get_feature_functions_from_module

pd.set_option("display.max_columns", None)
pd.set_option("future.no_silent_downcasting", True)
pd.set_option("mode.copy_on_write", True)

In [4]:
# Load the student profile table from a pickle file (path is relative to the
# current working directory) and preview the first rows.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code --
# only load files that come from a trusted source.
student_profile = pd.read_pickle("data/cleaned/student_profile_dataset.pkl")
student_profile.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
2,107055,6,N,Y,N,Transferred from another GA district,Year End,2023-07-19,2023,2024,Open,Woodland Middle,Middle School,Open,Woodland Middle,Middle School,,,NestedData(SubjectDesc=['English Language Arts...,NestedData(responsedatevalue=[datetime.date(20...,,
3,114271,12,N,Y,Y,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Johns Creek High,High School,,,,,,,,,
4,117427,12,N,Y,N,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Northview High,High School,,,,,,,,,
5,117773,12,N,Y,N,Continuing in same school,Court Order or Legal Requirement,2021-07-24,2021,2022,Open,Centennial High,High School,,,,,,,,,
6,117995,12,N,Y,N,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Roswell High,High School,,,,,,,,,


In [5]:
useful_cols = [
    "segment",
    "std_level",
    "math_lower_threshold",
    "math_upper_threshold",
    "verbal_lower_threshold",
    "verbal_upper_threshold",
]

# Read the per-segment uncertainty thresholds and keep only the rows at
# std_level 1.0, restricted to the columns listed above.
sat_uncertainty_all_levels = pd.read_parquet("data/cleaned/sat_uncertainty.parquet")
is_one_std_level = sat_uncertainty_all_levels["std_level"] == 1.0
sat_uncertainty = sat_uncertainty_all_levels.loc[is_one_std_level, useful_cols]
sat_uncertainty.head()

Unnamed: 0,segment,std_level,math_lower_threshold,math_upper_threshold,verbal_lower_threshold,verbal_upper_threshold
1,RIT+ILL+MS,1.0,-59.8727,60.9763,-52.2708,54.581
7,RIT,1.0,-50.6868,58.7162,-54.6188,77.7364
13,RIT+MS,1.0,-66.599,61.9324,-47.4495,58.6915
19,RIT+ILL,1.0,-74.3577,103.5342,-93.4608,78.8336
25,ILL+MS,1.0,-83.9609,92.9813,-45.0,46.5888


In [6]:
# The current cohort is every enrolment row (all years) belonging to a
# student who has at least one row flagged CurrentEnrollment == "Y".
is_currently_enrolled = student_profile["CurrentEnrollment"] == "Y"
current_enrolment = student_profile[is_currently_enrolled]
current_students = current_enrolment["mask_studentpersonkey"].unique()

belongs_to_current_student = student_profile["mask_studentpersonkey"].isin(
    current_students
)
current_cohort = student_profile[belongs_to_current_student]
current_cohort.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
47,1388557,KK,N,Y,N,Never attended school before,Year End,2022-07-21,2022,2023,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,Y,,,,,
48,1388557,1,N,Y,N,Continuing in same school,Year End,2023-07-19,2023,2024,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,,,,,,
49,1388557,2,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,Open,FAST Fulton Academy of Science and Technology ES,Elementary School,,,,,,
51,1389182,9,N,Y,N,Transferred from private school,Year End,2022-07-21,2022,2023,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,"NestedData(SubjectArea=['Electives', 'ELA', 'W...","NestedData(SubjectDesc=['Biology'], TestingDat...",,,"NestedData(TestDateId=['11700', '11700', '1184..."
52,1389182,10,N,Y,N,Continuing in same school,Year End,2023-07-19,2023,2024,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,,,,"NestedData(MathScore=[500], VerbalScore=[480],...","NestedData(TestDateId=['12068', '12069'], Test..."


In [7]:
# Partition the cohort rows by whether the row carries SAT data.
sat_is_missing = current_cohort["sat"].isna()
current_cohort_with_sat = current_cohort[~sat_is_missing]
current_cohort_no_sat = current_cohort[sat_is_missing]

In [8]:
# For each student that has SAT data, keep the SAT record from the most
# recent school year in which one exists.
sat_rows_sorted = current_cohort_with_sat.sort_values(
    by=["mask_studentpersonkey", "SchoolYearNumberFall"],
    ascending=True,
)
last_sat_row_per_student = (
    sat_rows_sorted.groupby(["mask_studentpersonkey"]).last().reset_index()
)
students_with_existing_sat = last_sat_row_per_student.filter(
    items=["mask_studentpersonkey", "SchoolYearNumberFall", "sat"]
).rename(columns={"sat": "latest_sat"})

students_with_existing_sat

Unnamed: 0,mask_studentpersonkey,SchoolYearNumberFall,latest_sat
0,1389182,2024,"NestedData(MathScore=[550], VerbalScore=[600],..."
1,1390795,2024,"NestedData(MathScore=[560], VerbalScore=[550],..."
2,1392398,2024,"NestedData(MathScore=[610], VerbalScore=[680],..."
3,1395580,2022,"NestedData(MathScore=[640], VerbalScore=[690],..."
4,1406902,2024,"NestedData(MathScore=[560], VerbalScore=[530],..."
...,...,...,...
6945,863906,2024,"NestedData(MathScore=[720], VerbalScore=[640],..."
6946,866919,2024,"NestedData(MathScore=[580, 690], VerbalScore=[..."
6947,866923,2024,"NestedData(MathScore=[660], VerbalScore=[700],..."
6948,869947,2023,"NestedData(MathScore=[600], VerbalScore=[550],..."


In [None]:
# One row per student, built from the cohort sorted by school year.
# NOTE(review): GroupBy.last() returns the last non-null value of each column
# within a group, so a row here can combine values from different school
# years -- confirm that this carry-forward is intended.
latest_current_cohort = (
    current_cohort.sort_values(
        ["mask_studentpersonkey", "SchoolYearNumberFall"],
        ascending=True,
    )
    .groupby(["mask_studentpersonkey"])
    .last()
    .reset_index()
)


def _last_nested_score(nested, field):
    """Return the final entry of ``nested.<field>``, or None when ``nested`` is missing."""
    if pd.isna(nested):
        return None
    return getattr(nested, field)[-1]


# latest_sat   -> SAT record taken in the student's latest school year (if any)
# previous_sat -> most recent SAT record from any year, plus the year it was taken
sat_data = (
    pd.merge(
        latest_current_cohort,
        students_with_existing_sat,
        on=["mask_studentpersonkey", "SchoolYearNumberFall"],
        how="left",
        validate="1:1",
    )
    .merge(
        students_with_existing_sat.rename(
            columns={
                "latest_sat": "previous_sat",
                "SchoolYearNumberFall": "previous_sat_year",
            }
        ),
        on="mask_studentpersonkey",
        how="left",
        validate="1:1",
    )
    .filter(
        ["mask_studentpersonkey", "latest_sat", "previous_sat", "previous_sat_year"]
    )
)

# Extract the last math/verbal score from each nested SAT record.
for sat_col in ("latest_sat", "previous_sat"):
    for subject, field in (("math", "MathScore"), ("verbal", "VerbalScore")):
        sat_data[f"{sat_col}_{subject}_score"] = sat_data[sat_col].apply(
            _last_nested_score, args=(field,)
        )

# Only the year/score columns are numeric. The student key is deliberately
# left out of the integer cast so it is never round-tripped through int
# (which would strip leading zeros or fail on non-numeric keys).
int_cols = [
    "previous_sat_year",
    "latest_sat_math_score",
    "latest_sat_verbal_score",
    "previous_sat_math_score",
    "previous_sat_verbal_score",
]
sat_data = (
    sat_data.drop(columns=["latest_sat", "previous_sat"])
    .fillna(-1)  # -1 marks "no SAT record"
    .astype({col: int for col in int_cols})
    .astype({"mask_studentpersonkey": str})
)
sat_data.head()

Unnamed: 0,mask_studentpersonkey,previous_sat_year,latest_sat_math_score,latest_sat_verbal_score,previous_sat_math_score,previous_sat_verbal_score
0,1388557,-1,-1,-1,-1,-1
1,1389182,2024,550,600,550,600
2,1389218,-1,-1,-1,-1,-1
3,1389220,-1,-1,-1,-1,-1
4,1389221,-1,-1,-1,-1,-1


In [None]:
# Load the model
with open("models/catboost_model.pkl", "rb") as f:
    model = pickle.load(f)

print("Model loaded successfully")

Model loaded successfully


In [11]:
# Collect the feature functions exposed by each project feature module.
index_feat_funcs = get_feature_functions_from_module(index)
student_feat_funcs = get_feature_functions_from_module(student)
illuminate_feat_funcs = get_feature_functions_from_module(illuminate)
milestone_feat_funcs = get_feature_functions_from_module(milestone)
rit_feat_funcs = get_feature_functions_from_module(rit)

# Concatenate them in a fixed order: index, student, illuminate, milestone, rit.
# NOTE(review): presumably this order determines the column order of the
# generated feature table -- confirm against Sparkle.
features = (
    index_feat_funcs
    + student_feat_funcs
    + illuminate_feat_funcs
    + milestone_feat_funcs
    + rit_feat_funcs
)

In [None]:
# One row per student, built from the cohort sorted by school year.
# NOTE(review): GroupBy.last() returns the last non-null value of each column
# within a group, so the assessment columns below may come from an earlier
# school year than the row's SchoolYearNumberFall -- confirm this is intended.
latest_year_per_student = (
    current_cohort.sort_values(
        ["mask_studentpersonkey", "SchoolYearNumberFall"],
        ascending=True,
    )
    .groupby("mask_studentpersonkey")
    .last()
    .reset_index()
)

has_illuminate = latest_year_per_student["illuminate"].notna()
has_milestone = latest_year_per_student["milestone"].notna()
has_rit = latest_year_per_student["rit"].notna()
# A student qualifies when ANY of the three assessment sources is present.
# All three flags must take part in the OR; otherwise students without RIT
# data are dropped even if they have illuminate or milestone data.
has_assessments = has_illuminate | has_milestone | has_rit

# assign() aligns the flag Series on the index, so each kept row receives its
# own flags.
current_cohort_with_assessments = latest_year_per_student[has_assessments].assign(
    has_illuminate=has_illuminate,
    has_milestone=has_milestone,
    has_rit=has_rit,
)
current_cohort_with_assessments.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit,has_illuminate,has_milestone,has_rit
1,1389182,11,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,"NestedData(SubjectArea=['Electives', 'ELA', 'W...","NestedData(SubjectDesc=['Biology'], TestingDat...",,"NestedData(MathScore=[550], VerbalScore=[600],...","NestedData(TestDateId=['12449'], TestDurationM...",False,True,True
23,1389249,3,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,Wolf Creek Elementary,Elementary School,Open,Wolf Creek Elementary,Elementary School,Y,,,NestedData(responsedatevalue=[datetime.date(20...,,"NestedData(TestDateId=['12103', '12110'], Test...",True,False,True
31,1390724,7,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,Sandtown Middle,Middle School,Open,Sandtown Middle,Middle School,,,NestedData(SubjectDesc=['English Language Arts...,NestedData(responsedatevalue=[datetime.date(20...,,"NestedData(TestDateId=['12474'], TestDurationM...",True,True,True
35,1390795,11,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,North Springs High,High School,Open,North Springs High,High School,Y,"NestedData(SubjectArea=['Math', 'ELA', 'Electi...","NestedData(SubjectDesc=['Algebra I', 'Biology'...",NestedData(responsedatevalue=[datetime.date(20...,"NestedData(MathScore=[560], VerbalScore=[550],...","NestedData(TestDateId=['12071', '12071'], Test...",True,True,True
36,1390797,11,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,North Springs High,High School,Open,North Springs High,High School,,"NestedData(SubjectArea=['Math', 'Social Studie...","NestedData(SubjectDesc=['Biology', 'Algebra I'...",NestedData(responsedatevalue=[datetime.date(20...,,"NestedData(TestDateId=['12069', '12071', '1207...",True,True,True


In [None]:
students_with_assessments = current_cohort_with_assessments[
    "mask_studentpersonkey"
].unique()

# Keep the grade 9-12 rows (all years) of students that have assessment data.
has_assessment_data = current_cohort["mask_studentpersonkey"].isin(
    students_with_assessments
)
is_high_school_row = current_cohort["GradeLevel"].isin(["9", "10", "11", "12"])
sparkle_data = current_cohort[has_assessment_data & is_high_school_row]

print(sparkle_data.shape)
sparkle_data.head()

(65330, 22)


Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
51,1389182,9,N,Y,N,Transferred from private school,Year End,2022-07-21,2022,2023,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,"NestedData(SubjectArea=['Electives', 'ELA', 'W...","NestedData(SubjectDesc=['Biology'], TestingDat...",,,"NestedData(TestDateId=['11700', '11700', '1184..."
52,1389182,10,N,Y,N,Continuing in same school,Year End,2023-07-19,2023,2024,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,,,,"NestedData(MathScore=[500], VerbalScore=[480],...","NestedData(TestDateId=['12068', '12069'], Test..."
53,1389182,11,Y,Y,N,Continuing in same school,Year End,2024-07-19,2024,2025,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,,,,"NestedData(MathScore=[550], VerbalScore=[600],...","NestedData(TestDateId=['12449'], TestDurationM..."
199,1390795,9,N,Y,N,Transferred from another GA district,Year End,2022-07-21,2022,2023,Open,North Springs High,High School,Open,North Springs High,High School,Y,"NestedData(SubjectArea=['Math', 'ELA', 'Electi...","NestedData(SubjectDesc=['Algebra I', 'Biology'...",NestedData(responsedatevalue=[datetime.date(20...,,"NestedData(TestDateId=['11698', '11701', '1184..."
200,1390795,10,N,Y,N,Continuing in same school,Year End,2023-07-19,2023,2024,Open,North Springs High,High School,Open,North Springs High,High School,Y,,,,,"NestedData(TestDateId=['12071', '12071'], Test..."


In [None]:
# Run the collected feature functions over the high-school rows and
# materialise the result as a pandas table.
# NOTE(review): `features` is rebound here from the collection of feature
# functions to the resulting table, so re-running this cell on its own passes
# the table (not the functions) to Sparkle -- re-run the cell that builds the
# function list first.
sparkle = Sparkle(features).source(sparkle_data, from_dataframe=True)
features = sparkle.to_pandas()
features.head()

Unnamed: 0,SchoolYearFall,mask_studentpersonkey,RetainedFlag,GradeLevel,is_gifted,grade_mode_numeric,improvement_first_to_last,is_on_sat_grade_level,last_percent_correct,low_mastery_rate_below_50,mastery_rate_above_80,math_improvement_first_to_last,math_low_mastery_rate_below_50,math_mastery_rate_above_80,math_mean_item_pct_correct,math_percent_extension,math_percent_reteach,math_percent_review_practice,math_slope_percent_correct_over_time,math_std_item_pct_correct,math_total_points_earned,math_total_points_possible,math_weighted_percent_correct,mean_item_percent_correct,mean_standard_percent_correct,mean_std_pct_ela,mean_std_pct_math,num_assessments,num_items,percent_extension,percent_reteach,percent_review_practice,slope_percent_correct_over_time,std_item_percent_correct,total_points_earned,total_points_possible,verbal_improvement_first_to_last,verbal_low_mastery_rate_below_50,verbal_mastery_rate_above_80,verbal_mean_item_pct_correct,verbal_percent_extension,verbal_percent_reteach,verbal_percent_review_practice,verbal_slope_percent_correct_over_time,verbal_std_item_pct_correct,verbal_total_points_earned,verbal_total_points_possible,verbal_weighted_percent_correct,weighted_overall_percent_correct,achievement_level_mean_ela,achievement_level_mean_math,lexile_mean_ela,lexile_mean_math,mean_achievement_level_all,mean_lexile_score_all,mean_scale_score_all,num_subjects_tested,num_test_days,num_unique_test_dates,pct_proficient_all,pct_proficient_ela,pct_proficient_math,scale_score_improvement_all,scale_score_improvement_ela,scale_score_improvement_math,scale_score_mean_ela,scale_score_mean_math,scale_score_slope_all,scale_score_slope_ela,scale_score_slope_math,scale_score_std_ela,scale_score_std_math,std_scale_score_all,count_quintile_avg,count_quintile_hiavg,count_quintile_high,count_quintile_low,count_quintile_lowavg,max_rit_score,mean_percent_correct,mean_percentile,mean_rit_score,mean_test_duration,min_rit_score,num_tests,pct_quintile_high,std_percent_correct,std_per
centile,std_rit_score,std_test_duration
0,2022,1389182,N,9,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,623.0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,2,0,0,253.0,50.666667,78.666667,242.0,58.666667,232.0,3,0.666667,1.699673,15.369523,8.602325,18.116904
1,2023,1389182,N,10,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,247.0,51.0,82.5,243.0,65.5,239.0,2,1.0,2.0,1.5,4.0,9.5
2,2024,1389182,N,11,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,248.0,51.0,79.0,248.0,70.0,248.0,1,0.0,0.0,0.0,0.0,0.0
3,2022,1390795,N,9,Y,9,11.76,0,100.0,0.235955,0.730337,11.76,0.313433,0.656716,71.757463,0.656716,0.343284,0.0,0.140152,22.851185,912.0,1295.0,0.704247,78.176966,79.138652,98.485,72.786119,89,89,0.730337,0.269663,0.0,0.229855,22.821853,1256.0,1647.0,0.0,0.0,0.954545,97.727273,0.954545,0.045455,0.0,0.0,3.006536,344.0,352.0,0.977273,0.762599,0.0,3.0,0.0,0.0,3.0,0.0,577.5,2,1,1,1.0,0.0,1.0,3.0,0.0,0.0,0.0,576.0,3.0,0.0,0.0,0.0,0.0,1.5,0,2,1,0,0,253.0,53.333333,82.666667,243.333333,82.0,234.0,3,0.333333,4.642796,4.496913,7.760298,8.640988
4,2023,1390795,N,10,Y,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0,0,0,244.0,50.5,74.5,238.0,89.0,232.0,2,0.0,2.5,2.5,6.0,3.0


In [15]:
# Names of the student-key and school-year columns, passed to
# create_student_features as student_col / year_col.
STUDENT = "mask_studentpersonkey"
YEAR = "SchoolYearFall"

In [16]:
# Split the feature table into an index part (X_index, later used to recover
# the student key for each prediction) and the model input X.
# NOTE(review): presumably this yields one row per student; verify against
# create_student_features.
X_index, X = create_student_features(
    features,
    student_col=STUDENT,
    year_col=YEAR,
)

In [17]:
# Predict SAT scores and round them to whole numbers.
# NOTE(review): assumes the output is 2-D with one column per score --
# TODO confirm against the trained model.
sat_pred = model.predict(X)
sat_pred_int = np.round(sat_pred, 0).astype(int)

In [18]:
# Column 0 of the rounded model output is stored as the math score and
# column 1 as the verbal score, alongside the student key from X_index.
math_predictions = sat_pred_int[:, 0]
verbal_predictions = sat_pred_int[:, 1]

predictions = pd.DataFrame(
    {
        "mask_studentpersonkey": X_index["mask_studentpersonkey"],
        "pred_sat_math_score": math_predictions,
        "pred_sat_verbal_score": verbal_predictions,
    }
)

predictions.head()

Unnamed: 0,mask_studentpersonkey,pred_sat_math_score,pred_sat_verbal_score
0,1389182,567,589
1,1390795,574,596
2,1390797,502,541
3,1392398,593,621
4,1393973,506,531


In [19]:
def create_segment(row):
    """Build the segment label for a row from its ``has_*`` flags.

    Labels are emitted in the fixed order RIT, ILL, MS and joined with "+"
    (e.g. "RIT+MS"). A flag that is absent from ``row`` counts as False.

    Args:
        row: Mapping-like object supporting ``.get`` (e.g. a dict or a
            pandas Series).

    Returns:
        The joined label, or ``None`` when no flag is truthy.
    """
    flag_to_label = (
        ("has_rit", "RIT"),
        ("has_illuminate", "ILL"),
        ("has_milestone", "MS"),
    )
    labels = [label for flag, label in flag_to_label if row.get(flag, False)]
    if not labels:
        return None
    return "+".join(labels)

In [20]:
useful_cols = [
    "mask_studentpersonkey",
    "SchoolYearNumberFall",
    "GradeLevel",
    "CurrentEnrollment",
    "RetainedFlag",
    "SchoolDetailLevelDesc",
    "SchoolDetailReportName",
    "SchoolDetailLevelDescNextYear",
    "SchoolDetailReportNameNextYear",
    "ActiveGiftedStudentResultRecordFlag",
]

# Start from one row per student and left-join the assessment flags, the
# model predictions and the existing SAT results. validate="1:1" makes each
# merge fail loudly if either side has duplicate student keys.
chatbot_student_dataset = (
    pd.merge(
        latest_current_cohort[useful_cols],
        current_cohort_with_assessments[
            [
                "mask_studentpersonkey",
                "has_illuminate",
                "has_milestone",
                "has_rit",
            ]
        ],
        on="mask_studentpersonkey",
        how="left",
        validate="1:1",
    )
    .merge(
        predictions,
        on="mask_studentpersonkey",
        how="left",
        validate="1:1",
    )
    .merge(
        sat_data,
        on=["mask_studentpersonkey"],
        how="left",
        validate="1:1",
    )
)

# Students absent from current_cohort_with_assessments have no flags after
# the left merge; treat them as having no assessment data.
for flag_col in ("has_rit", "has_illuminate", "has_milestone"):
    chatbot_student_dataset[flag_col] = chatbot_student_dataset[flag_col].fillna(False)
chatbot_student_dataset["segment"] = chatbot_student_dataset.apply(create_segment, axis=1)

# Attach the per-segment uncertainty thresholds (one threshold row per segment).
chatbot_student_dataset = pd.merge(
    chatbot_student_dataset,
    sat_uncertainty,
    on="segment",
    how="left",
    validate="m:1",
)

# Bounds are computed from the still-unfilled prediction, so a student
# without a prediction gets the -1 sentinel for the bounds too, rather than
# "-1 + threshold". The sentinel is applied to the prediction column only
# after its bounds exist. astype(int) truncates toward zero.
for subject in ("math", "verbal"):
    pred_col = f"pred_sat_{subject}_score"
    prediction = chatbot_student_dataset[pred_col]
    for bound in ("upper", "lower"):
        threshold = chatbot_student_dataset[f"{subject}_{bound}_threshold"]
        chatbot_student_dataset[f"{bound}_pred_sat_{subject}_score"] = (
            (prediction + threshold).fillna(-1).astype(int)
        )
    chatbot_student_dataset[pred_col] = prediction.fillna(-1).astype(int)

ordered_cols = [
    "mask_studentpersonkey",
    "SchoolYearNumberFall",
    "GradeLevel",
    "CurrentEnrollment",
    "RetainedFlag",
    "SchoolDetailLevelDesc",
    "SchoolDetailReportName",
    "SchoolDetailLevelDescNextYear",
    "SchoolDetailReportNameNextYear",
    "ActiveGiftedStudentResultRecordFlag",
    "has_illuminate",
    "has_milestone",
    "has_rit",
    "latest_sat_math_score",
    "latest_sat_verbal_score",
    "previous_sat_year",
    "previous_sat_math_score",
    "previous_sat_verbal_score",
    "pred_sat_math_score",
    "pred_sat_verbal_score",
    "upper_pred_sat_math_score",
    "lower_pred_sat_math_score",
    "upper_pred_sat_verbal_score",
    "lower_pred_sat_verbal_score",
]

# Drop the helper columns (segment, thresholds) and fix the export order.
chatbot_student_dataset = chatbot_student_dataset[ordered_cols]
chatbot_student_dataset.head()

Unnamed: 0,mask_studentpersonkey,SchoolYearNumberFall,GradeLevel,CurrentEnrollment,RetainedFlag,SchoolDetailLevelDesc,SchoolDetailReportName,SchoolDetailLevelDescNextYear,SchoolDetailReportNameNextYear,ActiveGiftedStudentResultRecordFlag,has_illuminate,has_milestone,has_rit,latest_sat_math_score,latest_sat_verbal_score,previous_sat_year,previous_sat_math_score,previous_sat_verbal_score,pred_sat_math_score,pred_sat_verbal_score,upper_pred_sat_math_score,lower_pred_sat_math_score,upper_pred_sat_verbal_score,lower_pred_sat_verbal_score
0,1388557,2024,2,Y,N,Elementary School,FAST Fulton Academy of Science and Technology ES,Elementary School,FAST Fulton Academy of Science and Technology ES,Y,False,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1389182,2024,11,Y,N,High School,Johns Creek High,High School,Johns Creek High,,False,True,True,550,600,2024,550,600,567,589,628,500,647,541
2,1389218,2024,2,Y,N,Elementary School,Abbotts Hill Elementary,Elementary School,Abbotts Hill Elementary,,False,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1389220,2024,2,Y,N,Elementary School,Heards Ferry Elementary,Elementary School,Heards Ferry Elementary,,False,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1389221,2024,2,Y,N,Elementary School,Heards Ferry Elementary,Elementary School,Heards Ferry Elementary,,False,False,False,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [21]:
# Persist the final per-student dataset as a parquet file (path is relative
# to the current working directory); an existing file is overwritten.
chatbot_student_dataset.to_parquet("data/cleaned/chatbot_student_dataset.parquet")