# Import Libraries


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. `src.utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import subprocess


def get_git_root():
    """Return the top-level directory of the current git work tree.

    Runs ``git rev-parse --show-toplevel`` with stderr discarded.

    Returns:
        The command's output with surrounding whitespace stripped and decoded
        as UTF-8, or ``None`` when the command exits with a non-zero status
        or the ``git`` executable cannot be found.
    """
    command = ["git", "rev-parse", "--show-toplevel"]
    try:
        raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not inside a repository, or git is not installed.
        return None
    return raw_output.strip().decode("utf-8")


# Make the repository root the working directory. The cells below use
# repo-relative paths such as "data/internal/..." (presumably the reason for
# this step). When no root is found the working directory is left untouched.
git_root = get_git_root()
if not git_root:
    print("Not in a git repository or git not found")
else:
    os.chdir(git_root)
    print(f"Changed to git root: {os.getcwd()}")

Changed to git root: /Users/sherman/GitHub/CSE-6748


In [3]:
from datetime import datetime

import pandas as pd

from src.utils import nest

# Display settings: never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None
# Opt in to pandas' copy-on-write mode for the whole session.
pd.set_option("mode.copy_on_write", True)

# School Year Calendar


In [4]:
# School-year calendar: one row per school year, with its id, fall/spring
# year numbers and first day.
school_year = (
    pd.read_csv(
        "data/internal/SchoolYear.csv",
        usecols=[
            "SchoolYearId",
            "SchoolYearNumberFall",
            "SchoolYearNumberSpring",
            "FirstDayOfSchoolYear",
        ],
    )
    # The final row of the export is discarded.
    # NOTE(review): presumably a trailer/placeholder row -- confirm against the file.
    .iloc[:-1]
    # Keys are kept as text so they can be joined against other tables.
    .astype(
        {
            "SchoolYearId": str,
            "SchoolYearNumberFall": str,
            "SchoolYearNumberSpring": str,
        }
    )
    # Store plain `datetime.date` objects rather than timestamps.
    .assign(
        FirstDayOfSchoolYear=lambda frame: pd.to_datetime(
            frame["FirstDayOfSchoolYear"]
        ).dt.date
    )
)
school_year.head()

Unnamed: 0,SchoolYearId,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring
0,32,2021-07-24,2021,2022
1,33,2022-07-21,2022,2023
2,34,2023-07-19,2023,2024
3,35,2024-07-19,2024,2025
4,36,2025-08-15,2025,2026


In [5]:
# School-day calendar: maps a date id to its calendar date and to the
# fall/spring year numbers of the school year it belongs to.
school_day = (
    pd.read_csv(
        "data/internal/SchoolDay.csv",
        usecols=[
            "dateid",
            "SchoolYearNumberFall",
            "SchoolYearNumberSpring",
            "DateValue",
        ],
    )
    # The final row of the export is discarded.
    # NOTE(review): presumably a trailer/placeholder row -- confirm against the file.
    .iloc[:-1]
    .assign(
        DateValue=lambda frame: pd.to_datetime(frame["DateValue"]).dt.date,
        dateid=lambda frame: frame["dateid"].astype(str),
    )
    # Renamed to the key name used by the tables that join against this one.
    .rename(columns={"dateid": "TestDateId"})
)
school_day.head()

Unnamed: 0,TestDateId,DateValue,SchoolYearNumberFall,SchoolYearNumberSpring
0,-1,1900-01-01,0,1991
1,1,1990-08-07,1990,1991
2,2,1990-08-08,1990,1991
3,3,1990-08-09,1990,1991
4,4,1990-08-10,1990,1991


# School Dimension


In [6]:
# School dimension: keep schools whose status is "Open", one row per school
# id (first occurrence wins), with the id stored as text.
school_dim = (
    pd.read_csv(
        "data/internal/DimSchool.csv",
        usecols=[
            "SchoolDetailFCSId",
            "SchoolDetailLevelDesc",
            "SchoolDetailReportName",
            "SchoolStatusDesc",
        ],
    )
    .loc[lambda frame: frame["SchoolStatusDesc"] == "Open"]
    .drop_duplicates(subset=["SchoolDetailFCSId"])
    .astype({"SchoolDetailFCSId": str})
)
school_dim.head()

Unnamed: 0,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailFCSId,SchoolDetailLevelDesc
12,Open,High Point Elementary,228,Elementary School
13,Open,Mimosa Elementary,288,Elementary School
14,Open,Evoline C. West Elementary School,648,Elementary School
15,Open,College Park Elementary,48,Elementary School
16,Open,Brookview Elementary,54,Elementary School


# Enrolment Reason


In [7]:
# Lookup table: enrollment reason id -> code / description / active flag.
enrolment_reason = pd.read_csv(
    filepath_or_buffer="data/internal/EnrollmentReason.csv",
)
enrolment_reason.head()

Unnamed: 0,EnrollmentReasonId,EnrollmentReasonCode,EnrollmentReasonDesc,ActiveDimEnrollmentReasonRecordFlag
0,1,6,Displaced due to natural disaster,Y
1,2,A,Admitted from home school,Y
2,3,B,"Re-entered after withdrawal, this school this ...",Y
3,4,C,Continuing in same school,Y
4,5,D,Entered from a Department of Defense School,Y


# Withdrawal Reason


In [8]:
# Lookup table: withdrawal reason id -> code / description / active flag.
withdrawal_reason = pd.read_csv(
    filepath_or_buffer="data/internal/WithdrawalReason.csv",
)
withdrawal_reason.head()

Unnamed: 0,WithdrawalReasonId,WithDrawalReasonCode,WithDrawalReasonDesc,ActiveDimWithdrawalReasonRecordFlag
0,1,YR,Year End,Y
1,2,1,SB10 Public Schools Transfer,Y
2,3,2,School Choice Transfer,Y
3,4,3,USCO,Y
4,5,4,Transferred Under the Jurisdiction of DJJ,Y


# Students


In [9]:
# Enrollment records, restricted to each student's final primary enrollment
# of the year.
#
# Identifier-like columns are read as text up front. Without an explicit
# dtype, pandas infers types chunk by chunk, which produced a mixed-type
# warning on this file and can leave a column holding both numbers and
# strings (e.g. `SchoolDetailFCSIdNextYear` contains ids as well as "Unk").
# NOTE(review): `SchoolDetailFCSIdNextYear` is now uniformly text -- confirm
# no later cell compares it against integers.
enrolment = pd.read_csv(
    "data/internal/Enrollment.csv",
    usecols=[
        "mask_studentpersonkey",
        "SchoolYearId",
        "SchoolDetailFCSId",
        "GradeLevel",
        "EnrollmentReasonId",
        "SchoolDetailFCSIdNextYear",
        "RetainedFlag",
        "CurrentEnrollment",
        "FinalPrimaryEnrollmentForYearFlag",
        "WithdrawalReasonId",
    ],
    dtype={
        "mask_studentpersonkey": str,
        "SchoolYearId": str,
        "SchoolDetailFCSId": str,
        "GradeLevel": str,
        "SchoolDetailFCSIdNextYear": str,
    },
)

enrolment = enrolment[enrolment["FinalPrimaryEnrollmentForYearFlag"] == "Y"]

# Kept from the original cell so that a missing key is still represented by
# the string "nan" rather than a null value.
enrolment["mask_studentpersonkey"] = enrolment["mask_studentpersonkey"].astype(str)
enrolment["SchoolDetailFCSId"] = enrolment["SchoolDetailFCSId"].astype(str)
enrolment["SchoolYearId"] = enrolment["SchoolYearId"].astype(str)
enrolment["GradeLevel"] = enrolment["GradeLevel"].astype(str)
enrolment.head()

  enrolment = pd.read_csv(


Unnamed: 0,SchoolYearId,mask_studentpersonkey,SchoolDetailFCSId,GradeLevel,EnrollmentReasonId,WithdrawalReasonId,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,SchoolDetailFCSIdNextYear
0,34,107055,693,6,14,1,N,Y,N,693
2,32,117995,852,12,4,13,N,Y,N,Unk
3,32,114271,741,12,4,13,N,Y,Y,Unk
4,32,117427,751,12,4,13,N,Y,N,Unk
5,32,117773,866,12,4,9,N,Y,N,Unk


In [10]:
# Replace the enrollment / withdrawal reason ids with their descriptions.
# Both joins are left joins, so every enrollment row is kept even when a
# reason id has no match in the lookup table.
enrolment = enrolment.merge(enrolment_reason, how="left", on="EnrollmentReasonId")
enrolment = enrolment.merge(withdrawal_reason, how="left", on="WithdrawalReasonId")

# Only the *Desc columns from the lookups are retained.
enrolment = enrolment.drop(
    columns=[
        "EnrollmentReasonId",
        "WithdrawalReasonId",
        "EnrollmentReasonCode",
        "WithDrawalReasonCode",
        "ActiveDimEnrollmentReasonRecordFlag",
        "ActiveDimWithdrawalReasonRecordFlag",
    ]
)

enrolment.head()

Unnamed: 0,SchoolYearId,mask_studentpersonkey,SchoolDetailFCSId,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,SchoolDetailFCSIdNextYear,EnrollmentReasonDesc,WithDrawalReasonDesc
0,34,107055,693,6,N,Y,N,693,Transferred from another GA district,Year End
1,32,117995,852,12,N,Y,N,Unk,Continuing in same school,High School Graduation
2,32,114271,741,12,N,Y,Y,Unk,Continuing in same school,High School Graduation
3,32,117427,751,12,N,Y,N,Unk,Continuing in same school,High School Graduation
4,32,117773,866,12,N,Y,N,Unk,Continuing in same school,Court Order or Legal Requirement


# Gifted


In [11]:
# Gifted results: student key, test date id and the record's active flag.
# Both keys are stored as text.
gifted = pd.read_csv(
    "data/internal/Gifted Student Results.csv",
    usecols=[
        "mask_studentpersonkey",
        "TestDateId",
        "ActiveGiftedStudentResultRecordFlag",
    ],
).astype({"mask_studentpersonkey": str, "TestDateId": str})
gifted.head()

Unnamed: 0,TestDateId,ActiveGiftedStudentResultRecordFlag,mask_studentpersonkey
0,11475,D,330436
1,11475,D,333597
2,11475,D,577330
3,11475,D,576690
4,11475,D,575550


In [12]:
# Attach the fall year number of the school year in which each gifted test
# date falls, then keep one row per (student, fall year).
#
# The year is cast to text *before* the left join. `SchoolYearNumberFall` is
# an integer column in `school_day`; if even one test date has no match, the
# join would introduce a missing value, pandas would upcast the whole column
# to float, and the later `astype(str)` would turn every year into e.g.
# "2022.0". Joining on an already-textual lookup keeps matched years as "2022".
gifted = (
    gifted.merge(
        school_day[["TestDateId", "SchoolYearNumberFall"]].astype(
            {"SchoolYearNumberFall": str}
        ),
        on="TestDateId",
        how="left",
        # Each test date id must map to at most one school day.
        validate="m:1",
    )
    .drop(columns=["TestDateId"])
    .drop_duplicates(
        subset=["mask_studentpersonkey", "SchoolYearNumberFall"],
        keep="first",
    )
)

# Unmatched test dates are still represented by the string "nan", as before.
gifted["SchoolYearNumberFall"] = gifted["SchoolYearNumberFall"].astype(str)

# One list-valued row per (student, fall year); after the de-duplication
# above each list holds a single flag.
gifted_agg = gifted.groupby(["mask_studentpersonkey", "SchoolYearNumberFall"]).agg(list)
gifted_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ActiveGiftedStudentResultRecordFlag
mask_studentpersonkey,SchoolYearNumberFall,Unnamed: 2_level_1
1388557,2022,[Y]
1389225,2022,[Y]
1389225,2023,[Y]
1389225,2024,[Y]
1389227,2022,[Y]


# Graduation Summary


In [13]:
# Graduation credit summary per student, school, year and subject area.
# Original author's note: only for final year students.
graduation_summary = pd.read_csv(
    "data/internal/GraduationAreaSummary.csv",
    usecols=[
        "mask_studentpersonkey",
        "CurrentSchoolDetailFCSId",
        "SchoolYearNumberFall",
        "SubjectArea",
        "SubjectAreaCreditRequired",
        "AreaCredits",
        "AreaCreditStillNeeded",
    ],
).astype({"SchoolYearNumberFall": str})  # text key, matching `school_year`
graduation_summary.head()

Unnamed: 0,mask_studentpersonkey,CurrentSchoolDetailFCSId,SchoolYearNumberFall,SubjectArea,SubjectAreaCreditRequired,AreaCredits,AreaCreditStillNeeded
0,350335,741,2023,Math,4,1.5,2.5
1,1946451,870,2022,Health/ PersonalFitness,1,0.5,0.5
2,1987664,880,2024,World Language/ FineArts/ CareerTech,3,1.0,2.0
3,1521002,804,2024,World Language/ FineArts/ CareerTech,3,1.5,1.5
4,317341,751,2021,Social Studies,3,2.5,0.5


In [14]:
# Translate the fall year number into a SchoolYearId via the school-year
# calendar, align the school column name with the other tables, and store
# all three keys as text.
graduation_summary = (
    graduation_summary.merge(school_year, how="left", on="SchoolYearNumberFall")
    .drop(columns=["SchoolYearNumberFall", "SchoolYearNumberSpring"])
    .rename(columns={"CurrentSchoolDetailFCSId": "SchoolDetailFCSId"})
    .astype(
        {
            "mask_studentpersonkey": str,
            "SchoolDetailFCSId": str,
            "SchoolYearId": str,
        }
    )
)

# Collapse to one row per (student, school, school year); every remaining
# column becomes a list with one entry per original row.
grad_agg = graduation_summary.groupby(
    by=["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearId"]
).aggregate(list)

grad_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SubjectArea,SubjectAreaCreditRequired,AreaCredits,AreaCreditStillNeeded,FirstDayOfSchoolYear
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1389182,741,33,"[Electives, ELA, World Language/ FineArts/ Car...","[4, 4, 3, 4, 1, 3, 4]","[2.0, 3.0, 4.5, 2.5, 0.5, 2.0, 2.5]","[2.0, 1.0, 0.0, 1.5, 0.5, 1.0, 1.5]","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1390795,816,33,"[Math, ELA, Electives, World Language/ FineArt...","[4, 4, 4, 3, 4, 3]","[3.5, 2.5, 7.5, 7.5, 4.5, 1.5]","[0.5, 1.5, 0.0, 0.0, 0.0, 1.5]","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1390797,816,33,"[Math, Social Studies, ELA, Science, Electives...","[4, 3, 4, 4, 4, 3, 1]","[2.5, 1.5, 2.5, 3.0, 7.0, 5.5, 0.5]","[1.5, 1.5, 1.5, 1.0, 0.0, 0.0, 0.5]","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1392398,804,33,"[Electives, Health/ PersonalFitness, World Lan...","[4, 1, 3, 4, 4, 4, 3]","[3.0, 0.0, 4.5, 2.5, 2.5, 2.5, 1.5]","[1.0, 1.0, 0.0, 1.5, 1.5, 1.5, 1.5]","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1393973,743,34,"[Health/ PersonalFitness, Math, Social Studies...","[1, 4, 3, 4, 3, 4, 4]","[0.5, 1.5, 1.0, 1.5, 3.0, 0.0, 1.5]","[0.5, 2.5, 2.0, 2.5, 0.0, 4.0, 2.5]","[2023-07-19, 2023-07-19, 2023-07-19, 2023-07-1..."


In [15]:
# Pack every list-valued column of `grad_agg` into a single "grad" column
# using the project helper `nest`.
grad_agg_nest = nest(grad_agg, "grad", list(grad_agg.columns))
grad_agg_nest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,grad
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1
1389182,741,33,"NestedData(SubjectArea=['Electives', 'ELA', 'W..."
1390795,816,33,"NestedData(SubjectArea=['Math', 'ELA', 'Electi..."
1390797,816,33,"NestedData(SubjectArea=['Math', 'Social Studie..."
1392398,804,33,"NestedData(SubjectArea=['Electives', 'Health/ ..."
1393973,743,34,NestedData(SubjectArea=['Health/ PersonalFitne...


# Georgia Milestones Scores


In [16]:
# Georgia Milestones results: one row per student, test date and subject.
milestone_scores = pd.read_csv(
    filepath_or_buffer="data/internal/Georgia Milestones Scores.csv",
    usecols=[
        # keys
        "StudentPersonKey_mask",
        "SchoolDetailFCSId",
        "SchoolYearId",
        "TestingDateId",
        # test description
        "SubjectDesc",
        "TestGrade",
        # results
        "ScaleScore",
        "AchievementLevel",
        "GradeConversionScore",
        "LexileScore",
    ],
)

milestone_scores.head()

Unnamed: 0,SubjectDesc,SchoolYearId,TestingDateId,StudentPersonKey_mask,SchoolDetailFCSId,ScaleScore,AchievementLevel,GradeConversionScore,LexileScore,TestGrade
0,Algebra I,32,11319,356119,804,421,1,54.0,0.0,11
1,American Literature & Composition,32,11319,217864,852,497,2,73.0,1165.0,11
2,American Literature & Composition,32,11319,237883,852,597,4,92.0,1800.0,12
3,Algebra I,32,11319,236191,852,493,2,72.0,0.0,12
4,Biology,32,11319,196546,852,464,1,65.0,0.0,11


In [17]:
# Use the same student-key column name as the other tables and store the
# three grouping keys as text.
milestone_scores = milestone_scores.rename(
    columns={"StudentPersonKey_mask": "mask_studentpersonkey"}
).astype(
    {
        "mask_studentpersonkey": str,
        "SchoolDetailFCSId": str,
        "SchoolYearId": str,
    }
)

# Sort by testing date within each (student, school, year) so the per-group
# lists come out in date order, then collapse each group to one row.
milestone_agg = (
    milestone_scores.sort_values(
        [
            "mask_studentpersonkey",
            "SchoolDetailFCSId",
            "SchoolYearId",
            "TestingDateId",
        ]
    )
    .groupby(by=["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearId"])
    .aggregate(list)
)

milestone_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SubjectDesc,TestingDateId,ScaleScore,AchievementLevel,GradeConversionScore,LexileScore,TestGrade
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
107055,693,34,"[English Language Arts, Mathematics]","[12299, 12299]","[490, 528]","[2, 3]","[nan, nan]","[760.0, nan]","[6, 6]"
1387704,693,32,"[English Language Arts, Mathematics]","[11571, 11571]","[566, 481]","[3, 2]","[nan, nan]","[1155.0, nan]","[6, 6]"
1387704,693,33,"[English Language Arts, Mathematics]","[11935, 11935]","[508, 499]","[2, 2]","[nan, nan]","[1095.0, nan]","[7, 7]"
1389182,741,33,[Biology],[11942],[623],[4],[93.0],[0.0],[9]
1389243,695,33,"[English Language Arts, Mathematics]","[11935, 11935]","[513, 524]","[2, 2]","[nan, nan]","[1095.0, nan]","[7, 7]"


In [18]:
# Pack every list-valued column of `milestone_agg` into a single "milestone"
# column using the project helper `nest`.
milestone_agg_nest = nest(milestone_agg, "milestone", list(milestone_agg.columns))
milestone_agg_nest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,milestone
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1
107055,693,34,NestedData(SubjectDesc=['English Language Arts...
1387704,693,32,NestedData(SubjectDesc=['English Language Arts...
1387704,693,33,NestedData(SubjectDesc=['English Language Arts...
1389182,741,33,"NestedData(SubjectDesc=['Biology'], TestingDat..."
1389243,695,33,NestedData(SubjectDesc=['English Language Arts...


# SAT Scores


In [19]:
# SAT summary results: one row per student, school and testing date.
sat_scores = pd.read_csv(
    filepath_or_buffer="data/internal/Fact_SATSummaryTest.csv",
    usecols=[
        # keys
        "mask_studentpersonkey",
        "SchoolDetailFCSId",
        "TestingDateId",
        # scores
        "MathScore",
        "VerbalScore",
        "TotalScore",
        # percentiles
        "MathPercentile",
        "VerbalPercentile",
    ],
)

sat_scores.head()

Unnamed: 0,TestingDateId,MathScore,VerbalScore,TotalScore,MathPercentile,VerbalPercentile,mask_studentpersonkey,SchoolDetailFCSId
0,10652,730,640,1370,97,88,220288,751
1,11016,730,660,1390,97,92,306968,754
2,11040,370,380,750,9,11,430528,914
3,11051,750,690,1440,98,96,219659,751
4,11051,640,640,1280,89,88,218026,751


In [20]:
# Store the keys as text and rename the date key to match `school_day`.
sat_scores = sat_scores.astype(
    {
        "mask_studentpersonkey": str,
        "SchoolDetailFCSId": str,
        "TestingDateId": str,
    }
).rename(columns={"TestingDateId": "TestDateId"})

# Attach the fall year number of the school year in which each test was taken.
#
# The year is cast to text *before* the left join. `SchoolYearNumberFall` is
# an integer column in `school_day`; if even one testing date has no match,
# the join would introduce a missing value, pandas would upcast the whole
# column to float, and the later `astype(str)` would turn every year into
# e.g. "2022.0". Joining on an already-textual lookup keeps matched years as
# "2022".
sat_scores = sat_scores.merge(
    school_day[["TestDateId", "SchoolYearNumberFall"]].astype(
        {"SchoolYearNumberFall": str}
    ),
    on="TestDateId",
    how="left",
    # Each test date id must map to at most one school day.
    validate="m:1",
).drop(columns=["TestDateId"])

# Unmatched testing dates are still represented by the string "nan", as before.
sat_scores["SchoolYearNumberFall"] = sat_scores["SchoolYearNumberFall"].astype(str)

In [21]:
# One row per (student, school, fall year); each score column becomes a list
# with one entry per SAT sitting in that group.
sat_agg = sat_scores.groupby(
    by=["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearNumberFall"]
).aggregate(list)

sat_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MathScore,VerbalScore,TotalScore,MathPercentile,VerbalPercentile
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearNumberFall,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100075,740,2021,[350],[410],[760],[5],[19]
104981,852,2020,[520],[500],[1020],[57],[48]
120717,845,2024,[520],[640],[1160],[57],[88]
128475,740,2022,[530],[530],[1060],[61],[58]
134389,751,2024,"[740, 730]","[470, 550]","[1210, 1280]","[98, 97]","[38, 65]"


In [22]:
# Pack every list-valued column of `sat_agg` into a single "sat" column
# using the project helper `nest`.
sat_agg_nest = nest(sat_agg, "sat", list(sat_agg.columns))
sat_agg_nest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sat
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearNumberFall,Unnamed: 3_level_1
100075,740,2021,"NestedData(MathScore=[350], VerbalScore=[410],..."
104981,852,2020,"NestedData(MathScore=[520], VerbalScore=[500],..."
120717,845,2024,"NestedData(MathScore=[520], VerbalScore=[640],..."
128475,740,2022,"NestedData(MathScore=[530], VerbalScore=[530],..."
134389,751,2024,"NestedData(MathScore=[740, 730], VerbalScore=[..."


# Illuminate Data


In [23]:
# Illuminate assessment responses for school year 2021-22.
#
# `low_memory=False` makes pandas infer each column's type from the whole
# file instead of chunk by chunk. With the default, this file triggered a
# mixed-type warning, meaning a column could hold numbers in some rows and
# strings in others depending on where chunk boundaries fell.
# NOTE(review): this trades higher peak memory for deterministic dtypes; if
# the offending column is known, an explicit `dtype=` is the cheaper fix.
illu_2022 = pd.read_csv(
    "data/internal/IlluminateData2022.csv",
    encoding="latin-1",
    usecols=[
        "Mask_StudentPersonkey",
        "schoolyearnumberspring",
        "CurrentSchoolDetailFCSID",
        "AssessmentGradeLevel",
        "GradeLevelDuringUnitTest",
        "Standard_Subject",
        "responsedatevalue",
        "condition",
        "Response_points",
        "Response_points_possible",
        "Response_percent_correct",
        "Standard_points",
        "Standard_points_possible",
        "Standard_percent_correct",
        "ContinuousAchievementLevel",
    ],
    low_memory=False,
)

illu_2022.head()

  illu_2022 = pd.read_csv(


Unnamed: 0,schoolyearnumberspring,Mask_StudentPersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2022,763538,2021-09-19 00:00:00,5.0,15.0,33.33,Mathematics,1.0,7.0,14.29,Reteach,3.0,On Level,3,492
1,2022,862425,2022-05-11 00:00:00,6.0,8.0,75.0,English Language Arts,1.0,1.0,100.0,Extension,6.0,On Level,6,696
2,2022,582651,2022-01-28 00:00:00,16.0,17.0,94.12,Mathematics,2.0,2.0,100.0,Extension,3.0,On Level,3,615
3,2022,584570,2021-10-29 00:00:00,7.0,10.0,70.0,English Language Arts,1.0,1.0,100.0,Extension,4.0,On Level,4,120
4,2022,550130,2022-02-23 00:00:00,13.0,15.0,86.67,Mathematics,2.0,2.0,100.0,Extension,7.0,Accelerated,8,686


In [24]:
# Illuminate assessment responses for school year 2022-23 (same column
# selection as the other yearly Illuminate extracts).
illu_2023 = pd.read_csv(
    filepath_or_buffer="data/internal/IlluminateData2023.csv",
    usecols=[
        # keys
        "Mask_StudentPersonkey",
        "CurrentSchoolDetailFCSID",
        "schoolyearnumberspring",
        "responsedatevalue",
        # grade-level columns
        "AssessmentGradeLevel",
        "GradeLevelDuringUnitTest",
        # Response_* columns
        "Response_points",
        "Response_points_possible",
        "Response_percent_correct",
        # Standard_* columns
        "Standard_Subject",
        "Standard_points",
        "Standard_points_possible",
        "Standard_percent_correct",
        # classification columns
        "condition",
        "ContinuousAchievementLevel",
    ],
    encoding="latin-1",
)

illu_2023.head()

Unnamed: 0,schoolyearnumberspring,Mask_StudentPersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2023,405692,2022-10-20 00:00:00,13.0,13.0,100.0,Mathematics,1.0,1.0,100.0,Extension,5,On Level,5,188
1,2023,772909,2023-02-24 00:00:00,12.0,15.0,80.0,Mathematics,3.0,3.0,100.0,Extension,8,On Level,8,706
2,2023,710128,2023-05-17 00:00:00,10.0,10.0,100.0,English Language Arts,1.0,1.0,100.0,Extension,5,On Level,5,672
3,2023,581938,2022-12-08 00:00:00,14.0,14.0,100.0,Mathematics,4.0,4.0,100.0,Extension,1,On Level,1,270
4,2023,459392,2022-11-18 00:00:00,9.0,17.0,52.94,Mathematics,1.0,2.0,50.0,Reteach,8,On Level,8,695


In [25]:
# Illuminate assessment responses for school year 2023-24 (same column
# selection as the other yearly Illuminate extracts).
illu_2024 = pd.read_csv(
    filepath_or_buffer="data/internal/IlluminateData2024.csv",
    usecols=[
        # keys
        "Mask_StudentPersonkey",
        "CurrentSchoolDetailFCSID",
        "schoolyearnumberspring",
        "responsedatevalue",
        # grade-level columns
        "AssessmentGradeLevel",
        "GradeLevelDuringUnitTest",
        # Response_* columns
        "Response_points",
        "Response_points_possible",
        "Response_percent_correct",
        # Standard_* columns
        "Standard_Subject",
        "Standard_points",
        "Standard_points_possible",
        "Standard_percent_correct",
        # classification columns
        "condition",
        "ContinuousAchievementLevel",
    ],
    encoding="latin-1",
)

illu_2024.head()

Unnamed: 0,schoolyearnumberspring,Mask_StudentPersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2024,416890,2023-09-22 00:00:00,10.0,15.0,66.67,Georgias K-12 Mathematics Standards,0.0,1.0,0.0,Reteach,6,On Level,6,692
1,2024,762380,2023-11-03 00:00:00,7.0,17.0,41.18,English Language Arts,1.0,1.0,100.0,Extension,11,Not Applicable,11,914
2,2024,712301,2023-10-31 00:00:00,5.0,13.0,38.46,English Language Arts,0.0,1.0,0.0,Reteach,5,Not Applicable,5,980
3,2024,472942,2024-02-08 00:00:00,12.0,13.0,92.31,English Language Arts,1.0,1.0,100.0,Extension,3,On Level,3,607
4,2024,575441,2023-09-14 00:00:00,9.0,10.0,90.0,English Language Arts,1.0,1.0,100.0,Extension,4,Advanced,4,607


In [26]:
# Illuminate assessment responses for school year 2024-25 (same column
# selection as the other yearly Illuminate extracts).
illu_2025 = pd.read_csv(
    filepath_or_buffer="data/internal/IlluminateData2025.csv",
    usecols=[
        # keys
        "Mask_StudentPersonkey",
        "CurrentSchoolDetailFCSID",
        "schoolyearnumberspring",
        "responsedatevalue",
        # grade-level columns
        "AssessmentGradeLevel",
        "GradeLevelDuringUnitTest",
        # Response_* columns
        "Response_points",
        "Response_points_possible",
        "Response_percent_correct",
        # Standard_* columns
        "Standard_Subject",
        "Standard_points",
        "Standard_points_possible",
        "Standard_percent_correct",
        # classification columns
        "condition",
        "ContinuousAchievementLevel",
    ],
    encoding="latin-1",
)

illu_2025.head()

Unnamed: 0,schoolyearnumberspring,Mask_StudentPersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2025,406954,2025-01-13 00:00:00,10.0,14.0,71.43,English Language Arts,2.0,2.0,100.0,Extension,6,On Level,6,682
1,2025,405935,2024-10-14 00:00:00,10.0,14.0,71.43,Georgias K-12 Mathematics Standards,2.0,2.0,100.0,Extension,6,On Level,6,697
2,2025,728618,2025-02-14 00:00:00,12.0,17.0,70.59,English Language Arts,1.0,1.0,100.0,Extension,9,On Level,10,845
3,2025,728056,2024-12-19 00:00:00,11.5,15.0,76.67,Georgias K-12 Mathematics Standards,3.0,3.0,100.0,Extension,6,Advanced,7,699
4,2025,728145,2025-02-11 00:00:00,12.5,13.0,96.15,English Language Arts,1.0,1.0,100.0,Extension,3,On Level,3,654


In [27]:
# Stack the four yearly extracts row-wise. Each frame keeps its own row
# labels (`ignore_index` is not set), so labels repeat across years.
illu = pd.concat(
    objs=[illu_2022, illu_2023, illu_2024, illu_2025],
    axis="index",
)
illu.head()

Unnamed: 0,schoolyearnumberspring,Mask_StudentPersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2022,763538,2021-09-19 00:00:00,5.0,15.0,33.33,Mathematics,1.0,7.0,14.29,Reteach,3.0,On Level,3,492
1,2022,862425,2022-05-11 00:00:00,6.0,8.0,75.0,English Language Arts,1.0,1.0,100.0,Extension,6.0,On Level,6,696
2,2022,582651,2022-01-28 00:00:00,16.0,17.0,94.12,Mathematics,2.0,2.0,100.0,Extension,3.0,On Level,3,615
3,2022,584570,2021-10-29 00:00:00,7.0,10.0,70.0,English Language Arts,1.0,1.0,100.0,Extension,4.0,On Level,4,120
4,2022,550130,2022-02-23 00:00:00,13.0,15.0,86.67,Mathematics,2.0,2.0,100.0,Extension,7.0,Accelerated,8,686


In [28]:
def standardize_grade_level_values(grade_value):
    """Standardize a raw grade-level value to a consistent string label.

    Parameters
    ----------
    grade_value : scalar
        Raw grade value, e.g. ``3``, ``3.0``, ``"03"``, ``"kk"`` or ``"Pre-K"``.

    Returns
    -------
    str or None
        * ``None`` when the value is missing according to ``pd.isna``.
        * ``"K"``, ``"PK"`` or ``"HS"`` for the recognised aliases
          (matched case-insensitively after stripping whitespace).
        * ``"1"`` .. ``"12"`` when the value parses as a number whose integer
          part lies in that range (fractional parts are truncated).
        * Otherwise the stripped string form of the input, unchanged.
    """
    if pd.isna(grade_value):
        return None

    # Work on a trimmed string; compute the upper-cased key only once.
    grade_str = str(grade_value).strip()
    grade_key = grade_str.upper()

    # Named (non-numeric) grade levels.
    if grade_key in ("KINDERGARTEN", "KK", "K"):
        return "K"
    if grade_key in ("PK", "PRE-K", "PREK"):
        return "PK"
    if grade_key in ("HS", "HIGH SCHOOL"):
        return "HS"

    # Numeric grades: going through float() accepts "3.0" and "03" alike.
    try:
        grade_num = int(float(grade_str))
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by int(float("inf")); treat such values
        # like any other non-standardizable input instead of crashing.
        return grade_str

    if 1 <= grade_num <= 12:
        return str(grade_num)

    # Out-of-range numbers are returned as the original (stripped) text.
    return grade_str

In [29]:
# Normalise both grade-level columns with the same element-wise cleaner.
for grade_column in ("AssessmentGradeLevel", "GradeLevelDuringUnitTest"):
    illu[grade_column] = illu[grade_column].apply(standardize_grade_level_values)

In [30]:
# Clean the key columns, harmonise column names with the other tables, then
# attach the school-year attributes and drop the calendar-year helper columns.
illu = (
    illu.assign(
        # Keep only the calendar date of each response.
        responsedatevalue=lambda df: pd.to_datetime(df["responsedatevalue"]).dt.date,
        # The join below is on SchoolYearNumberSpring, so cast the key to str first.
        schoolyearnumberspring=lambda df: df["schoolyearnumberspring"].astype(str),
    )
    .rename(
        columns={
            "schoolyearnumberspring": "SchoolYearNumberSpring",
            "Mask_StudentPersonkey": "mask_studentpersonkey",
            "CurrentSchoolDetailFCSID": "SchoolDetailFCSId",
        }
    )
    .merge(school_year, on=["SchoolYearNumberSpring"], how="left")
    .drop(columns=["SchoolYearNumberFall", "SchoolYearNumberSpring"])
)
illu.head()

Unnamed: 0,mask_studentpersonkey,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,SchoolDetailFCSId,SchoolYearId,FirstDayOfSchoolYear
0,763538,2021-09-19,5.0,15.0,33.33,Mathematics,1.0,7.0,14.29,Reteach,3,On Level,3,492,32,2021-07-24
1,862425,2022-05-11,6.0,8.0,75.0,English Language Arts,1.0,1.0,100.0,Extension,6,On Level,6,696,32,2021-07-24
2,582651,2022-01-28,16.0,17.0,94.12,Mathematics,2.0,2.0,100.0,Extension,3,On Level,3,615,32,2021-07-24
3,584570,2021-10-29,7.0,10.0,70.0,English Language Arts,1.0,1.0,100.0,Extension,4,On Level,4,120,32,2021-07-24
4,550130,2022-02-23,13.0,15.0,86.67,Mathematics,2.0,2.0,100.0,Extension,7,Accelerated,8,686,32,2021-07-24


In [31]:
# Columns identifying one student / school / school-year group.
illu_group_keys = ["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearId"]

# Cast every grouping key to str.
for key_column in illu_group_keys:
    illu[key_column] = illu[key_column].astype(str)

# Sort by group keys and then by response date, then collapse each group so
# that every remaining column holds the list of that group's values.
illu_agg = (
    illu.sort_values(by=[*illu_group_keys, "responsedatevalue"])
    .groupby(illu_group_keys)
    .agg(list)
)

illu_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,responsedatevalue,Response_points,Response_points_possible,Response_percent_correct,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,FirstDayOfSchoolYear
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
107055,693,34,"[2024-02-08, 2024-02-08, 2024-02-08, 2024-02-0...","[7.5, 7.5, 7.5, 7.5, 3.67, 3.67, 3.67, 8.0, 8....","[13.0, 13.0, 13.0, 13.0, 10.0, 10.0, 10.0, 16....","[57.69, 57.69, 57.69, 57.69, 36.7, 36.7, 36.7,...","[Georgias K-12 Mathematics Standards, Georgia...","[5.0, 2.0, 0.5, 1.0, 1.67, 2.0, 0.0, 0.0, 2.0,...","[8.0, 3.0, 3.0, 1.0, 4.0, 3.0, 3.0, 1.0, 4.0, ...","[62.5, 66.67, 16.67, 100.0, 41.75, 66.67, 0.0,...","[Reteach, Reteach, Reteach, Extension, Reteach...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[2023-07-19, 2023-07-19, 2023-07-19, 2023-07-1..."
1387704,693,32,"[2022-03-16, 2022-03-16, 2022-03-16, 2022-03-1...","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, ...","[13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13....","[30.77, 30.77, 30.77, 30.77, 30.77, 30.77, 30....","[Mathematics, Mathematics, Mathematics, Mathem...","[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, ...","[50.0, 0.0, 0.0, 0.0, 50.0, 100.0, 50.0, 0.0, ...","[Reteach, Reteach, Reteach, Reteach, Reteach, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[On Level, On Level, On Level, On Level, On Le...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[2021-07-24, 2021-07-24, 2021-07-24, 2021-07-2..."
1387704,693,33,"[2023-03-08, 2023-03-08, 2023-03-08, 2023-03-0...","[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 7.0, 7.0, ...","[14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 11....","[64.29, 64.29, 64.29, 64.29, 64.29, 64.29, 64....","[English Language Arts, English Language Arts,...","[1.0, 2.0, 3.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, ...","[1.0, 4.0, 4.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, ...","[100.0, 50.0, 75.0, 0.0, 100.0, 50.0, 100.0, 1...","[Extension, Reteach, Review & Practice, Reteac...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[On Level, On Level, On Level, On Level, On Le...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1389218,601,33,"[2022-09-19, 2022-09-19, 2022-09-19, 2022-09-1...","[21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21....","[35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35....","[60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60....","[Mathematics, Mathematics, Mathematics, Mathem...","[0.0, 2.0, 5.0, 5.0, 6.0, 0.0, 1.0, 0.0, 2.0, ...","[2.0, 3.0, 5.0, 11.0, 8.0, 1.0, 1.0, 2.0, 2.0,...","[0.0, 66.67, 100.0, 45.45, 75.0, 0.0, 100.0, 0...","[Reteach, Reteach, Extension, Reteach, Review ...","[K, K, K, K, K, K, K, K, K, K, K, K, K, K, K, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[K, K, K, K, K, K, K, K, K, K, K, K, K, K, K, ...","[2022-07-21, 2022-07-21, 2022-07-21, 2022-07-2..."
1389218,601,34,"[2023-09-22, 2023-09-22, 2023-09-22, 2023-09-2...","[13.0, 13.0, 13.0, 13.0, 13.0, 12.0, 12.0, 12....","[18.0, 18.0, 18.0, 18.0, 18.0, 15.0, 15.0, 15....","[72.22, 72.22, 72.22, 72.22, 72.22, 80.0, 80.0...","[Georgias K-12 Mathematics Standards, Georgia...","[6.0, 5.0, 1.0, 1.0, 1.0, 0.0, 5.0, 2.0, 0.0, ...","[6.0, 6.0, 2.0, 4.0, 2.0, 1.0, 6.0, 2.0, 1.0, ...","[100.0, 83.33, 50.0, 25.0, 50.0, 0.0, 83.33, 1...","[Extension, Review & Practice, Reteach, Reteac...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2023-07-19, 2023-07-19, 2023-07-19, 2023-07-1..."


In [32]:
# Pack all aggregated Illuminate columns into a single "illuminate" column.
illu_nested_columns = illu_agg.columns.tolist()
illu_agg_nest = nest(illu_agg, "illuminate", illu_nested_columns)
illu_agg_nest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,illuminate
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearId,Unnamed: 3_level_1
107055,693,34,NestedData(responsedatevalue=[datetime.date(20...
1387704,693,32,NestedData(responsedatevalue=[datetime.date(20...
1387704,693,33,NestedData(responsedatevalue=[datetime.date(20...
1389218,601,33,NestedData(responsedatevalue=[datetime.date(20...
1389218,601,34,NestedData(responsedatevalue=[datetime.date(20...


# MAP Subject RIT Score


In [33]:
# Only these columns of the MAP subject test extract are loaded.
map_rit_columns = [
    "mask_studentpersonkey",
    "SchoolDetailFCSId",
    "TestingDateId",
    "TestDurationMinutes",
    "TestRITScore",
    "TestPercentile",
    "PercentCorrect",
    "AchievementQuintile",
]
map_rit_scores = pd.read_csv(
    "data/internal/Map Subject Test Data.csv", usecols=map_rit_columns
)

map_rit_scores.head()

Unnamed: 0,TestingDateId,mask_studentpersonkey,SchoolDetailFCSId,TestDurationMinutes,TestRITScore,TestPercentile,AchievementQuintile,PercentCorrect
0,11756,372350,687,93,185,3,Low,45
1,11756,418318,687,59,179,1,Low,26
2,11756,779939,687,89,214,47,Avg,48
3,11756,371484,687,144,208,33,LoAvg,53
4,11763,332500,687,18,184,3,Low,30


In [34]:
# Cast the identifier columns to str and rename the date key to the name
# used for the join with `school_day`.
map_rit_scores = map_rit_scores.astype(
    {
        "mask_studentpersonkey": str,
        "SchoolDetailFCSId": str,
        "TestingDateId": str,
    }
).rename(columns={"TestingDateId": "TestDateId"})

# validate="m:1" makes pandas raise if TestDateId is not unique in `school_day`.
map_rit_scores = map_rit_scores.merge(
    school_day,
    on=["TestDateId"],
    how="left",
    validate="m:1",
)

map_rit_scores["SchoolYearNumberFall"] = (
    map_rit_scores["SchoolYearNumberFall"].astype(str)
)

map_rit_scores.head()

Unnamed: 0,TestDateId,mask_studentpersonkey,SchoolDetailFCSId,TestDurationMinutes,TestRITScore,TestPercentile,AchievementQuintile,PercentCorrect,DateValue,SchoolYearNumberFall,SchoolYearNumberSpring
0,11756,372350,687,93,185,3,Low,45,2022-10-13,2022,2023
1,11756,418318,687,59,179,1,Low,26,2022-10-13,2022,2023
2,11756,779939,687,89,214,47,Avg,48,2022-10-13,2022,2023
3,11756,371484,687,144,208,33,LoAvg,53,2022-10-13,2022,2023
4,11763,332500,687,18,184,3,Low,30,2022-10-20,2022,2023


In [None]:
# Columns identifying one student / school / fall-school-year group.
rit_group_keys = ["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearNumberFall"]

# Order rows inside each group by TestDateId, then collapse every remaining
# column into a per-group list.
# NOTE(review): TestDateId was cast to str in the previous cell, so this sort
# compares ids as text; that matches numeric order only while all ids have the
# same number of digits - confirm.
rit_agg = (
    map_rit_scores.sort_values([*rit_group_keys, "TestDateId"])
    .groupby(rit_group_keys)
    .agg(list)
)

rit_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TestDateId,TestDurationMinutes,TestRITScore,TestPercentile,AchievementQuintile,PercentCorrect,DateValue,SchoolYearNumberSpring
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearNumberFall,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
133481,870,2022,"[11713, 11713]","[25, 33]","[172, 172]","[1, 1]","[Low, Low]","[26, 33]","[2022-08-31, 2022-08-31]","[2023, 2023]"
1389182,741,2022,"[11700, 11700, 11848]","[65, 77, 34]","[241, 253, 232]","[88, 91, 57]","[High, High, Avg]","[50, 53, 49]","[2022-08-18, 2022-08-18, 2023-01-13]","[2023, 2023, 2023]"
1389182,741,2023,"[12068, 12069]","[56, 75]","[239, 247]","[84, 81]","[High, High]","[53, 49]","[2023-08-21, 2023-08-22]","[2024, 2024]"
1389182,741,2024,[12449],[70],[248],[79],[HiAvg],[51],[2024-09-05],[2025]
1389243,695,2022,"[11749, 11763]","[47, 22]","[225, 214]","[58, 47]","[Avg, Avg]","[51, 39]","[2022-10-06, 2022-10-20]","[2023, 2023]"


In [None]:
# Pack all aggregated MAP columns into a single "rit" column.
rit_nested_columns = rit_agg.columns.tolist()
rit_agg_nest = nest(rit_agg, "rit", rit_nested_columns)
rit_agg_nest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rit
mask_studentpersonkey,SchoolDetailFCSId,SchoolYearNumberFall,Unnamed: 3_level_1
133481,870,2022,"NestedData(TestDateId=['11713', '11713'], Test..."
1389182,741,2022,"NestedData(TestDateId=['11700', '11700', '1184..."
1389182,741,2023,"NestedData(TestDateId=['12068', '12069'], Test..."
1389182,741,2024,"NestedData(TestDateId=['12449'], TestDurationM..."
1389243,695,2022,"NestedData(TestDateId=['11749', '11763'], Test..."


# Merge


In [None]:
# Join keys shared by the nested per-student frames.
student_school_year_id_keys = ["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearId"]
student_school_year_fall_keys = ["mask_studentpersonkey", "SchoolDetailFCSId", "SchoolYearNumberFall"]

# Step 1: start from enrolment, attach school-year attributes, the current
# school's attributes and next year's school attributes (the latter get a
# "NextYear" suffix), then outer-join the gifted flags.
merged = (
    pd.merge(enrolment, school_year, on=["SchoolYearId"], how="left")
    .merge(school_dim, on=["SchoolDetailFCSId"], how="left")
    .merge(
        school_dim.rename(columns={"SchoolDetailFCSId": "SchoolDetailFCSIdNextYear"}),
        on=["SchoolDetailFCSIdNextYear"],
        how="left",
        suffixes=("", "NextYear"),
    )
    .merge(
        gifted,
        on=["mask_studentpersonkey", "SchoolYearNumberFall"],
        how="outer",
        validate="m:1",
    )
)

# Step 2: outer-join every nested frame, in the same order as before, so rows
# that exist only in one of these sources are kept.
nested_sources = [
    (grad_agg_nest, student_school_year_id_keys),
    (milestone_agg_nest, student_school_year_id_keys),
    (illu_agg_nest, student_school_year_id_keys),
    (sat_agg_nest, student_school_year_fall_keys),
    (rit_agg_nest, student_school_year_fall_keys),
]
for nested_frame, join_keys in nested_sources:
    merged = merged.merge(nested_frame, on=join_keys, how="outer")

# Step 3: drop the technical id columns.
merged = merged.drop(
    columns=["SchoolYearId", "SchoolDetailFCSId", "SchoolDetailFCSIdNextYear"]
)

merged.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
0,100075,,,,,,,,2021,,,,,,,,,,,,"NestedData(MathScore=[350], VerbalScore=[410],...",
1,104981,,,,,,,,2020,,,,,,,,,,,,"NestedData(MathScore=[520], VerbalScore=[500],...",
2,107055,6.0,N,Y,N,Transferred from another GA district,Year End,2023-07-19,2023,2024.0,Open,Woodland Middle,Middle School,Open,Woodland Middle,Middle School,,,NestedData(SubjectDesc=['English Language Arts...,NestedData(responsedatevalue=[datetime.date(20...,,
3,114271,12.0,N,Y,Y,Continuing in same school,High School Graduation,2021-07-24,2021,2022.0,Open,Johns Creek High,High School,,,,,,,,,
4,117427,12.0,N,Y,N,Continuing in same school,High School Graduation,2021-07-24,2021,2022.0,Open,Northview High,High School,,,,,,,,,


In [38]:
# Keep only rows whose school year started before today.
# NOTE(review): rows with a missing FirstDayOfSchoolYear (e.g. rows that came
# only from an outer join) presumably compare as False and are dropped here
# too - confirm this is intended.
current_date = datetime.now().date()
started_before_today = merged["FirstDayOfSchoolYear"] < current_date
merged = merged[started_before_today]

In [39]:
# Persist the final student profile dataset. Create the target directory
# first so a fresh checkout (without data/cleaned) does not fail at the very
# end of the pipeline.
output_path = "data/cleaned/student_profile_dataset.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
merged.to_pickle(output_path)