In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import subprocess


def get_git_root():
    """Return the top-level directory of the enclosing git repository.

    Runs ``git rev-parse --show-toplevel`` with stderr silenced.

    Returns:
        The repository root as a string with surrounding whitespace removed,
        or ``None`` when the command exits non-zero or the ``git`` executable
        cannot be found.
    """
    command = ["git", "rev-parse", "--show-toplevel"]
    try:
        raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not inside a repository, or git is not installed.
        return None
    # check_output returns bytes ending in a newline.
    return raw_output.strip().decode("utf-8")


# Run the notebook from the repository root so relative paths resolve the same
# way regardless of where the kernel was started.
git_root = get_git_root()
if not git_root:
    print("Not in a git repository or git not found")
else:
    os.chdir(git_root)
    print(f"Changed to git root: {os.getcwd()}")

Changed to git root: /Users/sherman/GitHub/CSE-6748


In [3]:
import pandas as pd

import src.features.illuminate as illuminate
import src.features.index as index
import src.features.milestone as milestone
import src.features.rit as rit
import src.features.student as student
import src.features.targets as targets
from src.feasy.sparkle import Sparkle
from src.utils import create_student_features, get_feature_functions_from_module

# Never truncate columns when rendering DataFrames in this notebook.
pd.set_option("display.max_columns", None)
# Opt in to pandas copy-on-write mode for the whole session.
pd.options.mode.copy_on_write = True

In [4]:
# Load the cleaned student-profile table and preview its first rows.
# The path is relative, so it depends on the working directory set by the
# earlier change-to-git-root cell.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
student_profile = pd.read_pickle("data/cleaned/student_profile_dataset.pkl")
student_profile.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,FirstDayOfSchoolYear,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
2,107055,6,N,Y,N,Transferred from another GA district,Year End,2023-07-19,2023,2024,Open,Woodland Middle,Middle School,Open,Woodland Middle,Middle School,,,NestedData(SubjectDesc=['English Language Arts...,NestedData(responsedatevalue=[datetime.date(20...,,
3,114271,12,N,Y,Y,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Johns Creek High,High School,,,,,,,,,
4,117427,12,N,Y,N,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Northview High,High School,,,,,,,,,
5,117773,12,N,Y,N,Continuing in same school,Court Order or Legal Requirement,2021-07-24,2021,2022,Open,Centennial High,High School,,,,,,,,,
6,117995,12,N,Y,N,Continuing in same school,High School Graduation,2021-07-24,2021,2022,Open,Roswell High,High School,,,,,,,,,


In [5]:
# fmt:off
def _format_count_with_share(count, total):
    """Format ``count`` as ``"1,234 (56.7%)"`` of ``total``; ``"0 (0.0%)"`` when ``total`` is 0."""
    if total > 0:
        return f"{count:,} ({count/total*100:.1f}%)"
    return "0 (0.0%)"


def _format_average(numerator, denominator):
    """Format ``numerator / denominator`` with two decimals; ``"N/A"`` when the denominator is 0."""
    if denominator > 0:
        return f"{numerator/denominator:.2f}"
    return "N/A"


def _enrollment_coverage(frame, record_total, student_total):
    """Return the four enrollment-coverage cells (records/students with and without a GradeLevel)."""
    has_grade = frame["GradeLevel"].notna()
    enrolled = frame[has_grade]
    missing = frame[~has_grade]
    return [
        _format_count_with_share(len(enrolled), record_total),
        _format_count_with_share(len(missing), record_total),
        _format_count_with_share(enrolled["mask_studentpersonkey"].nunique(), student_total),
        _format_count_with_share(missing["mask_studentpersonkey"].nunique(), student_total),
    ]


def _assessment_subsets(assessments):
    """Return ``(names, columns)`` for every non-empty subset of ``assessments``, smallest subsets first."""
    import itertools

    items = list(assessments.items())
    subsets = []
    for size in range(1, len(items) + 1):
        for subset in itertools.combinations(items, size):
            subsets.append(([name for name, _ in subset], [col for _, col in subset]))
    return subsets


def _students_with_all(frame, columns):
    """Count distinct students having a non-null value in every one of ``columns`` on the same record."""
    same_record = frame[columns].notna().all(axis=1)
    return frame.loc[same_record, "mask_studentpersonkey"].nunique()


def analyze_dataset(df):
    """Print and display a coverage report for the merged student dataset.

    The report has five sections: dataset overview, enrollment coverage,
    per-assessment coverage, SAT combinations (all students) and non-SAT
    combinations (current students only).

    "Current students" are students with at least one record where
    ``CurrentEnrollment == "Y"``; all of their records, including historical
    ones, are counted. Combination counts are record-level: a student is
    counted when a single record has every column of the combination non-null.

    Args:
        df: DataFrame with the columns ``mask_studentpersonkey``,
            ``CurrentEnrollment``, ``GradeLevel``, ``grad``, ``milestone``,
            ``illuminate``, ``sat`` and ``rit``.

    Returns:
        None. Results are printed and rendered with ``IPython.display.display``.
        An empty ``df`` yields zero counts, ``"0 (0.0%)"`` shares and ``"N/A"``
        averages instead of raising ``ZeroDivisionError``.
    """
    # Function-scope imports keep this cell runnable on its own.
    import pandas as pd
    from IPython.display import display

    with pd.option_context(
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', None,
        'display.expand_frame_repr', False
    ):

        print("COMPREHENSIVE STUDENT DATASET ANALYSIS")
        print("=" * 70)

        # Basic metrics
        total_records = len(df)
        unique_students = df["mask_studentpersonkey"].nunique()

        # Students with at least one current-enrollment record, plus ALL of
        # their records (historical ones included).
        current_student_ids = df[df["CurrentEnrollment"] == "Y"]["mask_studentpersonkey"].unique()
        current_students_df = df[df["mask_studentpersonkey"].isin(current_student_ids)]
        total_current_records = len(current_students_df)
        unique_current_students = len(current_student_ids)

        # Dataset Overview
        overview_data = {
            'Metric': [
                'Total Records',
                'Unique Students',
                'Average Records per Student',
                'Current Student Records',
                'Unique Current Students',
                'Average Records per Current Student'
            ],
            'Value': [
                f"{total_records:,}",
                f"{unique_students:,}",
                _format_average(total_records, unique_students),
                f"{total_current_records:,}",
                f"{unique_current_students:,}",
                _format_average(total_current_records, unique_current_students)
            ]
        }

        print("\nDATASET OVERVIEW:")
        display(pd.DataFrame(overview_data))

        # Enrollment Coverage (a record counts as enrolled when GradeLevel is set)
        enrollment_data = {
            'Metric': [
                'Records with Enrollment',
                'Records missing Enrollment',
                'Students with Enrollment',
                'Students missing Enrollment'
            ],
            'All Students': _enrollment_coverage(df, total_records, unique_students),
            'Current Students': _enrollment_coverage(
                current_students_df, total_current_records, unique_current_students
            )
        }

        print("\nENROLLMENT DATA COVERAGE:")
        display(pd.DataFrame(enrollment_data))

        # Assessment Coverage
        assessments = {
            "Graduation Data": "grad",
            "GA Milestones": "milestone",
            "Illuminate": "illuminate",
            "SAT Scores": "sat",
            "RIT Scores": "rit",
        }

        assessment_data = []
        for name, col in assessments.items():
            records_with_data = df[df[col].notna()]
            current_records_with_data = current_students_df[current_students_df[col].notna()]

            assessment_data.append({
                'Assessment': name,
                'All Students - Records': _format_count_with_share(
                    len(records_with_data), total_records
                ),
                'All Students - Students': _format_count_with_share(
                    records_with_data["mask_studentpersonkey"].nunique(), unique_students
                ),
                'Current Students - Records': _format_count_with_share(
                    len(current_records_with_data), total_current_records
                ),
                'Current Students - Students': _format_count_with_share(
                    current_records_with_data["mask_studentpersonkey"].nunique(),
                    unique_current_students
                )
            })

        print("\nASSESSMENT DATA COVERAGE:")
        display(pd.DataFrame(assessment_data))

        # Every non-empty subset of the non-SAT assessments, ordered
        # singles -> pairs -> triplets -> all. Both combination tables share it.
        non_sat_assessments = {k: v for k, v in assessments.items() if v != "sat"}
        non_sat_subsets = _assessment_subsets(non_sat_assessments)

        # SAT Combination Analysis (All Students Only)
        sat_combo_data = []
        for names, cols in non_sat_subsets:
            students_with_combo = _students_with_all(df, ["sat"] + cols)
            sat_combo_data.append({
                'Combination': f"SAT + {' + '.join(names)}",
                'All Students': _format_count_with_share(students_with_combo, unique_students)
            })

        print("\nSAT COMBINATION ANALYSIS:")
        display(pd.DataFrame(sat_combo_data))

        # NON-SAT Combination Analysis (Current Students Only)
        print("\nNON-SAT COMBINATION ANALYSIS (CURRENT STUDENTS):")
        if unique_current_students > 0:
            current_combo_data = []
            for names, cols in non_sat_subsets:
                current_students_with_combo = _students_with_all(current_students_df, cols)
                current_combo_data.append({
                    'Combination': f"{' + '.join(names)}",
                    'Current Students': _format_count_with_share(
                        current_students_with_combo, unique_current_students
                    )
                })

            display(pd.DataFrame(current_combo_data))
        else:
            print("No current students found in dataset.")

# fmt: on

# Print the coverage report for the full student-profile table loaded above.
analyze_dataset(student_profile)

COMPREHENSIVE STUDENT DATASET ANALYSIS

DATASET OVERVIEW:


Unnamed: 0,Metric,Value
0,Total Records,385962.0
1,Unique Students,150306.0
2,Average Records per Student,2.57
3,Current Student Records,272009.0
4,Unique Current Students,87376.0
5,Average Records per Current Student,3.11



ENROLLMENT DATA COVERAGE:


Unnamed: 0,Metric,All Students,Current Students
0,Records with Enrollment,"385,962 (100.0%)","272,009 (100.0%)"
1,Records missing Enrollment,0 (0.0%),0 (0.0%)
2,Students with Enrollment,"150,306 (100.0%)","87,376 (100.0%)"
3,Students missing Enrollment,0 (0.0%),0 (0.0%)



ASSESSMENT DATA COVERAGE:


Unnamed: 0,Assessment,All Students - Records,All Students - Students,Current Students - Records,Current Students - Students
0,Graduation Data,"24,733 (6.4%)","24,733 (16.5%)","24,729 (9.1%)","24,729 (28.3%)"
1,GA Milestones,"168,969 (43.8%)","86,697 (57.7%)","122,564 (45.1%)","53,726 (61.5%)"
2,Illuminate,"267,037 (69.2%)","120,259 (80.0%)","209,360 (77.0%)","80,425 (92.0%)"
3,SAT Scores,"24,125 (6.3%)","19,424 (12.9%)","8,823 (3.2%)","6,950 (8.0%)"
4,RIT Scores,"114,814 (29.7%)","81,252 (54.1%)","82,783 (30.4%)","56,324 (64.5%)"



SAT COMBINATION ANALYSIS:


Unnamed: 0,Combination,All Students
0,SAT + Graduation Data,73 (0.0%)
1,SAT + GA Milestones,"6,892 (4.6%)"
2,SAT + Illuminate,"5,395 (3.6%)"
3,SAT + RIT Scores,"11,701 (7.8%)"
4,SAT + Graduation Data + GA Milestones,62 (0.0%)
5,SAT + Graduation Data + Illuminate,16 (0.0%)
6,SAT + Graduation Data + RIT Scores,51 (0.0%)
7,SAT + GA Milestones + Illuminate,"3,904 (2.6%)"
8,SAT + GA Milestones + RIT Scores,"4,462 (3.0%)"
9,SAT + Illuminate + RIT Scores,"3,240 (2.2%)"



NON-SAT COMBINATION ANALYSIS (CURRENT STUDENTS):


Unnamed: 0,Combination,Current Students
0,Graduation Data,"24,729 (28.3%)"
1,GA Milestones,"53,726 (61.5%)"
2,Illuminate,"80,425 (92.0%)"
3,RIT Scores,"56,324 (64.5%)"
4,Graduation Data + GA Milestones,"16,901 (19.3%)"
5,Graduation Data + Illuminate,"19,327 (22.1%)"
6,Graduation Data + RIT Scores,"15,108 (17.3%)"
7,GA Milestones + Illuminate,"51,346 (58.8%)"
8,GA Milestones + RIT Scores,"37,504 (42.9%)"
9,Illuminate + RIT Scores,"52,001 (59.5%)"


In [6]:
def create_sat_filtered_training_data(student_profile):
    """
    Build the SAT training subset of ``student_profile``.

    Two filters are applied:

    1. Keep only students with at least one non-null ``sat`` record, and for
       each of them keep only records whose ``SchoolYearNumberFall`` is at or
       before the year of their earliest SAT record.
    2. Drop every student whose latest retained year has no ``illuminate``,
       ``milestone`` or ``rit`` data on any record of that year.

    Examples (filter 1):
    Student A: Grade 10 (SAT), Grade 11 (No SAT), Grade 12 (No SAT) -> Keep only Grade 10
    Student B: Grade 10 (No SAT), Grade 11 (SAT), Grade 12 (No SAT) -> Keep Grade 10-11

    Progress counts are printed, and the first-SAT-year distribution and the
    latest-year grade-level distribution are displayed.

    Args:
        student_profile: DataFrame with at least the columns
            ``mask_studentpersonkey``, ``SchoolYearNumberFall``, ``GradeLevel``,
            ``sat``, ``illuminate``, ``milestone`` and ``rit``.

    Returns:
        DataFrame with the same columns as ``student_profile`` holding the
        retained records. When no record has an SAT score, an empty frame
        that keeps the input's columns is returned.
    """
    # Imported explicitly (as analyze_dataset does) rather than relying on the
    # `display` name that IPython injects into notebook sessions.
    from IPython.display import display

    student_key = "mask_studentpersonkey"
    year_col = "SchoolYearNumberFall"
    assessment_cols = ["illuminate", "milestone", "rit"]

    print("Creating SAT-filtered training dataset...")

    # Records that carry an SAT score
    sat_records = student_profile[student_profile["sat"].notna()]

    if sat_records.empty:
        print("No students with SAT scores found.")
        # Keep the input schema so downstream column lookups still work.
        return student_profile.iloc[0:0].copy()

    print(
        f"Found {sat_records[student_key].nunique()} students with SAT scores"
    )

    # Earliest SAT year per student
    sat_first_year = (
        sat_records.groupby(student_key)[year_col]
        .min()
        .reset_index()
        .rename(columns={year_col: "first_sat_year"})
    )

    print("SAT year distribution:")
    display(sat_first_year[["first_sat_year"]].value_counts().sort_index())

    # Attach the cutoff year to every record of the SAT students; the inner
    # join drops students who never took the SAT.
    sat_student_records = pd.merge(
        student_profile,
        sat_first_year,
        on=student_key,
        how="inner",
    )

    print(f"Total records for SAT students: {len(sat_student_records)}")

    # Keep records up to and including the first SAT year
    filtered_data = sat_student_records[
        sat_student_records[year_col] <= sat_student_records["first_sat_year"]
    ].copy()

    print(f"Records after filtering to SAT cutoff: {len(filtered_data)}")

    # Flag each student's latest retained year without a merge round-trip:
    # transform broadcasts the per-student maximum back onto every row.
    latest_year = filtered_data.groupby(student_key)[year_col].transform("max")
    is_latest_year = filtered_data[year_col] == latest_year

    # A student qualifies when any latest-year record has assessment data.
    has_assessment = filtered_data[assessment_cols].notna().any(axis=1)
    students_with_latest_assessments = filtered_data.loc[
        is_latest_year & has_assessment, student_key
    ].unique()

    is_retained = filtered_data[student_key].isin(students_with_latest_assessments)
    final_data = filtered_data[is_retained].copy()

    print(f"Final records after assessment filter: {len(final_data)}")
    print(f"Final unique students: {final_data[student_key].nunique()}")

    # Grade level distribution over each retained student's latest year
    print("\nGrade level distribution (latest year per student):")
    latest_records = filtered_data[is_retained & is_latest_year]
    grade_dist = latest_records.groupby("GradeLevel")[[student_key]].nunique()

    display(grade_dist)

    return final_data.drop(columns=["first_sat_year"])


# Build the training subset: SAT takers only, with each student's records
# truncated at their first SAT year.
data = create_sat_filtered_training_data(student_profile)

Creating SAT-filtered training dataset...
Found 19424 students with SAT scores
SAT year distribution:


first_sat_year
2021              5894
2022              4434
2023              4766
2024              4330
Name: count, dtype: int64

Total records for SAT students: 52134
Records after filtering to SAT cutoff: 44128
Final records after assessment filter: 32497
Final unique students: 13557

Grade level distribution (latest year per student):


Unnamed: 0_level_0,mask_studentpersonkey
GradeLevel,Unnamed: 1_level_1
10,822
11,6977
12,5675
7,2
8,3
9,78


In [7]:
# Collect the feature functions exposed by each project feature module.
# (presumably each call returns a list of callables — confirm against src.utils)
index_feat_funcs = get_feature_functions_from_module(index)
student_feat_funcs = get_feature_functions_from_module(student)
illuminate_feat_funcs = get_feature_functions_from_module(illuminate)
milestone_feat_funcs = get_feature_functions_from_module(milestone)
rit_feat_funcs = get_feature_functions_from_module(rit)
targets_feat_funcs = get_feature_functions_from_module(targets)

# Concatenate them in a fixed order: index, student, illuminate, milestone, rit, targets.
features = (
    index_feat_funcs
    + student_feat_funcs
    + illuminate_feat_funcs
    + milestone_feat_funcs
    + rit_feat_funcs
    + targets_feat_funcs
)

In [8]:
# Run the collected feature functions over the SAT-filtered records and
# materialise the result as a pandas DataFrame.
# NOTE(review): `features` is rebound here from the feature-function collection
# to the resulting DataFrame, so re-running this cell alone would pass a
# DataFrame to Sparkle — consider a distinct name for the output frame.
sparkle = Sparkle(features).source(data, from_dataframe=True)
features = sparkle.to_pandas()
features.head()

Unnamed: 0,SchoolYearFall,mask_studentpersonkey,RetainedFlag,GradeLevel,is_gifted,grade_mode_numeric,improvement_first_to_last,is_on_sat_grade_level,last_percent_correct,low_mastery_rate_below_50,mastery_rate_above_80,math_improvement_first_to_last,math_low_mastery_rate_below_50,math_mastery_rate_above_80,math_mean_item_pct_correct,math_percent_extension,math_percent_reteach,math_percent_review_practice,math_slope_percent_correct_over_time,math_std_item_pct_correct,math_total_points_earned,math_total_points_possible,math_weighted_percent_correct,mean_item_percent_correct,mean_standard_percent_correct,mean_std_pct_ela,mean_std_pct_math,num_assessments,num_items,percent_extension,percent_reteach,percent_review_practice,slope_percent_correct_over_time,std_item_percent_correct,total_points_earned,total_points_possible,verbal_improvement_first_to_last,verbal_low_mastery_rate_below_50,verbal_mastery_rate_above_80,verbal_mean_item_pct_correct,verbal_percent_extension,verbal_percent_reteach,verbal_percent_review_practice,verbal_slope_percent_correct_over_time,verbal_std_item_pct_correct,verbal_total_points_earned,verbal_total_points_possible,verbal_weighted_percent_correct,weighted_overall_percent_correct,achievement_level_mean_ela,achievement_level_mean_math,lexile_mean_ela,lexile_mean_math,mean_achievement_level_all,mean_lexile_score_all,mean_scale_score_all,num_subjects_tested,num_test_days,num_unique_test_dates,pct_proficient_all,pct_proficient_ela,pct_proficient_math,scale_score_improvement_all,scale_score_improvement_ela,scale_score_improvement_math,scale_score_mean_ela,scale_score_mean_math,scale_score_slope_all,scale_score_slope_ela,scale_score_slope_math,scale_score_std_ela,scale_score_std_math,std_scale_score_all,count_quintile_avg,count_quintile_hiavg,count_quintile_high,count_quintile_low,count_quintile_lowavg,max_rit_score,mean_percent_correct,mean_percentile,mean_rit_score,mean_test_duration,min_rit_score,num_tests,pct_quintile_high,std_percent_correct,std_per
centile,std_rit_score,std_test_duration,sat_math_score,sat_verbal_score
0,2022,1389182,N,9,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,623.0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,2,0,0,253.0,50.666667,78.666667,242.0,58.666667,232.0,3,0.666667,1.699673,15.369523,8.602325,18.116904,,
1,2023,1389182,N,10,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,247.0,51.0,82.5,243.0,65.5,239.0,2,1.0,2.0,1.5,4.0,9.5,500.0,480.0
2,2022,1390795,N,9,Y,9,11.76,0,100.0,0.235955,0.730337,11.76,0.313433,0.656716,71.757463,0.656716,0.343284,0.0,0.140152,22.851185,912.0,1295.0,0.704247,78.176966,79.138652,98.485,72.786119,89,89,0.730337,0.269663,0.0,0.229855,22.821853,1256.0,1647.0,0.0,0.0,0.954545,97.727273,0.954545,0.045455,0.0,0.0,3.006536,344.0,352.0,0.977273,0.762599,0.0,3.0,0.0,0.0,3.0,0.0,577.5,2,1,1,1.0,0.0,1.0,3.0,0.0,0.0,0.0,576.0,3.0,0.0,0.0,0.0,0.0,1.5,0,2,1,0,0,253.0,53.333333,82.666667,243.333333,82.0,234.0,3,0.333333,4.642796,4.496913,7.760298,8.640988,,
3,2023,1390795,N,10,Y,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0,0,0,244.0,50.5,74.5,238.0,89.0,232.0,2,0.0,2.5,2.5,6.0,3.0,,
4,2024,1390795,N,11,N,11,8.33,1,75.0,0.44,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.7464,66.6668,66.6668,0.0,25,25,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,8.33,0.44,0.48,71.7464,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,0.718905,0.718905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,560.0,550.0


In [9]:
# Column names of the feature table used by the cells below.
STUDENT = "mask_studentpersonkey"  # student identifier
YEAR = "SchoolYearFall"  # school-year column of the feature table
TARGET_MATH = "sat_math_score"  # prediction target: SAT math
TARGET_VERBAL = "sat_verbal_score"  # prediction target: SAT verbal

In [10]:
# Build the model inputs with the project helper; the SAT columns are passed
# as targets. (Exact layout of X_index / X is defined in src.utils, not shown here.)
X_index, X = create_student_features(
    features,
    student_col=STUDENT,
    year_col=YEAR,
    target_cols=[TARGET_MATH, TARGET_VERBAL],
)

# One target row per student: after sorting by student and year, take the last
# non-null SAT math / verbal value in each student's group (GroupBy.last skips NaN
# per column, so the two scores are not guaranteed to come from the same row).
# NOTE(review): reset_index(drop=True) discards the student key, so Y lines up
# with X only if create_student_features emits students in the same sorted
# order — confirm against src.utils.
Y = (
    features.sort_values([STUDENT, YEAR], ascending=True)
    .groupby(STUDENT)[[TARGET_MATH, TARGET_VERBAL]]
    .last()
    .reset_index(drop=True)
)

In [11]:
# Preview the first rows and the shape of the index, feature and target frames
# as a quick check that their row counts agree.
display(X_index.head())
display(X_index.shape)
display(X.head())
display(X.shape)
display(Y.head())
display(Y.shape)

Unnamed: 0,mask_studentpersonkey
0,1389182
1,1390795
2,1392398
3,1395568
4,1395578


(13557, 1)

Unnamed: 0,RetainedFlag_latest,GradeLevel_latest,is_gifted_latest,grade_mode_numeric_latest,improvement_first_to_last_latest,is_on_sat_grade_level_latest,last_percent_correct_latest,low_mastery_rate_below_50_latest,mastery_rate_above_80_latest,math_improvement_first_to_last_latest,math_low_mastery_rate_below_50_latest,math_mastery_rate_above_80_latest,math_mean_item_pct_correct_latest,math_percent_extension_latest,math_percent_reteach_latest,math_percent_review_practice_latest,math_slope_percent_correct_over_time_latest,math_std_item_pct_correct_latest,math_total_points_earned_latest,math_total_points_possible_latest,math_weighted_percent_correct_latest,mean_item_percent_correct_latest,mean_standard_percent_correct_latest,mean_std_pct_ela_latest,mean_std_pct_math_latest,num_assessments_latest,num_items_latest,percent_extension_latest,percent_reteach_latest,percent_review_practice_latest,slope_percent_correct_over_time_latest,std_item_percent_correct_latest,total_points_earned_latest,total_points_possible_latest,verbal_improvement_first_to_last_latest,verbal_low_mastery_rate_below_50_latest,verbal_mastery_rate_above_80_latest,verbal_mean_item_pct_correct_latest,verbal_percent_extension_latest,verbal_percent_reteach_latest,verbal_percent_review_practice_latest,verbal_slope_percent_correct_over_time_latest,verbal_std_item_pct_correct_latest,verbal_total_points_earned_latest,verbal_total_points_possible_latest,verbal_weighted_percent_correct_latest,weighted_overall_percent_correct_latest,achievement_level_mean_ela_latest,achievement_level_mean_math_latest,lexile_mean_ela_latest,lexile_mean_math_latest,mean_achievement_level_all_latest,mean_lexile_score_all_latest,mean_scale_score_all_latest,num_subjects_tested_latest,num_test_days_latest,num_unique_test_dates_latest,pct_proficient_all_latest,pct_proficient_ela_latest,pct_proficient_math_latest,scale_score_improvement_all_latest,scale_score_improvement_ela_latest,scale_score_improvement_math_latest,scale_score_mean_ela_l
atest,scale_score_mean_math_latest,scale_score_slope_all_latest,scale_score_slope_ela_latest,scale_score_slope_math_latest,scale_score_std_ela_latest,scale_score_std_math_latest,std_scale_score_all_latest,count_quintile_avg_latest,count_quintile_hiavg_latest,count_quintile_high_latest,count_quintile_low_latest,count_quintile_lowavg_latest,max_rit_score_latest,mean_percent_correct_latest,mean_percentile_latest,mean_rit_score_latest,mean_test_duration_latest,min_rit_score_latest,num_tests_latest,pct_quintile_high_latest,std_percent_correct_latest,std_percentile_latest,std_rit_score_latest,std_test_duration_latest,grade_mode_numeric_delta,improvement_first_to_last_delta,is_on_sat_grade_level_delta,last_percent_correct_delta,low_mastery_rate_below_50_delta,mastery_rate_above_80_delta,math_improvement_first_to_last_delta,math_low_mastery_rate_below_50_delta,math_mastery_rate_above_80_delta,math_mean_item_pct_correct_delta,math_percent_extension_delta,math_percent_reteach_delta,math_percent_review_practice_delta,math_slope_percent_correct_over_time_delta,math_std_item_pct_correct_delta,math_total_points_earned_delta,math_total_points_possible_delta,math_weighted_percent_correct_delta,mean_item_percent_correct_delta,mean_standard_percent_correct_delta,mean_std_pct_ela_delta,mean_std_pct_math_delta,num_assessments_delta,num_items_delta,percent_extension_delta,percent_reteach_delta,percent_review_practice_delta,slope_percent_correct_over_time_delta,std_item_percent_correct_delta,total_points_earned_delta,total_points_possible_delta,verbal_improvement_first_to_last_delta,verbal_low_mastery_rate_below_50_delta,verbal_mastery_rate_above_80_delta,verbal_mean_item_pct_correct_delta,verbal_percent_extension_delta,verbal_percent_reteach_delta,verbal_percent_review_practice_delta,verbal_slope_percent_correct_over_time_delta,verbal_std_item_pct_correct_delta,verbal_total_points_earned_delta,verbal_total_points_possible_delta,verbal_weighted_percent_correct_delta,weighted_overall_perc
ent_correct_delta,achievement_level_mean_ela_delta,achievement_level_mean_math_delta,lexile_mean_ela_delta,lexile_mean_math_delta,mean_achievement_level_all_delta,mean_lexile_score_all_delta,mean_scale_score_all_delta,num_subjects_tested_delta,num_test_days_delta,num_unique_test_dates_delta,pct_proficient_all_delta,pct_proficient_ela_delta,pct_proficient_math_delta,scale_score_improvement_all_delta,scale_score_improvement_ela_delta,scale_score_improvement_math_delta,scale_score_mean_ela_delta,scale_score_mean_math_delta,scale_score_slope_all_delta,scale_score_slope_ela_delta,scale_score_slope_math_delta,scale_score_std_ela_delta,scale_score_std_math_delta,std_scale_score_all_delta,count_quintile_avg_delta,count_quintile_hiavg_delta,count_quintile_high_delta,count_quintile_low_delta,count_quintile_lowavg_delta,max_rit_score_delta,mean_percent_correct_delta,mean_percentile_delta,mean_rit_score_delta,mean_test_duration_delta,min_rit_score_delta,num_tests_delta,pct_quintile_high_delta,std_percent_correct_delta,std_percentile_delta,std_rit_score_delta,std_test_duration_delta,num_snapshots,grade_mode_numeric_slope,improvement_first_to_last_slope,is_on_sat_grade_level_slope,last_percent_correct_slope,low_mastery_rate_below_50_slope,mastery_rate_above_80_slope,math_improvement_first_to_last_slope,math_low_mastery_rate_below_50_slope,math_mastery_rate_above_80_slope,math_mean_item_pct_correct_slope,math_percent_extension_slope,math_percent_reteach_slope,math_percent_review_practice_slope,math_slope_percent_correct_over_time_slope,math_std_item_pct_correct_slope,math_total_points_earned_slope,math_total_points_possible_slope,math_weighted_percent_correct_slope,mean_item_percent_correct_slope,mean_standard_percent_correct_slope,mean_std_pct_ela_slope,mean_std_pct_math_slope,num_assessments_slope,num_items_slope,percent_extension_slope,percent_reteach_slope,percent_review_practice_slope,slope_percent_correct_over_time_slope,std_item_percent_correct_slope,total_points_earned_slo
pe,total_points_possible_slope,verbal_improvement_first_to_last_slope,verbal_low_mastery_rate_below_50_slope,verbal_mastery_rate_above_80_slope,verbal_mean_item_pct_correct_slope,verbal_percent_extension_slope,verbal_percent_reteach_slope,verbal_percent_review_practice_slope,verbal_slope_percent_correct_over_time_slope,verbal_std_item_pct_correct_slope,verbal_total_points_earned_slope,verbal_total_points_possible_slope,verbal_weighted_percent_correct_slope,weighted_overall_percent_correct_slope,achievement_level_mean_ela_slope,achievement_level_mean_math_slope,lexile_mean_ela_slope,lexile_mean_math_slope,mean_achievement_level_all_slope,mean_lexile_score_all_slope,mean_scale_score_all_slope,num_subjects_tested_slope,num_test_days_slope,num_unique_test_dates_slope,pct_proficient_all_slope,pct_proficient_ela_slope,pct_proficient_math_slope,scale_score_improvement_all_slope,scale_score_improvement_ela_slope,scale_score_improvement_math_slope,scale_score_mean_ela_slope,scale_score_mean_math_slope,scale_score_slope_all_slope,scale_score_slope_ela_slope,scale_score_slope_math_slope,scale_score_std_ela_slope,scale_score_std_math_slope,std_scale_score_all_slope,count_quintile_avg_slope,count_quintile_hiavg_slope,count_quintile_high_slope,count_quintile_low_slope,count_quintile_lowavg_slope,max_rit_score_slope,mean_percent_correct_slope,mean_percentile_slope,mean_rit_score_slope,mean_test_duration_slope,min_rit_score_slope,num_tests_slope,pct_quintile_high_slope,std_percent_correct_slope,std_percentile_slope,std_rit_score_slope,std_test_duration_slope
0,N,10,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,247.0,51.0,82.5,243.0,65.5,239.0,2,1.0,2.0,1.5,4.0,9.5,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,0.0,-623.0,-1,-1,-1,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,0,0,0,0,-6.0,0.333333,3.833333,1.0,6.833333,7.0,-1,0.333333,0.300327,-13.869523,-4.602325,-8.616904,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,0.0,-623.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-6.0,0.333333,3.833333,1.0,6.833333,7.0,-1.0,0.333333,0.300327,-13.869523,-4.602325,-8.616904
1,N,11,N,11,8.33,1,75.0,0.44,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.7464,66.6668,66.6668,0.0,25,25,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,8.33,0.44,0.48,71.7464,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,0.718905,0.718905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,2,-3.43,1,-25.0,0.204045,-0.250337,-11.76,-0.313433,-0.656716,-71.757463,-0.656716,-0.343284,0.0,-0.140152,-22.851185,-912.0,-1295.0,-0.704247,-6.430566,-12.471852,-31.8182,-72.786119,-64,-64,-0.250337,0.210337,0.04,-0.026693,-19.288739,-967.0,-1245.0,8.33,0.44,-0.474545,-25.980873,-0.474545,0.434545,0.04,0.203162,0.526578,-55.0,50.0,-0.258367,-0.043693,0.0,-3.0,0.0,0.0,-3.0,0.0,-577.5,-2,-1,-1,-1.0,0.0,-1.0,-3.0,0.0,0.0,0.0,-576.0,-3.0,0.0,0.0,0.0,0.0,-1.5,0,-2,-1,0,0,-253.0,-53.333333,-82.666667,-243.333333,-82.0,-234.0,-3,-0.333333,-4.642796,-4.496913,-7.760298,-8.640988,3.0,1.0,-1.715,0.5,-12.5,0.102022,-0.125169,-5.88,-0.156716,-0.328358,-35.878731,-0.328358,-0.171642,0.0,-0.070076,-11.425593,-456.0,-647.5,-0.352124,-3.215283,-6.235926,-15.9091,-36.39306,-32.0,-32.0,-0.125169,0.105169,0.02,-0.013347,-9.644369,-483.5,-622.5,4.165,0.22,-0.237273,-12.990436,-0.237273,0.217273,0.02,0.101581,0.263289,-27.5,25.0,-0.129184,-0.021847,0.0,-1.5,0.0,0.0,-1.5,0.0,-288.75,-1.0,-0.5,-0.5,-0.5,0.0,-0.5,-1.5,0.0,0.0,0.0,-288.0,-1.5,0.0,0.0,0.0,0.0,-0.75,0.0,-1.0,-0.5,0.0,0.0,-126.5,-26.666667,-41.333333,-121.666667,-41.0,-117.0,-1.5,-0.166667,-2.321398,-2.248456,-3.880149,-4.320494
2,N,11,Y,11,-5.0,1,75.0,0.25,0.65625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.653125,80.989688,80.989688,0.0,32,32,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,-5.0,0.25,0.65625,77.653125,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,0.77821,0.77821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,11,-5.0,1,75.0,0.25,0.65625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.653125,80.989688,80.989688,0.0,32,32,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,-5.0,0.25,0.65625,77.653125,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,0.77821,0.77821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-2,0,0,-252.0,-56.0,-90.5,-248.0,-68.5,-244.0,-2,-1.0,-3.0,-0.5,-4.0,-7.5,3.0,5.5,-2.5,0.5,37.5,0.125,0.328125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.826563,40.494844,40.494844,0.0,16.0,16.0,0.3125,0.15625,0.03125,-0.248983,4.694974,200.0,257.0,-2.5,0.125,0.328125,38.826563,0.3125,0.15625,0.03125,-0.248983,4.694974,200.0,257.0,0.389105,0.389105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-126.0,-28.0,-45.25,-124.0,-34.25,-122.0,-1.0,-0.5,-1.5,-0.25,-2.0,-3.75
3,N,12,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,256.0,56.0,87.0,251.5,77.0,247.0,2,1.0,7.0,1.0,4.5,3.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Y,12,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,238.0,53.0,61.0,234.0,42.0,230.0,2,0.0,3.0,2.0,4.0,4.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(13557, 259)

Unnamed: 0,sat_math_score,sat_verbal_score
0,500.0,480.0
1,560.0,550.0
2,610.0,680.0
3,560.0,560.0
4,510.0,450.0


(13557, 2)

In [12]:
# Persist the three frames built earlier in the notebook as Parquet files
# under data/cleaned/, written in the order index -> X -> Y.
# NOTE(review): X and Y are presumably the feature matrix and the SAT target
# frame whose shapes are displayed above — confirm against the cell that
# builds them.
for frame, destination in (
    (X_index, "data/cleaned/X_index.parquet"),
    (X, "data/cleaned/X.parquet"),
    (Y, "data/cleaned/Y.parquet"),
):
    frame.to_parquet(destination)