In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import subprocess


def get_git_root():
    """Return the top-level directory of the current git work tree, or ``None``.

    Runs ``git rev-parse --show-toplevel`` in the current working directory
    with stderr discarded.

    Returns:
        The path printed by git, stripped of surrounding whitespace and decoded
        as UTF-8; ``None`` when git exits with a non-zero status or the ``git``
        executable cannot be found.
    """
    command = ["git", "rev-parse", "--show-toplevel"]
    try:
        raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
    # Strip the raw bytes first (drops git's trailing newline), then decode.
    return raw_output.strip().decode("utf-8")


# Make the repository root the working directory when one can be located;
# otherwise stay where we are and report it.
git_root = get_git_root()
if not git_root:
    print("Not in a git repository or git not found")
else:
    os.chdir(git_root)
    print(f"Changed to git root: {os.getcwd()}")

Changed to git root: /Users/sherman/GitHub/CSE-6748


In [3]:
import pandas as pd

import src.features.illuminate as illuminate
import src.features.index as index
import src.features.rit as rit
import src.features.student as student
import src.features.targets as targets
from src.feasy.sparkle import Sparkle
from src.utils import (
    NestedData,
    create_student_features,
    get_feature_functions_from_module,
    unnest,
)

# Render every column of wide frames and opt in to pandas copy-on-write mode.
pd.options.display.max_columns = None
pd.set_option("mode.copy_on_write", True)

In [None]:
# Load the cleaned student profile table (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# produced by this project's own pipeline.
student_profile = pd.read_pickle("data/cleaned/student_profile_dataset.pkl")
student_profile.head()

Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
0,100075,,,,,,,2021,,,,,,,,,,,,"NestedData(MathScore=[350], VerbalScore=[410],...",
1,104981,,,,,,,2020,,,,,,,,,,,,"NestedData(MathScore=[520], VerbalScore=[500],...",
2,107055,6.0,N,Y,N,Transferred from another GA district,Year End,2023,2024.0,Open,Woodland Middle,Middle School,Open,Woodland Middle,Middle School,,,NestedData(SubjectDesc=['English Language Arts...,NestedData(responsedatevalue=[datetime.date(20...,,
3,114271,12.0,N,Y,Y,Continuing in same school,High School Graduation,2021,2022.0,Open,Johns Creek High,High School,,,,,,,,,
4,117427,12.0,N,Y,N,Continuing in same school,High School Graduation,2021,2022.0,Open,Northview High,High School,,,,,,,,,


In [5]:
# fmt:off
def _format_share(count, total):
    """Render ``count`` with its share of ``total``, e.g. ``"1,234 (56.7%)"``.

    A non-positive ``total`` yields ``"0 (0.0%)"`` instead of dividing by zero.
    """
    if total > 0:
        return f"{count:,} ({count / total * 100:.1f}%)"
    return "0 (0.0%)"


def _format_mean(total, groups):
    """Render ``total / groups`` with two decimals, or ``"N/A"`` when there are no groups."""
    if groups > 0:
        return f"{total / groups:.2f}"
    return "N/A"


def _name_column_subsets(assessments):
    """Yield ``(names, columns)`` for every non-empty subset of ``assessments``.

    Subsets come smallest first (singles, pairs, ...), each following the
    mapping's insertion order, and every subset is produced exactly once.
    """
    import itertools

    entries = list(assessments.items())
    for size in range(1, len(entries) + 1):
        for subset in itertools.combinations(entries, size):
            yield [name for name, _ in subset], [col for _, col in subset]


def _students_with_all(frame, columns, key):
    """Count distinct ``key`` values over rows where all of ``columns`` are non-null."""
    mask = frame[columns[0]].notna()
    for col in columns[1:]:
        mask = mask & frame[col].notna()
    return frame.loc[mask, key].nunique()


def _enrollment_coverage(frame, key, n_records, n_students):
    """Build one column of the enrollment table for ``frame``.

    Returns four formatted cells: records with / without a ``GradeLevel``,
    then distinct students having at least one record with / without it.
    """
    grade_known = frame["GradeLevel"].notna()
    enrolled, unenrolled = frame[grade_known], frame[~grade_known]
    return [
        _format_share(len(enrolled), n_records),
        _format_share(len(unenrolled), n_records),
        _format_share(enrolled[key].nunique(), n_students),
        _format_share(unenrolled[key].nunique(), n_students),
    ]


def analyze_dataset(df):
    """Print and display coverage summaries for the merged student dataset.

    Five sections are emitted, each as a printed heading followed by a table
    rendered with IPython's ``display``: dataset overview, enrollment coverage,
    per-assessment coverage, SAT combinations (all students) and non-SAT
    combinations (rows whose ``CurrentEnrollment`` equals ``"Y"``).

    Args:
        df: DataFrame with the columns ``mask_studentpersonkey``,
            ``CurrentEnrollment``, ``GradeLevel``, ``grad``, ``milestone``,
            ``illuminate``, ``sat`` and ``rit``. A null cell in an assessment
            column means "no data for that record".

    Returns:
        None. The function only prints and displays.

    Notes:
        * Ratios with a zero denominator are shown as ``"0 (0.0%)"`` (or
          ``"N/A"`` for averages), so an empty frame no longer raises
          ``ZeroDivisionError``.
        * A student is counted in a "Students ..." cell when at least one of
          their records qualifies, so "with" and "missing" student shares can
          add up to more than 100%.
    """
    import pandas as pd
    from IPython.display import display

    student_key = "mask_studentpersonkey"

    with pd.option_context(
        "display.max_columns", None,
        "display.width", None,
        "display.max_colwidth", None,
        "display.expand_frame_repr", False,
    ):
        print("COMPREHENSIVE STUDENT DATASET ANALYSIS")
        print("=" * 70)

        # Headline counts for all rows and for the currently enrolled subset.
        total_records = len(df)
        unique_students = df[student_key].nunique()

        current_students_df = df[df["CurrentEnrollment"] == "Y"]
        total_current_records = len(current_students_df)
        unique_current_students = current_students_df[student_key].nunique()

        # --- Dataset overview -------------------------------------------------
        overview_data = {
            "Metric": [
                "Total Records",
                "Unique Students",
                "Average Records per Student",
                "Current Student Records",
                "Unique Current Students",
                "Average Records per Current Student",
            ],
            "Value": [
                f"{total_records:,}",
                f"{unique_students:,}",
                _format_mean(total_records, unique_students),
                f"{total_current_records:,}",
                f"{unique_current_students:,}",
                _format_mean(total_current_records, unique_current_students),
            ],
        }

        print("\nDATASET OVERVIEW:")
        display(pd.DataFrame(overview_data))

        # --- Enrollment coverage (a record is "enrolled" when GradeLevel is set)
        enrollment_data = {
            "Metric": [
                "Records with Enrollment",
                "Records missing Enrollment",
                "Students with Enrollment",
                "Students missing Enrollment",
            ],
            "All Students": _enrollment_coverage(
                df, student_key, total_records, unique_students
            ),
            "Current Students": _enrollment_coverage(
                current_students_df,
                student_key,
                total_current_records,
                unique_current_students,
            ),
        }

        print("\nENROLLMENT DATA COVERAGE:")
        display(pd.DataFrame(enrollment_data))

        # --- Per-assessment coverage -----------------------------------------
        assessments = {
            "Graduation Data": "grad",
            "GA Milestones": "milestone",
            "Illuminate": "illuminate",
            "SAT Scores": "sat",
            "RIT Scores": "rit",
        }

        assessment_data = []
        for name, col in assessments.items():
            with_data = df[df[col].notna()]
            current_with_data = current_students_df[current_students_df[col].notna()]
            assessment_data.append({
                "Assessment": name,
                "All Students - Records": _format_share(len(with_data), total_records),
                "All Students - Students": _format_share(
                    with_data[student_key].nunique(), unique_students
                ),
                "Current Students - Records": _format_share(
                    len(current_with_data), total_current_records
                ),
                "Current Students - Students": _format_share(
                    current_with_data[student_key].nunique(), unique_current_students
                ),
            })

        print("\nASSESSMENT DATA COVERAGE:")
        display(pd.DataFrame(assessment_data))

        # Every non-empty subset of the non-SAT assessments, shared by both
        # combination tables below.
        non_sat_assessments = {k: v for k, v in assessments.items() if v != "sat"}
        non_sat_subsets = list(_name_column_subsets(non_sat_assessments))

        # --- SAT combinations (all students) ---------------------------------
        sat_combo_data = [
            {
                "Combination": f"SAT + {' + '.join(names)}",
                "All Students": _format_share(
                    _students_with_all(df, ["sat"] + cols, student_key),
                    unique_students,
                ),
            }
            for names, cols in non_sat_subsets
        ]

        print("\nSAT COMBINATION ANALYSIS:")
        display(pd.DataFrame(sat_combo_data))

        # --- Non-SAT combinations (current students) -------------------------
        print("\nNON-SAT COMBINATION ANALYSIS (CURRENT STUDENTS):")
        if unique_current_students > 0:
            current_combo_data = [
                {
                    "Combination": " + ".join(names),
                    "Current Students": _format_share(
                        _students_with_all(current_students_df, cols, student_key),
                        unique_current_students,
                    ),
                }
                for names, cols in non_sat_subsets
            ]
            display(pd.DataFrame(current_combo_data))
        else:
            print("No current students found in dataset.")

# fmt: on

# Summarise record and student coverage of the loaded profile table.
analyze_dataset(student_profile)

COMPREHENSIVE STUDENT DATASET ANALYSIS

DATASET OVERVIEW:


Unnamed: 0,Metric,Value
0,Total Records,497758.0
1,Unique Students,152883.0
2,Average Records per Student,3.26
3,Current Student Records,87376.0
4,Unique Current Students,87376.0
5,Average Records per Current Student,1.0



ENROLLMENT DATA COVERAGE:


Unnamed: 0,Metric,All Students,Current Students
0,Records with Enrollment,"465,126 (93.4%)","87,376 (100.0%)"
1,Records missing Enrollment,"32,632 (6.6%)",0 (0.0%)
2,Students with Enrollment,"151,039 (98.8%)","87,376 (100.0%)"
3,Students missing Enrollment,"24,734 (16.2%)",0 (0.0%)



ASSESSMENT DATA COVERAGE:


Unnamed: 0,Assessment,All Students - Records,All Students - Students,Current Students - Records,Current Students - Students
0,Graduation Data,"29,316 (5.9%)","29,316 (19.2%)","7,241 (8.3%)","7,241 (8.3%)"
1,GA Milestones,"171,690 (34.5%)","86,976 (56.9%)",629 (0.7%),629 (0.7%)
2,Illuminate,"272,781 (54.8%)","120,820 (79.0%)","70,369 (80.5%)","70,369 (80.5%)"
3,SAT Scores,"27,107 (5.4%)","21,137 (13.8%)","5,863 (6.7%)","5,863 (6.7%)"
4,RIT Scores,"117,128 (23.5%)","82,381 (53.9%)","23,989 (27.5%)","23,989 (27.5%)"



SAT COMBINATION ANALYSIS:


Unnamed: 0,Combination,All Students
0,SAT + Graduation Data,73 (0.0%)
1,SAT + GA Milestones,"6,892 (4.5%)"
2,SAT + Illuminate,"5,395 (3.5%)"
3,SAT + RIT Scores,"11,729 (7.7%)"
4,SAT + Graduation Data + GA Milestones,62 (0.0%)
5,SAT + Graduation Data + Illuminate,16 (0.0%)
6,SAT + Graduation Data + RIT Scores,51 (0.0%)
7,SAT + GA Milestones + Illuminate,"3,904 (2.6%)"
8,SAT + GA Milestones + RIT Scores,"4,462 (2.9%)"
9,SAT + Illuminate + RIT Scores,"3,240 (2.1%)"



NON-SAT COMBINATION ANALYSIS (CURRENT STUDENTS):


Unnamed: 0,Combination,Current Students
0,Graduation Data,"7,241 (8.3%)"
1,GA Milestones,629 (0.7%)
2,Illuminate,"70,369 (80.5%)"
3,RIT Scores,"23,989 (27.5%)"
4,Graduation Data + GA Milestones,88 (0.1%)
5,Graduation Data + Illuminate,"6,883 (7.9%)"
6,Graduation Data + RIT Scores,"3,766 (4.3%)"
7,GA Milestones + Illuminate,301 (0.3%)
8,GA Milestones + RIT Scores,312 (0.4%)
9,Illuminate + RIT Scores,"20,539 (23.5%)"


In [6]:
# Row-level boolean flags: which data sources are populated on each record,
# and which records belong to currently enrolled students.
contains_enrolment = ~student_profile["GradeLevel"].isna()
contains_sat = ~student_profile["sat"].isna()
contains_illuminate = ~student_profile["illuminate"].isna()
contains_milestone = ~student_profile["milestone"].isna()
contains_rit = ~student_profile["rit"].isna()
current_students = student_profile["CurrentEnrollment"].eq("Y")

In [None]:
# Modelling rows: records with enrollment info and an SAT result, backed by
# Illuminate or RIT data (or both).
has_illuminate_or_rit = contains_illuminate | contains_rit
data = student_profile.loc[contains_enrolment & contains_sat & has_illuminate_or_rit]

# Distinct students kept, their spread over grade levels, and a sample of rows.
print(data["mask_studentpersonkey"].nunique())
display(data.groupby(["GradeLevel"])[["mask_studentpersonkey"]].nunique())
data.head()

13149


Unnamed: 0_level_0,mask_studentpersonkey
GradeLevel,Unnamed: 1_level_1
10,774
11,6692
12,8192
7,2
8,2
9,64


Unnamed: 0,mask_studentpersonkey,GradeLevel,CurrentEnrollment,FinalPrimaryEnrollmentForYearFlag,RetainedFlag,EnrollmentReasonDesc,WithDrawalReasonDesc,SchoolYearNumberFall,SchoolYearNumberSpring,SchoolStatusDesc,SchoolDetailReportName,SchoolDetailLevelDesc,SchoolStatusDescNextYear,SchoolDetailReportNameNextYear,SchoolDetailLevelDescNextYear,ActiveGiftedStudentResultRecordFlag,grad,milestone,illuminate,sat,rit
52,1389182,10,N,Y,N,Continuing in same school,Year End,2023,2024,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,,,,"NestedData(MathScore=[500], VerbalScore=[480],...","NestedData(TestDurationMinutes=[75, 56], TestR..."
53,1389182,11,Y,Y,N,Continuing in same school,Year End,2024,2025,Open,Johns Creek High,High School,Open,Johns Creek High,High School,,,,,"NestedData(MathScore=[550], VerbalScore=[600],...","NestedData(TestDurationMinutes=[70], TestRITSc..."
201,1390795,11,Y,Y,N,Continuing in same school,Year End,2024,2025,Open,North Springs High,High School,Open,North Springs High,High School,,,,NestedData(responsedatevalue=[datetime.date(20...,"NestedData(MathScore=[560], VerbalScore=[550],...",
375,1392398,11,Y,Y,N,Continuing in same school,Year End,2024,2025,Open,Milton High,High School,Open,Milton High,High School,Y,,,NestedData(responsedatevalue=[datetime.date(20...,"NestedData(MathScore=[610], VerbalScore=[680],...",
592,1395568,12,N,Y,N,Entered From Another State or U.S. Territory,High School Graduation,2022,2023,Open,North Springs High,High School,,,,,,,,"NestedData(MathScore=[560], VerbalScore=[560],...","NestedData(TestDurationMinutes=[80, 74], TestR..."


In [8]:
# Gather the feature functions exposed by each feature module.
index_feat_funcs = get_feature_functions_from_module(index)
student_feat_funcs = get_feature_functions_from_module(student)
illuminate_feat_funcs = get_feature_functions_from_module(illuminate)
rit_feat_funcs = get_feature_functions_from_module(rit)
targets_feat_funcs = get_feature_functions_from_module(targets)

# Concatenate the per-module collections in a fixed order: index, student,
# illuminate, rit, targets.
# NOTE(review): the Sparkle cell further down rebinds `features` to a DataFrame,
# so that cell only works right after this one has been (re-)run. Consider a
# distinct name for this collection.
features = (
    index_feat_funcs
    + student_feat_funcs
    + illuminate_feat_funcs
    + rit_feat_funcs
    + targets_feat_funcs
)

In [None]:
# Build a Sparkle pipeline from the collected feature functions, point it at
# the selected rows in `data`, and materialise the result as a pandas frame.
# (Presumably one row per student and school year — confirm in src.feasy.sparkle.)
# NOTE(review): `features` enters this cell as the feature-function collection
# and leaves it as a DataFrame, so re-running this cell alone would pass the
# DataFrame to Sparkle.
sparkle = Sparkle(features).source(data, from_dataframe=True)
features = sparkle.to_pandas()
features.head()

Unnamed: 0,SchoolYearFall,mask_studentpersonkey,RetainedFlag,GradeLevel,is_gifted,grade_mode_numeric,improvement_first_to_last,is_on_sat_grade_level,last_percent_correct,low_mastery_rate_below_50,mastery_rate_above_80,math_improvement_first_to_last,math_low_mastery_rate_below_50,math_mastery_rate_above_80,math_mean_item_pct_correct,math_percent_extension,math_percent_reteach,math_percent_review_practice,math_slope_percent_correct_over_time,math_std_item_pct_correct,math_total_points_earned,math_total_points_possible,math_weighted_percent_correct,mean_item_percent_correct,mean_standard_percent_correct,mean_std_pct_ela,mean_std_pct_math,num_assessments,num_items,percent_extension,percent_reteach,percent_review_practice,slope_percent_correct_over_time,std_item_percent_correct,total_points_earned,total_points_possible,verbal_improvement_first_to_last,verbal_low_mastery_rate_below_50,verbal_mastery_rate_above_80,verbal_mean_item_pct_correct,verbal_percent_extension,verbal_percent_reteach,verbal_percent_review_practice,verbal_slope_percent_correct_over_time,verbal_std_item_pct_correct,verbal_total_points_earned,verbal_total_points_possible,verbal_weighted_percent_correct,weighted_overall_percent_correct,count_quintile_avg,count_quintile_hiavg,count_quintile_high,count_quintile_low,count_quintile_lowavg,max_rit_score,mean_percent_correct,mean_percentile,mean_rit_score,mean_test_duration,min_rit_score,num_tests,pct_quintile_high,std_percent_correct,std_percentile,std_rit_score,std_test_duration,sat_math_score,sat_verbal_score
0,2023,1389182,N,10,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,247.0,51.0,82.5,243.0,65.5,239.0,2,1.0,2.0,1.5,4.0,9.5,500,480
1,2024,1389182,N,11,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,248.0,51.0,79.0,248.0,70.0,248.0,1,0.0,0.0,0.0,0.0,0.0,550,600
2,2024,1390795,N,11,N,11,8.33,1,75.0,0.44,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.7464,66.6668,66.6668,0.0,25,25,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,8.33,0.44,0.48,71.7464,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,0.718905,0.718905,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,560,550
3,2024,1392398,N,11,Y,11,-5.0,1,75.0,0.25,0.65625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.653125,80.989688,80.989688,0.0,32,32,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,-5.0,0.25,0.65625,77.653125,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,0.77821,0.77821,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,610,680
4,2022,1395568,N,12,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,256.0,56.0,87.0,251.5,77.0,247.0,2,1.0,7.0,1.0,4.5,3.0,560,560


In [10]:
# Column names shared by the feature-building cells below.
STUDENT = "mask_studentpersonkey"  # student identifier column
YEAR = "SchoolYearFall"  # school-year column of each feature row
TARGET_MATH = "sat_math_score"  # prediction target: SAT math score
TARGET_VERBAL = "sat_verbal_score"  # prediction target: SAT verbal score

In [11]:
# Collapse the per-year feature rows into one feature vector per student.
# X_index carries the student key for each row of X.
X_index, X = create_student_features(
    features,
    student_col=STUDENT,
    year_col=YEAR,
    target_cols=[TARGET_MATH, TARGET_VERBAL],
)

# Targets: each student's most recent non-null SAT scores.
# * Sort by (student, year) so `.last()` follows the school year rather than
#   the incidental row order of `features`.
# * Select rows by X_index's student keys so Y lines up with X row-for-row
#   instead of relying on both happening to be in sorted-key order. A student
#   missing from `features` raises KeyError rather than silently misaligning.
Y = (
    features.sort_values([STUDENT, YEAR])
    .groupby(STUDENT)[[TARGET_MATH, TARGET_VERBAL]]
    .last()
    .loc[X_index[STUDENT].tolist()]
    .reset_index(drop=True)
)

assert len(Y) == len(X), "X and Y must contain one row per student, in the same order"

In [12]:
# Preview the student index, the feature matrix and the targets, in that order.
display(X_index.head(), X.head(), Y.head())

Unnamed: 0,mask_studentpersonkey
0,1389182
1,1390795
2,1392398
3,1395568
4,1395578


Unnamed: 0,RetainedFlag_latest,GradeLevel_latest,is_gifted_latest,grade_mode_numeric_latest,improvement_first_to_last_latest,is_on_sat_grade_level_latest,last_percent_correct_latest,low_mastery_rate_below_50_latest,mastery_rate_above_80_latest,math_improvement_first_to_last_latest,math_low_mastery_rate_below_50_latest,math_mastery_rate_above_80_latest,math_mean_item_pct_correct_latest,math_percent_extension_latest,math_percent_reteach_latest,math_percent_review_practice_latest,math_slope_percent_correct_over_time_latest,math_std_item_pct_correct_latest,math_total_points_earned_latest,math_total_points_possible_latest,math_weighted_percent_correct_latest,mean_item_percent_correct_latest,mean_standard_percent_correct_latest,mean_std_pct_ela_latest,mean_std_pct_math_latest,num_assessments_latest,num_items_latest,percent_extension_latest,percent_reteach_latest,percent_review_practice_latest,slope_percent_correct_over_time_latest,std_item_percent_correct_latest,total_points_earned_latest,total_points_possible_latest,verbal_improvement_first_to_last_latest,verbal_low_mastery_rate_below_50_latest,verbal_mastery_rate_above_80_latest,verbal_mean_item_pct_correct_latest,verbal_percent_extension_latest,verbal_percent_reteach_latest,verbal_percent_review_practice_latest,verbal_slope_percent_correct_over_time_latest,verbal_std_item_pct_correct_latest,verbal_total_points_earned_latest,verbal_total_points_possible_latest,verbal_weighted_percent_correct_latest,weighted_overall_percent_correct_latest,count_quintile_avg_latest,count_quintile_hiavg_latest,count_quintile_high_latest,count_quintile_low_latest,count_quintile_lowavg_latest,max_rit_score_latest,mean_percent_correct_latest,mean_percentile_latest,mean_rit_score_latest,mean_test_duration_latest,min_rit_score_latest,num_tests_latest,pct_quintile_high_latest,std_percent_correct_latest,std_percentile_latest,std_rit_score_latest,std_test_duration_latest,grade_mode_numeric_delta,improvement_first_to_last_delta,is_on_sat_grade_leve
l_delta,last_percent_correct_delta,low_mastery_rate_below_50_delta,mastery_rate_above_80_delta,math_improvement_first_to_last_delta,math_low_mastery_rate_below_50_delta,math_mastery_rate_above_80_delta,math_mean_item_pct_correct_delta,math_percent_extension_delta,math_percent_reteach_delta,math_percent_review_practice_delta,math_slope_percent_correct_over_time_delta,math_std_item_pct_correct_delta,math_total_points_earned_delta,math_total_points_possible_delta,math_weighted_percent_correct_delta,mean_item_percent_correct_delta,mean_standard_percent_correct_delta,mean_std_pct_ela_delta,mean_std_pct_math_delta,num_assessments_delta,num_items_delta,percent_extension_delta,percent_reteach_delta,percent_review_practice_delta,slope_percent_correct_over_time_delta,std_item_percent_correct_delta,total_points_earned_delta,total_points_possible_delta,verbal_improvement_first_to_last_delta,verbal_low_mastery_rate_below_50_delta,verbal_mastery_rate_above_80_delta,verbal_mean_item_pct_correct_delta,verbal_percent_extension_delta,verbal_percent_reteach_delta,verbal_percent_review_practice_delta,verbal_slope_percent_correct_over_time_delta,verbal_std_item_pct_correct_delta,verbal_total_points_earned_delta,verbal_total_points_possible_delta,verbal_weighted_percent_correct_delta,weighted_overall_percent_correct_delta,count_quintile_avg_delta,count_quintile_hiavg_delta,count_quintile_high_delta,count_quintile_low_delta,count_quintile_lowavg_delta,max_rit_score_delta,mean_percent_correct_delta,mean_percentile_delta,mean_rit_score_delta,mean_test_duration_delta,min_rit_score_delta,num_tests_delta,pct_quintile_high_delta,std_percent_correct_delta,std_percentile_delta,std_rit_score_delta,std_test_duration_delta,num_snapshots,grade_mode_numeric_slope,improvement_first_to_last_slope,is_on_sat_grade_level_slope,last_percent_correct_slope,low_mastery_rate_below_50_slope,mastery_rate_above_80_slope,math_improvement_first_to_last_slope,math_low_mastery_rate_below_50_slope,math_mastery_rate_abo
ve_80_slope,math_mean_item_pct_correct_slope,math_percent_extension_slope,math_percent_reteach_slope,math_percent_review_practice_slope,math_slope_percent_correct_over_time_slope,math_std_item_pct_correct_slope,math_total_points_earned_slope,math_total_points_possible_slope,math_weighted_percent_correct_slope,mean_item_percent_correct_slope,mean_standard_percent_correct_slope,mean_std_pct_ela_slope,mean_std_pct_math_slope,num_assessments_slope,num_items_slope,percent_extension_slope,percent_reteach_slope,percent_review_practice_slope,slope_percent_correct_over_time_slope,std_item_percent_correct_slope,total_points_earned_slope,total_points_possible_slope,verbal_improvement_first_to_last_slope,verbal_low_mastery_rate_below_50_slope,verbal_mastery_rate_above_80_slope,verbal_mean_item_pct_correct_slope,verbal_percent_extension_slope,verbal_percent_reteach_slope,verbal_percent_review_practice_slope,verbal_slope_percent_correct_over_time_slope,verbal_std_item_pct_correct_slope,verbal_total_points_earned_slope,verbal_total_points_possible_slope,verbal_weighted_percent_correct_slope,weighted_overall_percent_correct_slope,count_quintile_avg_slope,count_quintile_hiavg_slope,count_quintile_high_slope,count_quintile_low_slope,count_quintile_lowavg_slope,max_rit_score_slope,mean_percent_correct_slope,mean_percentile_slope,mean_rit_score_slope,mean_test_duration_slope,min_rit_score_slope,num_tests_slope,pct_quintile_high_slope,std_percent_correct_slope,std_percentile_slope,std_rit_score_slope,std_test_duration_slope
0,N,11,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,248.0,51.0,79.0,248.0,70.0,248.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,-2,0,0,1.0,0.0,-3.5,5.0,4.5,9.0,-1,-1.0,-2.0,-1.5,-4.0,-9.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.0,0.0,0.0,1.0,0.0,-3.5,5.0,4.5,9.0,-1.0,-1.0,-2.0,-1.5,-4.0,-9.5
1,N,11,N,11,8.33,1,75.0,0.44,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.7464,66.6668,66.6668,0.0,25,25,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,8.33,0.44,0.48,71.7464,0.48,0.48,0.04,0.203162,3.533114,289.0,402.0,0.718905,0.718905,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,N,11,Y,11,-5.0,1,75.0,0.25,0.65625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.653125,80.989688,80.989688,0.0,32,32,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,-5.0,0.25,0.65625,77.653125,0.625,0.3125,0.0625,-0.497966,9.389948,400.0,514.0,0.77821,0.77821,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,N,12,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,0,0,256.0,56.0,87.0,251.5,77.0,247.0,2,1.0,7.0,1.0,4.5,3.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Y,12,N,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,238.0,53.0,61.0,234.0,42.0,230.0,2,0.0,3.0,2.0,4.0,4.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,sat_math_score,sat_verbal_score
0,550,600
1,560,550
2,610,680
3,560,560
4,510,450


In [13]:
# Persist the modelling tables next to the cleaned inputs.
for frame, destination in (
    (X_index, "data/cleaned/X_index.parquet"),
    (X, "data/cleaned/X.parquet"),
    (Y, "data/cleaned/Y.parquet"),
):
    frame.to_parquet(destination)