In [179]:
import pandas as pd
import numpy as np
import polars as pl # Polars is a high-performance DataFrame library, designed to provide fast and efficient data processing capabilities

In [180]:
# Define custom season data type
season_dtype = pl.Enum(['Spring', 'Summer', 'Fall', 'Winter'])

# Read in data with custom season type
train = (
    pl.read_csv('data/train.csv')
    .with_columns(pl.col('^.*Season$').cast(season_dtype))
)

test = (
    pl.read_csv('data/test.csv')
    .with_columns(pl.col('^.*Season$').cast(season_dtype))
)

In [183]:
def clean_data(df):

    # Age: replace negatives and out-of-bounds values with null
    df = df.with_columns(
        pl.when((pl.col("Basic_Demos-Age") >= 0) & (pl.col("Basic_Demos-Age") <= 120))
        .then(pl.col("Basic_Demos-Age"))
        .otherwise(None)
        .alias("Basic_Demos-Age"),
        
        pl.when((pl.col("Physical-BMI") >= 10) & (pl.col("Physical-BMI") <= 60))
        .then(pl.col("Physical-BMI"))
        .otherwise(None)
        .alias("Physical-BMI"),
        
        pl.when((pl.col("BIA-BIA_BMI") >= 10) & (pl.col("BIA-BIA_BMI") <= 60))
        .then(pl.col("BIA-BIA_BMI"))
        .otherwise(None)
        .alias("BIA-BIA_BMI"),
    )

    # Replace invalid categorical values with NaN
    valid_values = {
        "Basic_Demos-Sex": [0, 1],
        "FGC-FGC_CU_Zone": [0, 1],
        "FGC-FGC_GSND_Zone": [1, 2, 3],
        "FGC-FGC_GSD_Zone": [1, 2, 3],
        "FGC-FGC_PU_Zone": [0, 1],
        "FGC-FGC_SRL_Zone": [0, 1],
        "FGC-FGC_SRR_Zone": [0, 1],
        "FGC-FGC_TL_Zone": [0, 1],
        "BIA-BIA_Activity_Level_num": [1, 2, 3, 4, 5],
        "BIA-BIA_Frame_num": [1, 2, 3],
        "PCIAT-PCIAT_01": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_02": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_03": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_04": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_05": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_06": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_07": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_08": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_09": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_10": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_11": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_12": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_13": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_14": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_15": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_16": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_17": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_18": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_19": [0, 1, 2, 3, 4, 5],
        "PCIAT-PCIAT_20": [0, 1, 2, 3, 4, 5],
    }
    for col, values in valid_values.items():
        if col in df.columns:
            df = df.with_columns(
                pl.when(pl.col(col).is_in(values))
                .then(pl.col(col))
                .otherwise(None)
                .alias(col)
            )

    
    # Drop rows with over 30% missing values
    # Calculate the threshold dynamically based on column count
    row_thresh = len(df.columns) * 0.3
    df = df.filter(
        df.select(pl.all().null_count()).to_series(0) <= row_thresh
    )
    
    return df

In [184]:
cleaned_data = clean_data(train)

In [186]:
print("Data cleaning complete. Summary:")
print(cleaned_data)
print(cleaned_data.describe())

Data cleaning complete. Summary:
shape: (3_960, 82)
┌──────────┬────────────────┬────────────────┬────────────────┬───┬────────────────┬────────────────┬───────────────┬──────┐
│ id       ┆ Basic_Demos-En ┆ Basic_Demos-Ag ┆ Basic_Demos-Se ┆ … ┆ SDS-SDS_Total_ ┆ PreInt_EduHx-S ┆ PreInt_EduHx- ┆ sii  │
│ ---      ┆ roll_Season    ┆ e              ┆ x              ┆   ┆ T              ┆ eason          ┆ computerinter ┆ ---  │
│ str      ┆ ---            ┆ ---            ┆ ---            ┆   ┆ ---            ┆ ---            ┆ net_…         ┆ i64  │
│          ┆ enum           ┆ i64            ┆ i64            ┆   ┆ i64            ┆ enum           ┆ ---           ┆      │
│          ┆                ┆                ┆                ┆   ┆                ┆                ┆ i64           ┆      │
╞══════════╪════════════════╪════════════════╪════════════════╪═══╪════════════════╪════════════════╪═══════════════╪══════╡
│ 00008ff9 ┆ Fall           ┆ 5              ┆ 0              ┆ … ┆ null 