In [1]:
import json
import os

import numpy as np
import plotly.express as px
import polars as pl
from sklearn.preprocessing import LabelEncoder

In [2]:
# Lazily scan the features CSV; nothing is materialized until `.collect()` is
# called on a frame derived from it further down.
df = pl.scan_csv("Data/training_set_features.csv")

In [3]:
# Lazily scan the labels CSV; it is joined onto the features by `respondent_id`.
labels = pl.scan_csv("Data/training_set_labels.csv")

In [4]:
# Left join: every feature row is kept and gains the label columns of the
# matching `respondent_id` (null where no label row matches).
fts = df.join(labels, on="respondent_id", how="left")

In [5]:
# One independent LabelEncoder per categorical column, so each column keeps its
# own class list. In the cells visible here the employment_industry and
# employment_occupation columns are dropped before encoding, so those two
# encoders are created but never fitted.
(
    le_age_group,
    le_education,
    le_race,
    le_sex,
    le_income_poverty,
    le_marital_status,
    le_rent_or_own,
    le_employment_status,
    le_hhs_geo_region,
    le_census_msa,
    le_employment_industry,
    le_employment_occupation,
) = (LabelEncoder() for _ in range(12))

In [6]:
# "No nulls" variant: drop three columns, discard every row that still contains
# a null, then replace each categorical column with the output of its encoder's
# fit_transform. The frame is lazy, so nothing runs (and no encoder is fitted)
# until `.collect()` is called.
fts_nonull = (
    fts
    .drop('health_insurance', 'employment_industry', 'employment_occupation')
    .drop_nulls()
    .with_columns(
        [
            pl.col(column).map_batches(encoder.fit_transform)
            for column, encoder in (
                ("age_group", le_age_group),
                ("education", le_education),
                ("race", le_race),
                ("sex", le_sex),
                ("income_poverty", le_income_poverty),
                ("marital_status", le_marital_status),
                ("rent_or_own", le_rent_or_own),
                ("employment_status", le_employment_status),
                ("hhs_geo_region", le_hhs_geo_region),
                ("census_msa", le_census_msa),
            )
        ]
    )
)

In [7]:
# Materialize the lazy pipeline and persist it as Parquet. `.collect()` is where
# the query actually executes, so this is also the point at which the label
# encoders are fitted; anything that reads their `classes_` must run after it.
# Create the output folder first so a fresh checkout does not fail only after
# the whole query has been computed.
os.makedirs("ETL", exist_ok=True)
fts_nonull.collect().write_parquet("ETL/fts_nonull.parquet")

In [8]:
# Snapshot each fitted encoder's class list as plain Python lists so the
# code -> category mapping can be serialized to JSON. Position in each list is
# the integer code the encoder assigned.
le_fts_nonull = {
    key: encoder.classes_.tolist()
    for key, encoder in (
        ("le_age_group", le_age_group),
        ("le_education", le_education),
        ("le_race", le_race),
        ("le_sex", le_sex),
        ("le_income_poverty", le_income_poverty),
        ("le_marital_status", le_marital_status),
        ("le_rent_or_own", le_rent_or_own),
        ("le_employment_status", le_employment_status),
        ("le_hhs_geo_region", le_hhs_geo_region),
        ("le_census_msa", le_census_msa),
    )
}

In [9]:
# Persist the encoder class lists next to the Parquet file they describe.
with open("ETL/le_fts_nonull.json", "w") as writefile:
    json.dump(le_fts_nonull, writefile)

In [10]:
# "With nulls" variant: same column drop and categorical encoding as the
# drop-nulls dataset, but rows containing nulls are kept.
#
# NOTE(review): this refits the same encoder objects that were fitted for the
# drop-nulls dataset. Once this cell has run, their `classes_` describe THIS
# dataset, so re-running the earlier cell that snapshots them would capture the
# wrong mapping.
# NOTE(review): because no rows are dropped here, the encoders may receive null
# values; how LabelEncoder represents them depends on the installed
# scikit-learn version -- confirm the resulting codes/classes are intended.
_null_encoders = {
    "age_group": le_age_group,
    "education": le_education,
    "race": le_race,
    "sex": le_sex,
    "income_poverty": le_income_poverty,
    "marital_status": le_marital_status,
    "rent_or_own": le_rent_or_own,
    "employment_status": le_employment_status,
    "hhs_geo_region": le_hhs_geo_region,
    "census_msa": le_census_msa,
}

fts_null = fts.drop('health_insurance', 'employment_industry', 'employment_occupation')
fts_null = fts_null.with_columns(
    [
        pl.col(column).map_batches(encoder.fit_transform)
        for column, encoder in _null_encoders.items()
    ]
)

# Create the output folder first so the write cannot fail on a missing
# directory after the whole lazy query has already been computed.
os.makedirs("ETL", exist_ok=True)
# `.collect()` executes the query, which is when the encoders are (re)fitted.
fts_null.collect().write_parquet("ETL/fts_null.parquet")

# Must come after the collect above: `classes_` only exists once fitted.
# Keys are "le_<column>", matching the layout of the drop-nulls mapping.
le_fts_null = {
    f"le_{column}": encoder.classes_.tolist()
    for column, encoder in _null_encoders.items()
}


In [11]:
# Persist the encoder class lists next to the Parquet file they describe.
with open("ETL/le_fts_null.json", "w") as writefile:
    json.dump(le_fts_null, writefile)