In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import gc
import pandas as pd

In [None]:
# Input locations, all relative to the notebook's working directory.
main_path = '../data/main_data/'            # parquet dataset loaded with dask
other_path = '../data/other_data/'          # parquet dataset whose categorical columns get one-hot encoded
label_path = '../data/training_label.csv'   # training targets, indexed by DHSID after loading
sub_path = '../data/sample submission.csv'  # submission template, indexed by DHSID after loading
nulls_path = '../data/nulls.joblib'         # not referenced by the cells visible here

# Target columns: read from the label file for training, dropped from the test frame.
pred_cols = [
    "Mean_BMI",
    "Median_BMI",
    "Unmet_Need_Rate",
    "Under5_Mortality_Rate",
    "Skilled_Birth_Attendant_Rate",
    "Stunted_Rate",
]

In [None]:
# Open the main parquet dataset as a Dask DataFrame; the cells below trigger
# evaluation explicitly with .compute().
df = dd.read_parquet(main_path)

In [None]:
# Per-column count of missing values.  The expression is built first; only
# its evaluation runs under the progress bar.
null_counts_lazy = df.isnull().sum()
with ProgressBar():
    nulls = null_counts_lazy.compute()

In [None]:
# Keep only the feature columns that contain no missing values.  Selecting the
# complete columns (instead of dropping the incomplete ones) yields the same
# frame, but the cell can be re-run without a KeyError for columns that were
# already removed on a previous run.
complete_cols = nulls.index[nulls == 0].tolist()
df = df[complete_cols]

# Training targets, indexed by DHSID so they can be joined to the features.
# Stunted_Rate is forced to float64 -- presumably because dask's sampled dtype
# inference would otherwise mis-type it; confirm against the CSV.
label = dd.read_csv(
    label_path,
    usecols=pred_cols + ['DHSID'],
    dtype={'Stunted_Rate': 'float64'},
).set_index('DHSID')

In [None]:
# Attach the features to every labelled row (left join on the index), discard
# rows with any missing value, and materialise the result in memory.
with ProgressBar():
    train = label.join(df, how='left').dropna().compute()
train

In [None]:
# One-hot encode the categorical columns of the auxiliary dataset; every other
# column is passed through unchanged.
other_data = pd.read_parquet(other_path)
transformer = make_column_transformer(
    (OneHotEncoder(), ['DHSCC', 'DHSREGNA', 'URBAN_RURA']),
    remainder='passthrough',
)
transformed = transformer.fit_transform(other_data)

# ColumnTransformer only returns a scipy sparse matrix when the stacked result
# is sparse enough (its sparse_threshold); otherwise the result is already a
# dense ndarray, on which .todense() would raise AttributeError.  Densify only
# when needed so both cases work.
if hasattr(transformed, 'toarray'):
    transformed = transformed.toarray()

# Rows keep the index of other_data so the encoded frame can be joined onto
# the train/test frames by index.
transformed_df = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out(),
    index=other_data.index,
)

In [None]:
# Add the encoded auxiliary columns (left join on the index), then keep only
# the first row for each index value.
train = train.join(transformed_df, how='left')
train = train[~train.index.duplicated(keep='first')]

# The file name advertises gzip, but to_parquet compresses with snappy unless
# told otherwise; request gzip explicitly so name and content agree.
train.to_parquet('../data/train.parquet.gzip', compression='gzip')

In [None]:
# Release the training frame (already written to disk) before building the
# test frame, and ask the collector to reclaim it right away.
del train
gc.collect()

In [None]:
# Use the submission template as the list of rows to predict: index it by
# DHSID and left-join the features onto it.
sub = dd.read_csv(sub_path)
sub = sub.set_index('DHSID')
with ProgressBar():
    test = sub.join(df, how='left').compute()
test

In [None]:
# Add the encoded auxiliary columns (left join on the index) and remove the
# target columns that came in with the submission template.
test = (
    test.join(transformed_df, how='left')
        .drop(pred_cols, axis=1)
)
# Keep only the first row for each index value.
test = test[~test.index.duplicated(keep='first')]

# The file name advertises gzip, but to_parquet compresses with snappy unless
# told otherwise; request gzip explicitly so name and content agree.
test.to_parquet('../data/test.parquet.gzip', compression='gzip')