In [None]:
!pip3 install 'mljar-supervised'

In [33]:
# import required libraries
import pandas as pd
import numpy as np
import gc
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from joblib import dump, load
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import KNNImputer

# models
from supervised.automl import AutoML
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Set the data paths, target columns, thresholds and the random seed

In [34]:
# --- Raw inputs ---------------------------------------------------------
input_path = '/kaggle/input/maternal-and-child-health-monitoring-in-lmics/gee_features.csv'
input_subset_path = '/kaggle/input/maternal-and-child-health-monitoring-in-lmics/gee_features_10pct.csv'
label_path = '../data/training_label.csv'
sub_path = '../data/sample submission.csv'

# --- Intermediate parquet datasets written by this notebook --------------
main_path = '../data/main_data/'
other_path = '../data/other_data/'

# --- Prepared train / test tables and the list of discarded features -----
train_path = '../data/train.parquet.gzip'
test_path = '../data/test.parquet.gzip'
low_imp_features_path = '../data/low_imp_features.joblib'

# --- Output folder for submission files ----------------------------------
submission_path = '../submission/'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it does not fail
# if the folder already exists and it also creates missing parent folders.
os.makedirs(submission_path, exist_ok=True)

# Target columns that are predicted and written to the submission
pred_cols = ["Mean_BMI","Median_BMI","Unmet_Need_Rate","Under5_Mortality_Rate",
             "Skilled_Birth_Attendant_Rate","Stunted_Rate"]

# Features with a random-forest importance at or below this value are dropped
IMPORTANCE_THRESHOLD = 0.001
# Features whose absolute correlation with an earlier feature exceeds this value are dropped
CORRELATION_THRESHOLD = 0.7
IMPUTATION_NUM_NEIGHBORS = 75 # found experimentally
RANDOM_STATE = 42

In [35]:
# Error metric
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged separately for every output column
    (``multioutput='raw_values'``), the square root of each column's value is
    taken, and the resulting per-column RMSEs are averaged into one score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, laid out like ``y_true``.

    Returns
    -------
    The average of the per-column RMSE values.
    """
    per_column_rmse = np.sqrt(
        mean_squared_error(y_true, y_pred, multioutput='raw_values')
    )
    return np.average(per_column_rmse)

# Read data and create train and test files

In [None]:
def _is_required_column(name):
    """Return True for CSV columns that should be read (everything not on the skip list)."""
    return name not in [
        'new_ind', 'index', 'ADM1DHS', 'ADM1FIPS', 'ADM1FIPSNA', 'ADM1NAME',
        'ADM1SALBCO', 'ADM1SALBNA', 'ADM1SALNA', 'ALT_DEM', 'ALT_GPS', 'CCFIPS',
        'DATUM', 'F21', 'F22', 'F23', 'ZONECO', 'ZONENA', 'DHSCLUST', 'DHSREGCO', 'SOURCE',
        'key1', 'key2', 'key3',
    ]


with ProgressBar():
    # Read the feature CSV without the columns that are not required and
    # index the resulting dask frame by the DHSID string identifier.
    df = (
        dd.read_csv(
            input_path,
            sample=1000000,
            usecols=_is_required_column,
            dtype={'DHSID': 'str'},
        )
        .set_index('DHSID')
    )

In [None]:
# Split the feature frame into two parquet datasets; parquet loads faster than CSV.
# NOTE(review): the split is positional - it assumes the first six columns are the
# survey metadata columns (e.g. DHSCC, DHSREGNA, URBAN_RURA) that a later cell
# one-hot encodes from `other_path` - TODO confirm the column order of the CSV.
imp_cols = df.columns[:6]
df1 = df[imp_cols]
df1.to_parquet(other_path) # save the first six columns separately ("other" data)

# Everything after the first six columns stays in `df` and is the main feature set
df = df.drop(imp_cols, axis=1)
df.to_parquet(main_path) # save the main data

In [None]:
# Release the helper frame now that it has been written to parquet
del df1
gc.collect()

In [None]:
# Training labels: only the target columns plus the DHSID key, indexed by DHSID
label = (
    dd.read_csv(
        label_path,
        usecols=[*pred_cols, 'DHSID'],
        dtype={'Stunted_Rate': 'float64'},
    )
    .set_index('DHSID')
)

# Sample submission, indexed by DHSID as well
sub = dd.read_csv(sub_path).set_index('DHSID')

In [None]:
# Count the missing values of every feature column
with ProgressBar():
    nulls = df.isnull().sum().compute()

# Remove every column that contains at least one missing value
df = df.drop(nulls.index[(nulls > 0).to_numpy()], axis=1)

In [None]:
# Attach the features to the labels (left join on the DHSID index), discard
# rows that contain any missing value and materialise the result in pandas.
with ProgressBar():
    train = label.join(df, how='left').dropna().compute()
train

In [None]:
# comment this cell out if you want to use the entire gee_features.csv for training
# use only the indexes in gee_features_10pct.csv
use_idx = pd.read_csv(input_subset_path, usecols=['DHSID'])

# Sort the shared ids: iterating a set of strings yields an order that changes
# between interpreter sessions, which would make the row order - and with it
# every seeded model and split downstream - non-reproducible.
train_idx = sorted(set(train.index) & set(use_idx['DHSID'].values))

# .loc keeps DHSID as the index, so no reset_index()/set_index() round trip is needed
train = train.loc[train_idx]

In [None]:
# One-hot encode the categorical columns of the "other" data; all remaining
# columns are passed through unchanged.
other_data = pd.read_parquet(other_path)
transformer = make_column_transformer(
    (OneHotEncoder(), ['DHSCC', 'DHSREGNA', 'URBAN_RURA']),
    remainder='passthrough',
    # sparse_threshold=0 makes the transformer always return a dense array.
    # With the default threshold the result is sparse or dense depending on
    # its density, and calling .todense() on a dense array fails.
    sparse_threshold=0,
)
transformed = transformer.fit_transform(other_data)
transformed_df = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out(),
    index=other_data.index,  # keep the original index so the frame can be joined later
)

In [None]:
# Add the encoded columns to the training table (left join on the index)
train = train.join(transformed_df, how='left')

# Keep only the first row of every index value that occurs more than once
train = train.loc[~train.index.duplicated(keep='first')]

# Persist the prepared training table
train.to_parquet(train_path)

In [None]:
# Release the training table; it is reloaded from parquet when needed
del train
gc.collect()

In [None]:
# Attach the features to the sample-submission rows (left join on the DHSID
# index) and materialise the result in pandas.
with ProgressBar():
    test = sub.join(df, how='left').compute()
test

In [None]:
# comment this cell out if you want to use the entire sample submission for testing
# only use indexes present in both gee_features_10pct and sample submission
# (relies on `use_idx`, which is read in the training-subset cell)

# Sort the shared ids: iterating a set of strings yields an order that changes
# between interpreter sessions, so an unsorted list would reorder the test
# rows from run to run.
test_idx = sorted(set(test.index) & set(use_idx['DHSID'].values))

# .loc keeps DHSID as the index, so no reset_index()/set_index() round trip is needed
test = test.loc[test_idx]
test

In [None]:
# Add the encoded columns and remove the target columns, which come from the
# sample submission and are not features.
test = (
    test
    .join(transformed_df, how='left')
    .drop(pred_cols, axis=1)
)
# Keep only the first row of every index value that occurs more than once
test = test[~test.index.duplicated(keep='first')]

# Fill missing test values with a KNN imputer fitted on the test table itself
# (author's note: 25 rows have missing data)
imputer = KNNImputer(n_neighbors=IMPUTATION_NUM_NEIGHBORS)
imputed_values = imputer.fit_transform(test)
test = pd.DataFrame(imputed_values, columns=test.columns, index=test.index)

# Persist the prepared test table
test.to_parquet(test_path)

# Find most important features

In [None]:
# Reload the prepared training table that an earlier cell wrote to `train_path`
train = pd.read_parquet(train_path)
train

In [None]:
# Features are every column that is not a target; targets are `pred_cols`
X = train.drop(columns=pred_cols)
y = train[pred_cols]

In [None]:
# Random forest fitted on all targets at once; its feature importances are
# read in the next cell. Parameters were found through manual fine tuning.
model = RandomForestRegressor(
    n_estimators=700,
    max_features=0.6,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)
model.fit(X, y)

In [None]:
# One row per feature: its name and the importance assigned by the fitted forest
feature_imp = pd.DataFrame(
    dict(names=model.feature_names_in_, imp=model.feature_importances_)
)
feature_imp

In [None]:
# Names of the features whose importance does not exceed the threshold ...
is_low_importance = feature_imp['imp'] <= IMPORTANCE_THRESHOLD
low_imp_features = feature_imp.loc[is_low_importance, 'names'].values

# ... which are removed from the feature table
X = X.drop(columns=low_imp_features)

In [38]:
# Among the remaining features, find the strongly correlated ones.
corr_matrix = X.corr().abs()

# Keep only the part strictly above the diagonal so that every feature pair is
# looked at once; all other entries become NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(above_diagonal)

# A column is dropped when its correlation with any column placed before it
# exceeds the threshold.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]

# Store low-importance and correlated features together for the later cells
low_imp_features = np.append(low_imp_features, to_drop)
dump(low_imp_features, low_imp_features_path)

['../data/low_imp_features.joblib']

In [None]:
# Release the fitted forest and the training table
del model, train
gc.collect()

# Load Data

In [None]:
# Stored list of features to discard (low importance or highly correlated)
low_imp_features = load(low_imp_features_path)

train = pd.read_parquet(train_path)

# Targets, and the features that remain after removing the targets and the
# discarded features
y = train[pred_cols]
X = train.drop(columns=pred_cols).drop(columns=low_imp_features)

In [None]:
# Hold out 20% of the rows, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Load the prepared test table and remove the same discarded features as for training
test = pd.read_parquet(test_path).drop(columns=low_imp_features)
test

# Train an MLJAR AutoML model per target to decide which models to use

In [None]:
# Fit one MLJAR AutoML model per target and collect its test-set predictions.
# The result frame is indexed like `test`, so every prediction stays tied to
# its DHSID instead of an anonymous row number.
automl_sub = pd.DataFrame(index=test.index)

for i, pcol in enumerate(pred_cols):
    print(f'[STARTED] ({i+1}/{len(pred_cols)}) LABEL: {pcol}')

    # Train only on the rows for which this target is present. X and y come
    # from the same table, so the target's index selects the matching rows.
    y_pcol = y[pcol].dropna()
    X_pcol = X.loc[y_pcol.index]

    automl = AutoML(mode="Compete", random_state=RANDOM_STATE, explain_level=0, n_jobs=-1)
    automl.fit(X_pcol, y_pcol)

    automl_sub[pcol] = automl.predict(test)

    print(f'[COMPLETED] ({i+1}/{len(pred_cols)}) LABEL: {pcol}')

    # Release the fitted AutoML object before the next target is trained
    del automl
    gc.collect()

# Create a pipeline of models, each trained on the residual error of the previous one

In [5]:
# Reload the training table without the stored list of discarded features
low_imp_features = load(low_imp_features_path)
train = pd.read_parquet(train_path).drop(columns=low_imp_features)
train

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate,Es_min_max@CAS/IGSNRR/PML/V2&timestamped,basic_demographic_characteristics_min_max@CIESIN/GPWv411/GPW_Basic_Demographic_Characteristics&timestamped,national_identifier_grid@CIESIN/GPWv411/GPW_National_Identifier_Grid,national_identifier_grid_max@CIESIN/GPWv411/GPW_National_Identifier_Grid,...,onehotencoder__DHSCC_DR,onehotencoder__DHSCC_ET,onehotencoder__DHSCC_HT,onehotencoder__DHSCC_MZ,onehotencoder__DHSCC_TZ,onehotencoder__URBAN_RURA_R,onehotencoder__URBAN_RURA_U,remainder__DHSYEAR,remainder__LATNUM,remainder__LONGNUM
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AL200800000017,22.97,22.16,0.00,12.96,100.00,50.00,0.095504,0.108678,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,41.538258,20.274292
AL200800000030,23.02,22.48,18.18,6.45,100.00,0.00,0.076007,0.410152,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,41.502155,20.214310
AL200800000036,24.18,24.45,0.00,0.00,100.00,0.00,0.000000,0.300010,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.642326,20.956264
AL200800000049,22.91,22.82,10.00,16.22,28.57,16.67,0.027352,8.749317,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,41.672134,20.434908
AL200800000055,25.14,23.00,7.69,0.00,50.00,33.33,0.024255,8.749317,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,41.683105,20.425337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZW201500000256,21.37,21.00,10.00,13.04,69.23,17.65,0.005765,0.000000,716.0,716.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-17.620686,32.575363
ZW201500000276,23.16,22.63,0.00,4.17,57.14,20.00,0.000000,0.000000,716.0,716.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-18.288078,30.565697
ZW201500000310,21.53,21.46,40.00,6.45,48.00,32.00,0.000000,0.000000,716.0,716.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-20.779175,31.674726
ZW201500000342,24.26,24.24,0.00,8.89,60.00,43.75,0.026485,0.000000,716.0,716.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-18.239857,32.094963


In [6]:
# Features are every column that is not a target; targets are `pred_cols`
X = train.drop(columns=pred_cols)
y = train[pred_cols]

In [13]:
# Models used by the pipeline, keyed by name:
#   rf1, rf2      - two random forests
#   lgbm_<target> - one LightGBM regressor per target column
#   xgb1          - one XGBoost regressor
model_dict = {
    'rf1': RandomForestRegressor(verbose=1, random_state=RANDOM_STATE, n_jobs=-1),
    'rf2': RandomForestRegressor(verbose=1, random_state=RANDOM_STATE, n_jobs=-1),
    **{
        f'lgbm_{col}': lgb.LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)
        for col in pred_cols
    },
    'xgb1': xgb.XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1),
}

In [20]:
# Stage 1: fit the first random forest on the targets; y1 is what remains of
# the targets after subtracting its predictions on the training rows.
model_dict['rf1'].fit(X, y)
y1 = y - model_dict['rf1'].predict(X)
y1

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000017,-0.5268,-0.6689,-9.1352,3.3714,12.5265,12.5952
AL200800000030,-0.9117,-1.0315,-5.3607,1.3966,6.0156,-12.9762
AL200800000036,-0.0375,0.2813,-8.6349,-1.8295,8.0401,-7.8071
AL200800000049,-0.5668,-0.4171,-9.1379,4.9973,-17.8147,-2.5876
AL200800000055,-0.1996,-0.6600,-17.8068,-0.2975,0.5176,5.7085
...,...,...,...,...,...,...
ZW201500000256,-0.3216,-0.2462,-5.2640,2.5778,2.2149,-7.1953
ZW201500000276,-0.0149,-0.0151,-5.4624,-0.7344,-7.8752,-2.6954
ZW201500000310,-0.4437,-0.3141,3.6503,-0.2939,-2.5310,-2.1916
ZW201500000342,0.0492,0.2541,-5.7534,-0.0175,-1.1638,4.5171


In [21]:
# Stage 2: fit one LightGBM model per target on the stage-1 residual; y2 is the
# residual that is left after subtracting that model's predictions.
stage2_residuals = {}
for col in pred_cols:
    lgbm_model = model_dict[f'lgbm_{col}']
    lgbm_model.fit(X, y1[col])
    stage2_residuals[col] = y1[col] - lgbm_model.predict(X)
y2 = pd.DataFrame(stage2_residuals)
y2

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000017,-0.274949,-0.407166,-3.477802,2.265123,6.013902,4.282078
AL200800000030,-0.624034,-0.845337,-2.036387,0.669412,0.529072,-6.772463
AL200800000036,0.057000,0.509150,-3.749703,-1.542804,3.205270,-3.082792
AL200800000049,-0.238029,-0.154289,-5.724695,3.561441,-10.733027,-2.420710
AL200800000055,0.133752,-0.208536,-12.624439,-0.677992,6.599503,4.622338
...,...,...,...,...,...,...
ZW201500000256,-0.229073,-0.115616,-1.862026,2.171207,-0.634795,-2.492237
ZW201500000276,-0.017332,-0.026112,-3.068841,-0.574844,-9.399763,0.081197
ZW201500000310,-0.306156,-0.181518,5.502621,0.368072,-1.740492,-1.551250
ZW201500000342,-0.032715,-0.105186,-2.773545,0.378806,-0.343299,2.497173


In [22]:
# Stage 3: fit the XGBoost model on the stage-2 residual; y3 is the residual
# that is left after subtracting its predictions.
model_dict['xgb1'].fit(X, y2)
y3 = y2 - model_dict['xgb1'].predict(X)
y3

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000017,-0.098267,-0.138867,-0.674538,0.694920,3.621891,0.913203
AL200800000030,-0.354531,-0.424533,-1.485514,0.077268,-0.755077,-1.535490
AL200800000036,0.180215,0.375277,-1.758262,-0.855971,2.926349,-1.708782
AL200800000049,-0.091232,-0.021130,-0.151996,1.212171,-0.056160,-0.388357
AL200800000055,0.094223,-0.065116,-4.816310,-0.316870,2.088949,1.752859
...,...,...,...,...,...,...
ZW201500000256,-0.173698,-0.160978,-0.197494,0.920231,-0.939515,-0.382930
ZW201500000276,-0.005235,-0.063588,-1.369392,-0.041633,-6.799103,0.275369
ZW201500000310,-0.252938,-0.116746,4.332861,0.180215,-1.129459,-1.206625
ZW201500000342,-0.039297,-0.116089,-0.978324,0.074830,0.715170,2.205443


In [23]:
# Stage 4: fit the second random forest on the stage-3 residual; y4 is the
# residual that is left after subtracting its predictions.
model_dict['rf2'].fit(X, y3)
y4 = y3 - model_dict['rf2'].predict(X)
y4

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000017,-0.013417,-0.038376,-0.163919,0.288486,1.588939,0.265246
AL200800000030,-0.183545,-0.201917,-0.892050,0.050383,-0.472395,-0.686124
AL200800000036,0.090832,0.153009,-0.974847,-0.356703,1.052175,-0.505002
AL200800000049,-0.074645,-0.060737,-0.373467,0.539966,0.421025,0.102672
AL200800000055,0.050884,-0.027029,-1.813891,-0.292245,0.952809,0.792816
...,...,...,...,...,...,...
ZW201500000256,-0.064068,-0.042619,-0.170919,0.394497,-0.537679,-0.035889
ZW201500000276,-0.002546,-0.033507,-0.445136,0.035185,-2.299900,-0.005351
ZW201500000310,-0.159692,-0.134385,1.973633,0.031337,-0.461760,-0.351536
ZW201500000342,-0.022233,-0.036755,-0.329407,-0.030800,0.429434,0.666506


In [24]:
# Reload the test table without the stored list of discarded features
low_imp_features = load(low_imp_features_path)
test = pd.read_parquet(test_path).drop(columns=low_imp_features)
test

Unnamed: 0_level_0,Es_min_max@CAS/IGSNRR/PML/V2&timestamped,basic_demographic_characteristics_min_max@CIESIN/GPWv411/GPW_Basic_Demographic_Characteristics&timestamped,national_identifier_grid@CIESIN/GPWv411/GPW_National_Identifier_Grid,national_identifier_grid_max@CIESIN/GPWv411/GPW_National_Identifier_Grid,national_identifier_grid_min@CIESIN/GPWv411/GPW_National_Identifier_Grid,urban-coverfraction_mean@COPERNICUS/Landcover/100m/Proba-V-C3/Global&timestamped,SO2_column_number_density_15km_max_max@COPERNICUS/S5P/NRTI/L3_SO2&timestamped,solar_azimuth_angle_max_max@COPERNICUS/S5P/NRTI/L3_SO2&timestamped,ozone_tropospheric_mixing_ratio_median@COPERNICUS/S5P/OFFL/L3_O3_TCL&timestamped,ozone_tropospheric_vertical_column_median@COPERNICUS/S5P/OFFL/L3_O3_TCL&timestamped,...,onehotencoder__DHSCC_DR,onehotencoder__DHSCC_ET,onehotencoder__DHSCC_HT,onehotencoder__DHSCC_MZ,onehotencoder__DHSCC_TZ,onehotencoder__URBAN_RURA_R,onehotencoder__URBAN_RURA_U,remainder__DHSYEAR,remainder__LATNUM,remainder__LONGNUM
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AL200800000003,0.103369,0.444329,8.00000,8.0,8.0,3.955272,0.001707,179.530600,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.750040,19.974262
AL200800000005,0.096352,0.541894,8.00000,8.0,8.0,0.901964,0.001859,179.618840,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.746124,19.843885
AL200800000007,0.098821,3.778325,8.00000,8.0,8.0,12.266235,0.001827,179.544780,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,40.701607,19.989952
AL200800000008,0.105989,8.942595,8.00000,8.0,8.0,50.352406,0.001986,179.544780,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,40.695984,19.965063
AL200800000009,0.101052,8.941481,8.00000,8.0,8.0,34.656734,0.001986,179.557740,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,40.698685,19.981623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZW201500000382,0.012040,0.000000,716.00000,716.0,716.0,0.000000,0.000443,-15.686703,48.700848,0.010540,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-19.939451,31.822948
ZW201500000383,0.000000,0.000000,716.00000,716.0,716.0,-1.000000,0.000312,-16.927180,49.692770,0.010532,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-19.851952,28.240479
ZW201500000386,0.000000,0.000000,-19.36183,716.0,710.0,0.000000,0.000339,-16.038418,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-22.126260,29.639608
ZW201500000390,0.000359,0.000000,716.00000,716.0,716.0,84.699066,0.000501,-17.178234,53.740124,0.011052,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2015.0,-17.878082,31.033348


In [27]:
# Start the test predictions with the output of the first random forest
preds = pd.DataFrame(
    data=model_dict['rf1'].predict(test),
    columns=pred_cols,
)
preds

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
0,24.9114,24.6913,19.4603,3.6059,87.5615,21.8919
1,24.6902,24.5045,16.8791,4.0502,86.7981,23.2724
2,24.9353,24.6159,19.3229,3.4244,88.6924,25.2448
3,24.8113,24.4940,21.9906,3.1113,81.2832,24.9113
4,24.9017,24.6255,25.9381,3.2377,81.5878,26.2608
...,...,...,...,...,...,...
14995,23.1107,22.6796,22.8139,7.0072,67.3092,32.4192
14996,24.2754,23.3344,28.0006,5.1017,74.7658,29.1850
14997,27.4711,27.1226,39.1695,3.4685,88.2611,26.3564
14998,24.5905,24.0457,13.4892,6.6499,90.3714,22.8766


In [28]:
# Add each target's LightGBM prediction to that target's running total.
# Running this cell again adds the same prediction a second time.
for col in pred_cols:
    preds[col] += model_dict[f'lgbm_{col}'].predict(test)
preds

Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
0,24.972866,25.042898,15.101037,2.970883,94.079820,25.054855
1,24.986148,24.707248,15.163706,3.538091,92.947142,28.423441
2,24.549603,24.432204,14.508880,3.464966,95.478526,23.861746
3,24.454616,24.482753,17.470083,2.873707,86.047327,21.206588
4,24.653867,24.670558,22.534328,3.119478,86.351927,23.722997
...,...,...,...,...,...,...
14995,23.013464,22.586485,21.853539,6.043822,71.475512,30.833538
14996,24.225025,23.369992,31.124069,4.724461,76.778349,26.141170
14997,27.281040,27.336364,39.882172,3.078248,90.000682,22.916211
14998,24.692579,24.273037,12.216770,6.899357,89.845862,22.021324


In [29]:
# Add the XGBoost prediction to the running total.
# Running this cell again adds the same prediction a second time.
preds = preds.add(model_dict['xgb1'].predict(test))
preds

Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
0,25.096067,24.819080,15.754020,3.011399,93.455974,25.782323
1,24.832233,24.233194,16.826073,2.994716,94.744874,30.247612
2,24.399313,23.870017,13.226793,2.871283,96.342351,21.637362
3,24.535311,24.261295,16.635746,3.220057,88.910449,21.722482
4,24.630802,24.165169,21.705151,3.243689,89.215049,27.265764
...,...,...,...,...,...,...
14995,22.757708,22.522341,22.444566,6.151807,72.703480,31.882236
14996,24.321268,23.261803,31.884604,5.198976,78.277929,25.877034
14997,27.275755,27.683991,39.741490,2.828869,91.630014,25.054479
14998,24.748685,24.348033,12.086316,6.738099,90.214159,22.414154


In [30]:
# Add the second random forest's prediction to the running total.
# Running this cell again adds the same prediction a second time.
preds = preds.add(model_dict['rf2'].predict(test))
preds

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
0,25.079031,24.797873,15.835770,3.157825,93.939933,26.214609
1,24.826929,24.206917,17.028859,3.043030,95.613658,30.784774
2,24.366379,23.837751,13.575075,3.135699,96.610832,22.085178
3,24.468431,24.193798,16.991675,3.451371,89.468716,22.048143
4,24.572781,24.100198,22.179035,3.460923,89.650712,27.621590
...,...,...,...,...,...,...
14995,22.746167,22.580778,22.552179,6.197806,73.074744,31.797826
14996,24.311648,23.213622,32.266318,5.174810,78.029385,26.165323
14997,27.276337,27.702707,39.915346,2.860419,91.912232,24.886646
14998,24.655764,24.311403,12.935721,6.618597,89.938529,23.059235


In [31]:
# Attach the DHSID of every test row (predictions are in test-row order) and
# put it in front of the target columns.
preds = preds.assign(DHSID=test.index)[['DHSID', *pred_cols]]
preds

Unnamed: 0,DHSID,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
0,AL200800000003,25.079031,24.797873,15.835770,3.157825,93.939933,26.214609
1,AL200800000005,24.826929,24.206917,17.028859,3.043030,95.613658,30.784774
2,AL200800000007,24.366379,23.837751,13.575075,3.135699,96.610832,22.085178
3,AL200800000008,24.468431,24.193798,16.991675,3.451371,89.468716,22.048143
4,AL200800000009,24.572781,24.100198,22.179035,3.460923,89.650712,27.621590
...,...,...,...,...,...,...,...
14995,ZW201500000382,22.746167,22.580778,22.552179,6.197806,73.074744,31.797826
14996,ZW201500000383,24.311648,23.213622,32.266318,5.174810,78.029385,26.165323
14997,ZW201500000386,27.276337,27.702707,39.915346,2.860419,91.912232,24.886646
14998,ZW201500000390,24.655764,24.311403,12.935721,6.618597,89.938529,23.059235


In [32]:
# Write the submission into the configured output folder instead of repeating
# the folder name as a literal, so a change to `submission_path` is honoured here.
preds.to_csv(os.path.join(submission_path, 'final_sub.csv'), index=False)