In [1]:
from sklearn.model_selection import KFold
from joblib import dump, load
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Names of the target columns: split off from the training frame as `y`
# and reused as the column names of the prediction output.
pred_cols = [
    "Mean_BMI",
    "Median_BMI",
    "Unmet_Need_Rate",
    "Under5_Mortality_Rate",
    "Skilled_Birth_Attendant_Rate",
    "Stunted_Rate",
]

In [3]:
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed separately for every target column and the
    per-column values are then averaged with equal weight. Taking the square
    root of the MSE pooled over all columns is a different (and generally
    larger) quantity, so the columns must be kept apart until after the
    square root.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_targets)
        Ground-truth values (DataFrame, ndarray or nested sequence).
    y_pred : array-like of shape (n_samples,) or (n_samples, n_targets)
        Predicted values, same layout as ``y_true``.

    Returns
    -------
    numpy.float64
        Average of the per-column RMSE values.

    Raises
    ------
    ValueError
        If the inputs have different shapes or contain no samples.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Treat a 1-D input as a single target column so that (n,) and (n, 1)
    # inputs can be mixed.
    if true_arr.ndim == 1:
        true_arr = true_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred have different shapes: {} vs {}".format(
                true_arr.shape, pred_arr.shape
            )
        )
    if true_arr.shape[0] == 0:
        raise ValueError("mcrmse requires at least one sample")

    # axis=0 averages over samples only, leaving one MSE per target column.
    per_column_rmse = np.sqrt(np.mean((true_arr - pred_arr) ** 2, axis=0))
    return np.mean(per_column_rmse)

In [4]:
# Read the training table, then keep only the first row for every index label
# that occurs more than once.
train = pd.read_parquet('../data/train.parquet.gzip')
train = train.loc[~train.index.duplicated(keep='first')]

# Targets are the `pred_cols` columns; every remaining column is a feature.
y = train[pred_cols]
X = train.drop(columns=pred_cols)

In [5]:
# X and y were derived from `train` in the previous cell; drop the name and
# ask the garbage collector to run.
import gc

del train
gc.collect()

0

In [6]:
# Column names flagged as low-importance, stored earlier with joblib.
low_imp_features = load('../data/low_imp_features.joblib')

# Reduced feature matrix without those columns (X itself is left untouched).
X_new = X.drop(low_imp_features, axis=1)
X_new

Unnamed: 0_level_0,Es_min_max@CAS/IGSNRR/PML/V2&timestamped,basic_demographic_characteristics_min_max@CIESIN/GPWv411/GPW_Basic_Demographic_Characteristics&timestamped,national_identifier_grid@CIESIN/GPWv411/GPW_National_Identifier_Grid,national_identifier_grid_max@CIESIN/GPWv411/GPW_National_Identifier_Grid,national_identifier_grid_min@CIESIN/GPWv411/GPW_National_Identifier_Grid,urban-coverfraction_mean@COPERNICUS/Landcover/100m/Proba-V-C3/Global&timestamped,SO2_column_number_density_15km_max_max@COPERNICUS/S5P/NRTI/L3_SO2&timestamped,solar_azimuth_angle_max_max@COPERNICUS/S5P/NRTI/L3_SO2&timestamped,ozone_tropospheric_mixing_ratio_median@COPERNICUS/S5P/OFFL/L3_O3_TCL&timestamped,ozone_tropospheric_vertical_column_median@COPERNICUS/S5P/OFFL/L3_O3_TCL&timestamped,...,onehotencoder__DHSCC_DR,onehotencoder__DHSCC_ET,onehotencoder__DHSCC_HT,onehotencoder__DHSCC_MZ,onehotencoder__DHSCC_TZ,onehotencoder__URBAN_RURA_R,onehotencoder__URBAN_RURA_U,remainder__DHSYEAR,remainder__LATNUM,remainder__LONGNUM
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AL200800000001,0.079005,0.533688,8.0,8.0,8.0,15.942918,0.001929,179.557310,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.822650,19.838322
AL200800000002,0.090663,0.165650,8.0,8.0,8.0,1.059678,0.001774,179.592150,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.696846,20.007555
AL200800000004,0.085710,0.814697,8.0,8.0,8.0,10.031118,0.001929,179.768220,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,40.798930,19.863338
AL200800000010,0.105490,8.942595,8.0,8.0,8.0,22.716291,0.001986,179.742550,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,40.698520,19.950300
AL200800000011,0.106758,8.940361,8.0,8.0,8.0,41.684960,0.001609,179.742550,21.800858,-0.994597,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,40.717968,19.935875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZW201500000395,0.007476,0.000000,716.0,716.0,716.0,0.000000,0.000404,-18.019940,51.753593,0.011510,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-17.166506,29.718370
ZW201500000396,0.002424,0.000000,716.0,716.0,716.0,80.522360,0.000402,-17.357134,53.740124,0.011052,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2015.0,-17.915289,31.156115
ZW201500000397,0.038481,0.000000,716.0,716.0,716.0,0.298818,0.000336,-16.403381,53.757790,0.010933,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-18.379501,31.872288
ZW201500000398,0.002700,0.000000,716.0,716.0,716.0,0.000000,0.000681,-16.150686,53.238773,0.011527,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2015.0,-16.660612,29.850649


In [8]:
# Manual grid search over random-forest hyper-parameters. Every combination is
# scored with the mean `mcrmse` over a seeded 5-fold split; the best (lowest)
# score and its parameters end up in `curr_best_score` / `best_params`.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

parameters = {
    "n_estimators":[8000, 8500, 9000],
    "max_depth": [20,22,25],
    "max_features" : [0.45,0.55,0.6]
}

# Start from +inf so the first evaluated combination always becomes the
# incumbent, whatever the scale of the metric.
curr_best_score = np.inf
best_params = {}

# The splitter is seeded, so the folds are the same for every combination:
# build them once instead of once per combination.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(X_new))

# iterate over all combinations of parameters
for max_depth in parameters['max_depth']:
    for n_estimators in parameters['n_estimators']:
        for max_features in parameters['max_features']:
            candidate = {"max_depth": max_depth, "n_estimators": n_estimators, "max_features": max_features}
            model.set_params(**candidate)

            scores = []
            for train_index, test_index in folds:
                X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                scores.append(mcrmse(y_test, y_pred))

            # Average the fold scores once and reuse the value below.
            mean_score = np.average(scores)
            print("max_depth: {}, n_estimators: {}, max_features: {}, mcrmse: {}".format(max_depth, n_estimators, max_features, mean_score))
            if mean_score < curr_best_score:
                curr_best_score = mean_score
                best_params = candidate

KeyboardInterrupt: 

In [None]:
# dump(model, '/kaggle/input/mchmlmic-new/test_rf_new.joblib')

In [None]:
# Read the test table and remove the same low-importance columns that were
# dropped from the training features.
test = pd.read_parquet('../data/test.parquet.gzip').drop(columns=low_imp_features)
test

In [None]:
# No `cv` search object is defined in any visible cell, so reading
# `cv.best_estimator_` would raise NameError on a fresh run. Build the final
# model explicitly instead: refit on the full training data with the best
# hyper-parameters recorded by the manual grid search.
if not best_params:
    # The grid-search cell only fills `best_params` after a combination has
    # been fully evaluated; fail loudly rather than fit silent defaults.
    raise RuntimeError("best_params is empty: run the grid-search cell before predicting")

model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)
model.fit(X_new, y)
y_pred = model.predict(test)

In [None]:
# Assemble the submission table: one column per target, with the test-set
# index placed first as the DHSID column, then write it out as CSV.
out = pd.DataFrame(y_pred, columns=pred_cols)
out.insert(0, 'DHSID', test.index)
out.to_csv('../submission/new_test.csv', index=False)

In [None]:
# Display the frame that was written to ../submission/new_test.csv.
out