In [None]:
from sklearn.model_selection import KFold
from joblib import dump, load
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [None]:
pred_cols = ["Mean_BMI","Median_BMI","Unmet_Need_Rate","Under5_Mortality_Rate","Skilled_Birth_Attendant_Rate","Stunted_Rate"]
!wget -O "test.parquet.gzip" "https://iitkgpacin-my.sharepoint.com/:u:/g/personal/rushilv_kgpian_iitkgp_ac_in/EXPYjAeWagtJkXj5Jq16m0MBjYsOb9GGXW2ZFMBwh0J9ZA?download=1"
!wget -O "train.parquet.gzip" "https://iitkgpacin-my.sharepoint.com/:u:/g/personal/rushilv_kgpian_iitkgp_ac_in/EbSDQP8WT9RIjIisg7PEa9UB7tuSf43J2_h3W0KgxK47ug?download=1"
!wget -O "low_imp_features.joblib" "https://iitkgpacin-my.sharepoint.com/:u:/g/personal/rushilv_kgpian_iitkgp_ac_in/EahAFwUUFahChNa0WX6C8-IBMgV0jy51QtCIAaGE3rdyXQ?download=1"

In [None]:
def mcrmse(y_true, y_pred):
    """Return the mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed separately for every target column and the
    per-column values are then averaged with equal weight. Taking the square
    root of an MSE that has already been averaged across columns gives a
    different (larger or equal) number, dominated by the worst column.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_targets)
        Ground-truth values.
    y_pred : array-like of the same shape as ``y_true``
        Predicted values.

    Returns
    -------
    numpy.float64
        Average of the per-column RMSE values (lower is better).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Treat a 1-D vector as a single target column so that both the
    # single-output and multi-output cases share one code path.
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                actual.shape, predicted.shape
            )
        )

    # axis=0 averages over samples, leaving one MSE per target column.
    per_column_rmse = np.sqrt(np.mean((actual - predicted) ** 2, axis=0))
    return np.mean(per_column_rmse)

In [None]:
# Read the training data and keep only the first row for any index label that occurs more than once.
train = pd.read_parquet('./train.parquet.gzip')
train = train.loc[~train.index.duplicated(keep='first')]
# Separate the multi-output targets from the feature columns.
y = train[pred_cols]
X = train.drop(columns=pred_cols)

In [None]:
# X and y were already extracted from the training frame, so drop the
# remaining reference to it and ask the collector to reclaim memory now.
import gc

del train
gc.collect()

In [None]:
# NOTE(security): joblib.load unpickles the file and can therefore execute
# arbitrary code. The file is downloaded earlier in this notebook, so only run
# this cell when that download source is trusted.
low_imp_features = load('./low_imp_features.joblib')
# Build a new feature frame without the low-importance columns; X itself is left unchanged.
X_new = X.drop(low_imp_features, axis=1)
X_new

In [None]:
# Exhaustive hyper-parameter search for the random forest, scored with
# 5-fold cross-validated MCRMSE (lower is better).
# NOTE: 3 x 3 x 3 = 27 parameter combinations times 5 folds is 135 forest
# fits with thousands of trees each, so this cell is very expensive to run.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

parameters = {
    "n_estimators":[8000, 8500, 9000],
    "max_depth": [20,22,25],
    "max_features" : [0.45,0.55,0.6]
}

# The splitter is seeded, so every call to split() yields the same folds.
# Build them once instead of re-creating the splitter for each combination;
# this also guarantees all combinations are compared on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(X_new))

# Start from +inf so the first evaluated combination always becomes the
# incumbent, whatever the scale of the metric is.
curr_best_score = np.inf
best_params = {}
# iterate over all combinations of parameters
for v1 in parameters['max_depth']:
    for v2 in parameters['n_estimators']:
        for v3 in parameters['max_features']:
            model.set_params(max_depth=v1, n_estimators=v2, max_features=v3)
            scores = []
            for train_index, test_index in folds:
                X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                scores.append(mcrmse(y_test, y_pred))
            # Average over the folds once and reuse it for logging and comparison.
            mean_score = np.average(scores)
            print("max_depth: {}, n_estimators: {}, max_features: {}, mcrmse: {}".format(v1, v2, v3, mean_score))
            if mean_score < curr_best_score:
                curr_best_score = mean_score
                best_params = {"max_depth":v1, "n_estimators":v2, "max_features":v3}

In [None]:
# dump(model, '/kaggle/input/mchmlmic-new/test_rf_new.joblib')

In [None]:
# Refit on the full training set with the best hyper-parameters found by the search.
# Fail loudly if the search did not record a winner; unpacking an empty dict
# would otherwise silently train a forest with default settings.
if not best_params:
    raise RuntimeError("best_params is empty; run the hyper-parameter search cell first")

# The dict must be unpacked into keyword arguments. Passed positionally it
# would be bound to the first constructor parameter (n_estimators) instead of
# setting max_depth / n_estimators / max_features.
model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
model.fit(X_new, y)

In [None]:
# Read the test data and remove the same low-importance columns that were dropped from the training features.
test = pd.read_parquet('./test.parquet.gzip')
test = test.drop(low_imp_features, axis=1)
test

In [None]:
# Predict the targets for the test features with the fitted model.
# Note: this rebinds y_pred, which the cross-validation loop also used for per-fold predictions.
y_pred = model.predict(test)

In [None]:
# Wrap the predictions in a frame with one column per target.
out = pd.DataFrame(y_pred, columns=pred_cols)
# Put the identifier in front of the target columns. The values come from the
# test frame's index -- presumably the DHSID of each row; verify against the data.
out.insert(0, 'DHSID', test.index)
# Write the result without the positional row index.
out.to_csv('./new_test.csv', index=False)

In [None]:
# Show the frame that was written to ./new_test.csv as this cell's output.
out