https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Folder holding the competition CSV files, relative to the working directory.
dataset_path = Path("./dataset")

# List what is available before deciding which files to load.
dataset_files = [entry.name for entry in dataset_path.iterdir()]
print(dataset_files)

['train.csv', 'train_v2.csv', 'train_v1.csv', 'test_v1.csv', 'test.csv', 'sample_submission.csv']


In [3]:
# Folder that prediction CSVs are written to later in the notebook.
results_path = Path("./results")
# Create it up front: DataFrame.to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail at the first save.
results_path.mkdir(parents=True, exist_ok=True)
results_path.as_posix()

'results'

In [4]:
# Load the training set (the version that carries a `kfold` column), the test
# set and the sample submission from the dataset folder.
train_df, test_df, sample_submission_df = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in ("train_v2.csv", "test_v1.csv", "sample_submission.csv")
)

print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [5]:
def make_sub_file(test_ids, preds):
    """Assemble a submission table from property ids and predicted scores.

    Parameters
    ----------
    test_ids : sequence or pandas.Series
        Values for the ``Property_ID`` column.
    preds : sequence or numpy.ndarray
        Values for the ``Habitability_score`` column, in the same order.

    Returns
    -------
    pandas.DataFrame
        Two-column frame: ``Property_ID`` and ``Habitability_score``.
    """
    submission_columns = {"Property_ID": test_ids, "Habitability_score": preds}
    return pd.DataFrame(submission_columns)

## data preprocessing

In [6]:
# Model inputs: every training column except the row identifier, the target
# and the fold label. errors="ignore" keeps this tolerant of a missing column,
# as a plain filter would be.
useful_features = train_df.columns.drop(
    ["Property_ID", "Habitability_score", "kfold"], errors="ignore"
).tolist()
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

## model build - hyperparameter tuning

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
import optuna

from sklearn.metrics import r2_score, mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Columns routed to the one-hot encoder.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]

# Columns routed to the standard scaler.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

In [9]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones (first level dropped, unseen levels ignored at transform).
ct = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
    ]
)

In [16]:
def objective(trial):
    """Optuna objective: mean 5-fold validation MSE of an XGBoost regressor.

    Samples one set of XGBoost hyperparameters from ``trial``. Then, for each
    fold label 0-4 in ``train_df.kfold``, fits the preprocessing and the model
    on the remaining folds and scores the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors.
    """
    # Sample once per trial: none of these values depends on the fold, so the
    # sampling does not belong inside the fold loop. The suggestion order and
    # parameter names are kept, so ``study.best_params`` has the same keys.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100., log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100., log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.),
        "max_depth": trial.suggest_int("max_depth", 1, 7),
    }

    scores = []

    for fold in range(5):
        xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
        xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.loc[:, "Habitability_score"]
        yvalid = xvalid.loc[:, "Habitability_score"]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        # A fresh transformer per fold, fitted on the training part only, so
        # no statistics from the held-out fold reach the preprocessing.
        ct = ColumnTransformer([
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.transform(xvalid)

        model = XGBRegressor(random_state=13,
                             tree_method="gpu_hist",
                             gpu_id=0,
                             predictor="gpu_predictor",
                             n_estimators=11000,
                             early_stopping_rounds=300,
                             **params)
        # The held-out fold drives early stopping and is also the fold scored.
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], verbose=False)
        preds_valid = model.predict(xvalid)
        scores.append(mean_squared_error(yvalid, preds_valid))

    return np.mean(scores)

In [18]:
# Minimize the mean cross-validated MSE returned by `objective`.
# The sampler is seeded so the search can be repeated; without an explicit
# seed the TPE sampler draws a new random seed on every run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-08-20 17:13:35,314][0m A new study created in memory with name: no-name-b3ebd86d-5649-4bac-afca-291ec058a0aa[0m
[32m[I 2022-08-20 17:13:52,819][0m Trial 0 finished with value: 38.519805764329725 and parameters: {'learning_rate': 0.084751570090863, 'reg_lambda': 0.038421236249175206, 'reg_alpha': 2.3793977777560106e-08, 'subsample': 0.9833497591686431, 'colsample_bytree': 0.43409076591420337, 'max_depth': 7}. Best is trial 0 with value: 38.519805764329725.[0m
[32m[I 2022-08-20 17:14:08,189][0m Trial 1 finished with value: 43.47266628019754 and parameters: {'learning_rate': 0.051897898553999, 'reg_lambda': 1.4570606839004645, 'reg_alpha': 0.004449534502120412, 'subsample': 0.15073677859004608, 'colsample_bytree': 0.4843997172143105, 'max_depth': 4}. Best is trial 0 with value: 38.519805764329725.[0m
[32m[I 2022-08-20 17:15:02,458][0m Trial 2 finished with value: 52.851492974458 and parameters: {'learning_rate': 0.010284286792749142, 'reg_lambda': 9.602532992936622,

[32m[I 2022-08-20 17:25:53,753][0m Trial 23 finished with value: 36.37159060103258 and parameters: {'learning_rate': 0.012294152165910557, 'reg_lambda': 0.000713921750933471, 'reg_alpha': 1.6884696724666363e-07, 'subsample': 0.915522581607114, 'colsample_bytree': 0.5709590500123048, 'max_depth': 7}. Best is trial 23 with value: 36.37159060103258.[0m
[32m[I 2022-08-20 17:26:19,943][0m Trial 24 finished with value: 37.4394545847842 and parameters: {'learning_rate': 0.025076097898979344, 'reg_lambda': 0.009271267158532768, 'reg_alpha': 1.0381308768641429e-08, 'subsample': 0.9829031280676493, 'colsample_bytree': 0.5225183522820156, 'max_depth': 6}. Best is trial 23 with value: 36.37159060103258.[0m
[32m[I 2022-08-20 17:27:02,550][0m Trial 25 finished with value: 38.141284358898965 and parameters: {'learning_rate': 0.01054607287832101, 'reg_lambda': 0.00023355254255048546, 'reg_alpha': 1.2191213929145765e-07, 'subsample': 0.8971248469774057, 'colsample_bytree': 0.7474026864273043, '

[32m[I 2022-08-20 17:40:24,918][0m Trial 47 finished with value: 37.41823396398297 and parameters: {'learning_rate': 0.03856503724533874, 'reg_lambda': 0.37705251857643673, 'reg_alpha': 7.172875181447021e-06, 'subsample': 0.7798568067289212, 'colsample_bytree': 0.6434006011578239, 'max_depth': 6}. Best is trial 32 with value: 36.17957876448234.[0m
[32m[I 2022-08-20 17:41:20,396][0m Trial 48 finished with value: 36.65451390300442 and parameters: {'learning_rate': 0.016724778266428535, 'reg_lambda': 55.134251375527334, 'reg_alpha': 2.9244205917277154e-08, 'subsample': 0.867318511124328, 'colsample_bytree': 0.6952666312386475, 'max_depth': 7}. Best is trial 32 with value: 36.17957876448234.[0m
[32m[I 2022-08-20 17:42:00,575][0m Trial 49 finished with value: 37.86126217892952 and parameters: {'learning_rate': 0.02189343077576872, 'reg_lambda': 2.0170021759579417e-05, 'reg_alpha': 1.2427841053551338e-06, 'subsample': 0.6620642977403535, 'colsample_bytree': 0.4588086249794607, 'max_d

[32m[I 2022-08-20 17:55:32,914][0m Trial 71 finished with value: 36.305091323581266 and parameters: {'learning_rate': 0.012052904447991487, 'reg_lambda': 0.08778392289424465, 'reg_alpha': 4.982173484134771e-05, 'subsample': 0.4978630537850699, 'colsample_bytree': 0.8019137366070102, 'max_depth': 7}. Best is trial 56 with value: 35.86403387827121.[0m
[32m[I 2022-08-20 17:56:17,124][0m Trial 72 finished with value: 35.97705487714448 and parameters: {'learning_rate': 0.010025037283029765, 'reg_lambda': 0.04074198988056894, 'reg_alpha': 9.29364070063038e-06, 'subsample': 0.5790900551617592, 'colsample_bytree': 0.8982758541237553, 'max_depth': 7}. Best is trial 56 with value: 35.86403387827121.[0m
[32m[I 2022-08-20 17:56:50,764][0m Trial 73 finished with value: 36.02729681760337 and parameters: {'learning_rate': 0.013180312294354655, 'reg_lambda': 0.029240505873277773, 'reg_alpha': 7.196519943913401e-06, 'subsample': 0.5441041757221398, 'colsample_bytree': 0.9099272553308616, 'max_d

[32m[I 2022-08-20 18:09:59,221][0m Trial 95 finished with value: 35.79969072500958 and parameters: {'learning_rate': 0.010022943649539364, 'reg_lambda': 0.0035509952679005476, 'reg_alpha': 1.3331857872250535e-07, 'subsample': 0.7901016619882593, 'colsample_bytree': 0.9670976537445786, 'max_depth': 7}. Best is trial 95 with value: 35.79969072500958.[0m
[32m[I 2022-08-20 18:10:32,325][0m Trial 96 finished with value: 36.70939403195739 and parameters: {'learning_rate': 0.010993659973276086, 'reg_lambda': 0.012096937534467243, 'reg_alpha': 1.0135619508095019e-07, 'subsample': 0.8312056587133623, 'colsample_bytree': 0.9760482599372288, 'max_depth': 6}. Best is trial 95 with value: 35.79969072500958.[0m
[32m[I 2022-08-20 18:11:11,332][0m Trial 97 finished with value: 35.823375951361655 and parameters: {'learning_rate': 0.012055596707440922, 'reg_lambda': 0.002889952537191501, 'reg_alpha': 5.086924758989532e-08, 'subsample': 0.7996721941987092, 'colsample_bytree': 0.8836108071615677, 

In [19]:
# Hyperparameters of the best trial (lowest objective value); the per-fold
# training cell below unpacks this dict into XGBRegressor.
study.best_params

{'learning_rate': 0.010022943649539364,
 'reg_lambda': 0.0035509952679005476,
 'reg_alpha': 1.3331857872250535e-07,
 'subsample': 0.7901016619882593,
 'colsample_bytree': 0.9670976537445786,
 'max_depth': 7}

In [20]:
# Fresh, unfitted preprocessing for the final per-fold training below:
# scaled numeric columns plus one-hot encoded categorical columns.
ct = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
    ]
)

In [21]:
# Train one model per fold with the tuned hyperparameters. Collect the
# held-out predictions keyed by Property_ID and one test prediction vector
# per fold.
test_scores = []
final_valid_preds = {}

for fold in range(5):
    # Rows labelled with this fold are held out; all other rows are trained on.
    xtrain = train_df.loc[train_df["kfold"] != fold].reset_index(drop=True)
    xvalid = train_df.loc[train_df["kfold"] == fold].reset_index(drop=True)
    xtest = test_df.copy()

    ytrain = xtrain["Habitability_score"]
    yvalid = xvalid["Habitability_score"]
    valid_ids = xvalid["Property_ID"]

    # Fit the preprocessing on the training part only, then apply it to the
    # held-out fold and to the test set.
    xtrain = ct.fit_transform(xtrain[useful_features])
    xvalid = ct.transform(xvalid[useful_features])
    xtest = ct.transform(xtest[useful_features])

    model = XGBRegressor(
        random_state=13,
        n_estimators=7000,
        early_stopping_rounds=300,
        **study.best_params,
    )
    model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], verbose=3000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    test_scores.append(test_preds)
    # Reported score is 100 * R^2 on the held-out fold, floored at 0.
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, valid_preds))}")

    # save valid preds
    final_valid_preds.update(zip(valid_ids, valid_preds))

# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_preds = pd.DataFrame.from_dict(final_valid_preds, orient="index").reset_index()
final_valid_preds.columns = ["Property_ID", "pred_normal"]
final_valid_preds.to_csv(results_path / "xgb_valid_pred_1.csv", index=False)

[0]	validation_0-rmse:73.61825




[1689]	validation_0-rmse:5.86970
fold: 0, score: 82.23849493705758
[0]	validation_0-rmse:73.66002




[2454]	validation_0-rmse:5.97195
fold: 1, score: 81.90114314868508
[0]	validation_0-rmse:73.43671




[1757]	validation_0-rmse:5.91833
fold: 2, score: 82.95664505656029
[0]	validation_0-rmse:73.70668




[1982]	validation_0-rmse:6.11540
fold: 3, score: 81.27873223044335
[0]	validation_0-rmse:73.55989




[1975]	validation_0-rmse:6.06268
fold: 4, score: 81.76463015487047


In [22]:
# Average the per-fold test predictions: one column per fold, mean across
# columns. This rebinds `test_scores` from the list of fold predictions to the
# averaged vector, so run it once after the training cell.
test_scores = np.column_stack(test_scores).mean(axis=1)
print(test_scores.shape, test_scores[:5])

(10500,) [28.535273 80.17297  66.95925  72.50698  77.72035 ]


In [23]:
# Pair each test Property_ID with its fold-averaged prediction.
subm_df = make_sub_file(test_ids=test_df["Property_ID"], preds=test_scores)
subm_df.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,28.535273
1,0x8787,80.172974
2,0x6c17,66.959251
3,0x9dbd,72.506981
4,0xbfde,77.720352


In [24]:
# Write the same submission twice: once next to the dataset, once with the
# other prediction files in the results folder.
for out_path in (dataset_path / "subm_v3.csv", results_path / "xgb_test_pred_1.csv"):
    subm_df.to_csv(out_path, index=False)