https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [2]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
# Directory that holds the competition CSV files.
# Set HE_HABITABILITY_DATA_DIR to run the notebook on another machine; when the
# variable is unset the original local path is used, so existing runs are unaffected.
dataset_path = Path(
    os.environ.get(
        "HE_HABITABILITY_DATA_DIR",
        "/home/tharun/projects/data_science_competitions/datasets/he_habitability/",
    )
)
# Quick sanity check of what is available in the data directory.
print([x.name for x in dataset_path.iterdir()])

['train_v2.csv', 'train.csv', 'train_v1.csv', 'subm_v1.csv', 'subm_v2.csv', 'sample_submission.csv', 'test.csv', 'test_v1.csv']


In [73]:
# Directory where out-of-fold and test predictions are written for later blending.
# Set HE_HABITABILITY_RESULTS_DIR to redirect the outputs; when the variable is
# unset the original local path is used, so existing runs are unaffected.
results_path = Path(
    os.environ.get(
        "HE_HABITABILITY_RESULTS_DIR",
        "/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results",
    )
)
results_path.as_posix()

'/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results'

In [4]:
# Read the three competition tables from the data directory in one pass.
# train_v2.csv is expected to carry the `kfold` column used by the CV loops below.
train_df, test_df, sample_submission_df = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in ("train_v2.csv", "test_v1.csv", "sample_submission.csv")
)

print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [5]:
def make_sub_file(test_ids, preds):
    """Assemble a submission table from test ids and their predictions.

    Parameters
    ----------
    test_ids : sequence or pandas.Series
        Identifiers placed in the ``Property_ID`` column.
    preds : sequence or array-like
        Predicted values placed in the ``Habitability_score`` column.

    Returns
    -------
    pandas.DataFrame
        Two-column frame: ``Property_ID`` followed by ``Habitability_score``.
    """
    submission_columns = {
        "Property_ID": test_ids,
        "Habitability_score": preds,
    }
    return pd.DataFrame.from_dict(submission_columns)

## data preprocessing

In [6]:
# Model inputs: every training column except the row id, the target and the fold label.
non_feature_cols = ("Property_ID", "Habitability_score", "kfold")
useful_features = [name for name in train_df.columns if name not in non_feature_cols]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

## model build - hyperparameter tuning

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
import optuna

from sklearn.metrics import r2_score, mean_squared_error

In [8]:
# Columns routed to the one-hot encoder.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]
# Columns routed to the standard scaler.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

In [9]:
# Preprocessing: standardise the numeric columns and one-hot encode the categorical ones
# (first level dropped, categories unseen at fit time ignored).
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
ct = ColumnTransformer([numeric_step, categorical_step])

In [81]:
def objective(trial):
    """Optuna objective: mean 5-fold validation score of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``reg_lambda``, ``reg_alpha``,
        ``subsample``, ``colsample_bytree`` and ``max_depth``.

    Returns
    -------
    float
        Mean over folds 0-4 of ``max(0, 100 * R^2)`` on the held-out fold.

    Notes
    -----
    Reads the module-level ``train_df`` (must contain ``kfold`` and
    ``Habitability_score``), ``useful_features``, ``num_cols`` and ``cat_cols``.
    NOTE(review): the held-out fold is used both for early stopping and for
    scoring, so the returned score is likely somewhat optimistic.
    """
    # Sample the search space once per trial. The hyperparameters do not depend on
    # the fold, so asking for them inside the fold loop was loop-invariant work.
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`
    # and matches how `learning_rate` was already sampled.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100., log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100., log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.),
        "max_depth": trial.suggest_int("max_depth", 1, 7),
    }

    scores = []

    for fold in range(5):
        xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
        xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.loc[:, "Habitability_score"]
        yvalid = xvalid.loc[:, "Habitability_score"]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        # A fresh transformer per fold: scaling statistics and category levels are
        # learned from the training part only and then applied to the held-out part.
        fold_ct = ColumnTransformer([
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        xtrain = fold_ct.fit_transform(xtrain)
        xvalid = fold_ct.transform(xvalid)

        # n_estimators is only an upper bound; early stopping decides the real count.
        model = XGBRegressor(random_state=13,
                             tree_method="gpu_hist",
                             gpu_id=0,
                             predictor="gpu_predictor",
                             n_estimators=11000,
                             **params)
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
        preds_valid = model.predict(xvalid)
        # Score is R^2 as a percentage, floored at 0.
        scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))

    return np.mean(scores)

In [82]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts (an unseeded study explores a different path on every run).
# The seed value mirrors the random_state used for the models.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-08-13 19:05:01,981][0m A new study created in memory with name: no-name-4db79f4c-1792-4531-a8a3-575622a7a6d2[0m
[32m[I 2022-08-13 19:05:47,974][0m Trial 0 finished with value: 79.54365753826991 and parameters: {'learning_rate': 0.01461442390345851, 'reg_lambda': 0.015503209625709337, 'reg_alpha': 5.766345824204118e-06, 'subsample': 0.6974626931906663, 'colsample_bytree': 0.4508465774265842, 'max_depth': 4}. Best is trial 0 with value: 79.54365753826991.[0m
[32m[I 2022-08-13 19:06:19,911][0m Trial 1 finished with value: 80.29435535644151 and parameters: {'learning_rate': 0.020293912423348236, 'reg_lambda': 3.479413048328403e-07, 'reg_alpha': 0.22631198855826626, 'subsample': 0.7226262563691809, 'colsample_bytree': 0.42125372599638367, 'max_depth': 5}. Best is trial 1 with value: 80.29435535644151.[0m
[32m[I 2022-08-13 19:06:52,767][0m Trial 2 finished with value: 80.18018573694144 and parameters: {'learning_rate': 0.027067112247632225, 'reg_lambda': 62.94530127270

In [83]:
# Hyperparameters of the best trial; these are unpacked into the final XGBRegressor below.
study.best_params

{'learning_rate': 0.011052384287366561,
 'reg_lambda': 1.595786085571993e-07,
 'reg_alpha': 1.4830912184615704e-06,
 'subsample': 0.9105041719605118,
 'colsample_bytree': 0.9841150660618929,
 'max_depth': 7}

In [84]:
# Unfitted preprocessing transformer for the final training loop; it is re-fitted
# on the training part of every fold there.
ct = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
    ]
)

In [85]:
# Final 5-fold training with the tuned hyperparameters.
# Collects one test-set prediction vector per fold plus the out-of-fold
# prediction of every training row (keyed by Property_ID) for later blending.
test_scores = []
final_valid_preds = {}

for fold in range(5):
    train_part = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid_part = train_df[train_df.kfold == fold].reset_index(drop=True)

    ytrain = train_part["Habitability_score"]
    yvalid = valid_part["Habitability_score"]
    valid_ids = valid_part["Property_ID"]

    # Fit the preprocessing on the training part only, then apply it unchanged
    # to the held-out fold and to the test set.
    xtrain = ct.fit_transform(train_part[useful_features])
    xvalid = ct.transform(valid_part[useful_features])
    xtest = ct.transform(test_df[useful_features])

    # n_estimators is an upper bound; early stopping on the held-out fold picks the size.
    model = XGBRegressor(random_state=13, n_estimators=7000, **study.best_params)
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=300,
        verbose=3000,
    )

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    test_scores.append(test_preds)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, valid_preds))}")

    # Remember this fold's out-of-fold predictions by property id.
    final_valid_preds.update(zip(valid_ids, valid_preds))

# Turn the id -> prediction mapping into a two-column table and persist it.
final_valid_preds = pd.DataFrame.from_dict(final_valid_preds, orient="index").reset_index()
final_valid_preds.columns = ["Property_ID", "pred_normal"]
final_valid_preds.to_csv(results_path / "xgb_valid_pred_1.csv", index=False)

[0]	validation_0-rmse:73.54119
[1717]	validation_0-rmse:5.86304
fold: 0, score: 82.28625189920538
[0]	validation_0-rmse:73.58345
[1979]	validation_0-rmse:5.98446
fold: 1, score: 81.81179035613894
[0]	validation_0-rmse:73.36105
[1754]	validation_0-rmse:5.92585
fold: 2, score: 82.90830611651742
[0]	validation_0-rmse:73.63058
[1668]	validation_0-rmse:6.11740
fold: 3, score: 81.25185617420708
[0]	validation_0-rmse:73.48405
[1920]	validation_0-rmse:6.07153
fold: 4, score: 81.70426843012044


In [86]:
# Average the per-fold test predictions into a single prediction per test row.
# Guarded so that re-running this cell is a no-op: once `test_scores` has already
# been reduced to a 1-D array, stacking and averaging it again would silently
# collapse all predictions into a single value.
if isinstance(test_scores, list):
    test_scores = np.column_stack(test_scores)
    test_scores = np.mean(test_scores, axis=1)
print(test_scores.shape, test_scores[:5])

(10500,) [28.60104  80.304886 67.08022  72.995995 78.20897 ]


In [87]:
# Pair every test property id with its fold-averaged prediction.
subm_df = make_sub_file(test_ids=test_df["Property_ID"], preds=test_scores)
subm_df.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,28.60104
1,0x8787,80.304886
2,0x6c17,67.080223
3,0x9dbd,72.995995
4,0xbfde,78.208969


In [88]:
# Write the submission twice: next to the datasets (for upload) and into the
# blending results folder (as this model's test predictions).
for destination in (dataset_path / "subm_v3.csv", results_path / "xgb_test_pred_1.csv"):
    subm_df.to_csv(destination, index=False)