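Competition page: [HackerEarth "Get a Room" ML Hackathon — Identify the habitability score of a property](https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/)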

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Directory holding the competition CSVs. Set the HE_HABITABILITY_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original location is used.
dataset_path = Path(
    os.environ.get(
        "HE_HABITABILITY_DATA_DIR",
        "/home/tharun/projects/data_science_competitions/datasets/he_habitability/",
    )
)
# Quick sanity check: list the files that are available in the dataset directory.
print([x.name for x in dataset_path.iterdir()])

['train_v2.csv', 'test_xgb_blend_final.csv', 'train.csv', 'train_v1.csv', 'subm_v1.csv', 'train_xgb_blend_final.csv', 'subm_v2.csv', 'sample_submission.csv', 'subm_v3.csv', 'test.csv', 'test_v1.csv']


In [3]:
# Directory for the blending results. Set the HE_HABITABILITY_RESULTS_DIR
# environment variable to redirect it; when it is unset the original location
# is used.
results_path = Path(
    os.environ.get(
        "HE_HABITABILITY_RESULTS_DIR",
        "/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results",
    )
)
results_path.as_posix()

'/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results'

In [4]:
# Read the three CSVs from the dataset directory in one pass: the training
# table (train_v2.csv), the test table (test_v1.csv) and the submission template.
train_df, test_df, sample_submission_df = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in ("train_v2.csv", "test_v1.csv", "sample_submission.csv")
)

# Show (rows, columns) of the training and test tables.
print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [5]:
def make_sub_file(test_ids, preds):
    """Assemble a submission table from property ids and predicted scores.

    Parameters
    ----------
    test_ids : array-like
        Values for the ``Property_ID`` column.
    preds : array-like
        Values for the ``Habitability_score`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``Property_ID`` and ``Habitability_score``,
        in that order.
    """
    submission_columns = {
        "Property_ID": test_ids,
        "Habitability_score": preds,
    }
    return pd.DataFrame(data=submission_columns)

## data preprocessing

In [6]:
# Model inputs: every training column except the identifier, the target and
# the fold label.
useful_features = [
    column
    for column in train_df.columns
    if column not in ("Property_ID", "Habitability_score", "kfold")
]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

## model build - hyperparameter tuning

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
import optuna

from sklearn.metrics import r2_score, mean_squared_error

In [8]:
# Columns handled by the one-hot encoder in the preprocessing step.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]

# Columns handled by the standard scaler in the preprocessing step.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

In [9]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level dropped, unseen categories ignored).
ct = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", drop="first"),
            cat_cols,
        ),
    ]
)

In [12]:
def objective(trial):
    """Optuna objective: mean cross-validated R^2 (in percent) of a random forest.

    One set of hyperparameters is sampled from ``trial`` and evaluated on the
    five folds labelled by ``train_df.kfold``. For every fold the preprocessing
    (scaling of ``num_cols``, one-hot encoding of ``cat_cols``) is fitted on the
    training part only and then applied to the held-out part.

    Reads the notebook globals ``train_df``, ``useful_features``, ``num_cols``
    and ``cat_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean over the five folds of ``max(0, 100 * R^2)`` on the held-out part.
    """
    # Sample the hyperparameters once per trial. They do not depend on the
    # fold, and Optuna hands back the already-sampled value when the same
    # parameter name is requested again within a trial, so asking for them
    # inside the fold loop only repeated work.
    criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "poisson"])
    max_depth = trial.suggest_int("max_depth", 2, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 3)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", 1.])
    max_samples = trial.suggest_float("max_samples", 0.1, 1.)

    scores = []

    for fold in range(5):
        # Rows labelled with the current fold are held out; the rest is used for fitting.
        in_valid = train_df.kfold == fold

        xtrain = train_df.loc[~in_valid, useful_features]
        xvalid = train_df.loc[in_valid, useful_features]
        ytrain = train_df.loc[~in_valid, "Habitability_score"]
        yvalid = train_df.loc[in_valid, "Habitability_score"]

        # A fresh transformer per fold so that scaling statistics and the
        # category vocabulary come from the training part only.
        ct = ColumnTransformer([
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.transform(xvalid)

        model = RandomForestRegressor(random_state=13,
                                      n_estimators=500,
                                      criterion=criterion,
                                      max_features=max_features,
                                      min_samples_leaf=min_samples_leaf,
                                      min_samples_split=min_samples_split,
                                      max_depth=max_depth,
                                      max_samples=max_samples,
                                      # Each trial fits 500 trees on each of 5 folds;
                                      # build them on all available cores.
                                      n_jobs=-1
                                      )
        model.fit(xtrain, ytrain)
        preds_valid = model.predict(xvalid)

        # Score as a percentage; a negative R^2 is clipped to 0.
        scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))

    return np.mean(scores)

In [None]:
# Seed the sampler so that re-running the notebook from a fresh kernel proposes
# the same sequence of hyperparameters (assuming the trials run sequentially,
# as they do here). The seed matches the random_state used for the forest.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=10)

[32m[I 2022-08-16 12:23:26,933][0m A new study created in memory with name: no-name-e18ddbda-ac0f-4297-8b69-ec3626c1fd09[0m
[32m[I 2022-08-16 12:23:40,653][0m Trial 0 finished with value: 77.83080978482191 and parameters: {'criterion': 'squared_error', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_samples': 0.21783416276834527}. Best is trial 0 with value: 77.83080978482191.[0m
[32m[I 2022-08-16 12:23:47,927][0m Trial 1 finished with value: 41.247115212199404 and parameters: {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_samples': 0.48005140416924164}. Best is trial 0 with value: 77.83080978482191.[0m
[32m[I 2022-08-16 12:23:55,861][0m Trial 2 finished with value: 34.192389569791246 and parameters: {'criterion': 'poisson', 'max_depth': 2, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_samples': 0.5324891886325474}. Best is tr