https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
dataset_path = Path("/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/dataset/")
print([x.name for x in dataset_path.iterdir()])

['train.csv', 'train_v2.csv', 'subm_v3.csv', 'train_v1.csv', 'test_v1.csv', 'test.csv', 'sample_submission.csv']


In [3]:
results_path = Path("/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results")
results_path.as_posix()

'/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results'

In [4]:
train_df = pd.read_csv(dataset_path/"train_v2.csv")
test_df = pd.read_csv(dataset_path/"test_v1.csv")
sample_submission_df = pd.read_csv(dataset_path/"sample_submission.csv")

print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [5]:
def make_sub_file(test_ids, preds):
    return pd.DataFrame.from_dict({"Property_ID": test_ids, "Habitability_score": preds})

## data preprocessing

In [6]:
useful_features = [col for col in train_df.columns if col not in ["Property_ID", "Habitability_score", "kfold"]]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

## model build - hyperparameter tuning

In [25]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.ensemble import RandomForestRegressor
import optuna

from sklearn.metrics import r2_score, mean_squared_error

In [8]:
cat_cols = ["Property_Type", "Furnishing", "Power_Backup", "Water_Supply", "Crime_Rate", "Dust_and_Noise"]
num_cols = ["Property_Area", "Number_of_Windows", "Number_of_Doors", "Frequency_of_Powercuts", "Traffic_Density_Score", "Air_Quality_Index", "Neighborhood_Review"]

In [9]:
ct = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
])

In [23]:
def objective(trial, transform):
    scores = []

    for fold in range(5):
        max_depth = trial.suggest_int("max_depth", 2, 9)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 11)
        max_features = trial.suggest_float("max_features", 0.2, 1.)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

        xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
        xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.loc[:, "Habitability_score"]
        yvalid = xvalid.loc[:, "Habitability_score"]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        ct = ColumnTransformer([
            ("num", transform, num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.transform(xvalid)

        model = RandomForestRegressor(random_state=13,
                                      n_estimators=500,
                                      **study.best_params
                                      )
        model.fit(xtrain, ytrain)
        preds_valid = model.predict(xvalid)
        scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))

    return np.mean(scores)

In [None]:
obj_func = lambda trial: objective(trial, StandardScalar())

In [12]:
transforms = [StandardScaler(), FunctionTransformer(np.log1p)]

In [27]:
obj_func = lambda trial: objective(trial, StandardScaler())
study = optuna.create_study(direction="maximize")
study.optimize(obj_func, n_trials=10)

[32m[I 2022-08-21 06:46:49,815][0m A new study created in memory with name: no-name-26ae9d33-87ac-4cf7-b062-10fc0e098591[0m
[33m[W 2022-08-21 06:46:49,876][0m Trial 0 failed because of the following error: ValueError('No trials are completed yet.')[0m
Traceback (most recent call last):
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_26017/1564889156.py", line 1, in <lambda>
    obj_func = lambda trial: objective(trial, StandardScaler())
  File "/tmp/ipykernel_26017/1159746422.py", line 29, in objective
    **study.best_params
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/study.py", line 60, in best_params
    return self.best_trial.params
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/study.py", line 97, in best_trial
    return copy.deepcopy(self._storage.get_best_trial(self._study

ValueError: No trials are completed yet.

In [46]:
def predict_save(transforms, model_name):
    for enum, transform in enumerate(transforms):
        objective = smart_objective(transform=transform)
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)
        
        final_valid_preds = {}
        test_scores = []

        for fold in range(5):
            model = RandomForestRegressor(random_state=13,
                                          n_estimators=500,
                                          **study.best_params
                                          )

            xtrain = train_df2[train_df2.kfold != fold].reset_index(drop=True)
            xvalid = train_df2[train_df2.kfold == fold].reset_index(drop=True)
            ytrain = xtrain.Habitability_score
            yvalid = xvalid.Habitability_score

            valid_ids = xvalid.Property_ID

            xtrain = xtrain[useful_features]
            xvalid = xvalid[useful_features]

            # transform features to log
            ct = ColumnTransformer([
                ("num", FunctionTransformer(np.log1p, validate=True), num_cols),
                ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
            ])

            xtrain = ct.fit_transform(xtrain)
            xvalid = ct.transform(xvalid)
            xtest = ct.transform(test_df2[useful_features])

            model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
            preds_valid = model.predict(xvalid)
            print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, preds_valid))}")

            final_valid_preds.update(dict(zip(valid_ids, preds_valid)))
            test_scores.append(model.predict(xtest))

        final_valid_preds = pd.DataFrame.from_dict(final_valid_preds, orient="index").reset_index()
        final_valid_preds.columns = ["Property_ID", "pred_log"]
        final_valid_preds.to_csv(results_path/f"{model_name}_pred_2.csv", index=False)

        test_scores = np.mean(np.column_stack(test_scores), axis=1)
        subm_df = make_sub_file(test_df2.Property_ID, test_scores)
        subm_df.to_csv(results_path/f"{model_name}_pred_2.csv", index=False)
    
    print(f"-------------------- Finished {enum} transform --------------------------------")

In [47]:
transforms = [StandardScaler(), FunctionTransformer(np.log1p)]
predict_save(transforms=transforms, model_name="random_forest")

[32m[I 2022-08-20 21:35:10,693][0m A new study created in memory with name: no-name-4922a434-4989-4e6a-aa1a-ac043bf9db65[0m
[33m[W 2022-08-20 21:35:10,754][0m Trial 0 failed because of the following error: ValueError('No trials are completed yet.')[0m
Traceback (most recent call last):
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_20418/2903063224.py", line 30, in objective
    **study.best_params
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/study.py", line 60, in best_params
    return self.best_trial.params
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/study.py", line 97, in best_trial
    return copy.deepcopy(self._storage.get_best_trial(self._study_id))
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/storages/_in_memory.py", line 311, in get_best_

ValueError: No trials are completed yet.