https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [32]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [33]:
dataset_path = Path("/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/dataset/")
print([x.name for x in dataset_path.iterdir()])

['train.csv', 'train_v2.csv', 'subm_v3.csv', 'train_v1.csv', 'test_v1.csv', 'test.csv', 'sample_submission.csv']


In [34]:
results_path = Path("/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results")
results_path.as_posix()

'/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results'

In [35]:
train_df = pd.read_csv(dataset_path/"train_v2.csv")
test_df = pd.read_csv(dataset_path/"test_v1.csv")
sample_submission_df = pd.read_csv(dataset_path/"sample_submission.csv")

print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [36]:
def make_sub_file(test_ids, preds):
    return pd.DataFrame.from_dict({"Property_ID": test_ids, "Habitability_score": preds})

## data preprocessing

In [37]:
useful_features = [col for col in train_df.columns if col not in ["Property_ID", "Habitability_score", "kfold"]]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

In [38]:
cat_cols = ["Property_Type", "Furnishing", "Power_Backup", "Water_Supply", "Crime_Rate", "Dust_and_Noise"]
num_cols = ["Property_Area", "Number_of_Windows", "Number_of_Doors", "Frequency_of_Powercuts", "Traffic_Density_Score", "Air_Quality_Index", "Neighborhood_Review"]

## Feature engineering

1. log transformation
2. polynomial features
3. target encoding

### log transformation

In [39]:
import optuna
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.ensemble import RandomForestRegressor
import optuna

from sklearn.metrics import r2_score, mean_squared_error

In [41]:
train_df2 = train_df.copy(deep=True)
test_df2 = test_df.copy(deep=True)

assert train_df2 is not train_df
assert test_df2 is not test_df

In [42]:
def objective(trial):
    preds_scores = []
    for fold in range(5):
        xtrain = train_df2[train_df2.kfold != fold].reset_index(drop=True)
        xvalid = train_df2[train_df2.kfold == fold].reset_index(drop=True)
        
        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score
        
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        # transform features to log
        ct = ColumnTransformer([
            ("num", FunctionTransformer(np.log1p, validate=True), num_cols),
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value"), cat_cols)
        ])
        
        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.transform(xvalid)
        
        # initialize suggestions for xgb params
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.3, log=True)
        reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.)
        max_depth = trial.suggest_int("max_depth", 2, 9)
        
        model = XGBRegressor(random_state=13,
                             learning_rate=learning_rate,
                             reg_lambda=reg_lambda,
                             reg_alpha=reg_alpha,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             max_depth=max_depth,
                             n_estimators=7000,
                             gpu_id=0,
                             predictor="gpu_predictor",
                             tree_method="gpu_hist",
                            )
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
        preds_valid = model.predict(xvalid)
        preds_scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))
        
    return np.mean(preds_scores)

In [43]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[32m[I 2022-08-20 21:42:47,258][0m A new study created in memory with name: no-name-0bc64eee-de0a-42aa-97ce-637d29e1a4ce[0m
[33m[W 2022-08-20 21:42:47,268][0m Trial 0 failed because of the following error: TypeError("unknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got None.")[0m
Traceback (most recent call last):
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_26272/2713725519.py", line 19, in objective
    xtrain = ct.fit_transform(xtrain)
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 673, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/home/tharun/.conda/envs/fastai/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  F

TypeError: unknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got None.

In [None]:
print(study.best_params)

In [None]:
# optimized model out of log transforms

final_valid_preds = {}
test_scores = []

for fold in range(5):
    model = XGBRegressor(random_state=13,
                         n_estimators=7000,
                         **study.best_params)
    
    xtrain = train_df2[train_df2.kfold != fold].reset_index(drop=True)
    xvalid = train_df2[train_df2.kfold == fold].reset_index(drop=True)
    ytrain = xtrain.Habitability_score
    yvalid = xvalid.Habitability_score
    
    valid_ids = xvalid.Property_ID
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    # transform features to log
    ct = ColumnTransformer([
        ("num", FunctionTransformer(np.log1p, validate=True), num_cols),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value"), cat_cols)
    ])

    xtrain = ct.fit_transform(xtrain)
    xvalid = ct.transform(xvalid)
    xtest = ct.transform(test_df2[useful_features])
    
    model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
    preds_valid = model.predict(xvalid)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, preds_valid))}")
    
    final_valid_preds.update(dict(zip(valid_ids, preds_valid)))
    test_scores.append(model.predict(xtest))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds, orient="index").reset_index()
final_valid_preds.columns = ["Property_ID", "pred_log"]
final_valid_preds.to_csv(results_path/"xgb_valid_pred_2.csv", index=False)

test_scores = np.mean(np.column_stack(test_scores), axis=1)
subm_df = make_sub_file(test_df2.Property_ID, test_scores)
subm_df.to_csv(results_path/"xgb_test_pred_2.csv", index=False)

### polynomial features

In [None]:
train_df3 = train_df.copy(deep=True)
test_df3 = test_df.copy(deep=True)
assert train_df3 is not train_df
assert test_df3 is not test_df

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion

In [None]:
def objective(trial):
    preds_scores = []
    for fold in range(5):
        xtrain = train_df2[train_df2.kfold != fold].reset_index(drop=True)
        xvalid = train_df2[train_df2.kfold == fold].reset_index(drop=True)
        
        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score
        
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        # transform features to poly and add them to original
        ct = ColumnTransformer([
            ("num", StandardScaler(), num_cols)
        ])
        pt = ColumnTransformer([
            ("num", PolynomialFeatures(degree=3, interaction_only=True, include_bias=False), num_cols),
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value"), cat_cols)
        ])
        
        ft = FeatureUnion([
            ("orig", ct),
            ("poly", pt)
        ])
        
        xtrain = ft.fit_transform(xtrain)
        xvalid = ft.transform(xvalid)
        
        # initialize suggestions for xgb params
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.3, log=True)
        reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.)
        max_depth = trial.suggest_int("max_depth", 2, 9)
        
        model = XGBRegressor(random_state=13,
                             learning_rate=learning_rate,
                             reg_lambda=reg_lambda,
                             reg_alpha=reg_alpha,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             max_depth=max_depth,
                             n_estimators=7000,
                             gpu_id=0,
                             predictor="gpu_predictor",
                             tree_method="gpu_hist",
                            )
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
        preds_valid = model.predict(xvalid)
        preds_scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))
        
    return np.mean(preds_scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
# optimized model out of log transforms

final_valid_preds = {}
test_scores = []

for fold in range(5):
    model = XGBRegressor(random_state=13,
                         n_estimators=7000,
                         **study.best_params)
    
    xtrain = train_df3[train_df3.kfold != fold].reset_index(drop=True)
    xvalid = train_df3[train_df3.kfold == fold].reset_index(drop=True)
    ytrain = xtrain.Habitability_score
    yvalid = xvalid.Habitability_score
    
    valid_ids = xvalid.Property_ID
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    # transform features to poly and add them to original
    ct = ColumnTransformer([
        ("num", StandardScaler(), num_cols)
    ])
    pt = ColumnTransformer([
        ("num", PolynomialFeatures(degree=3, interaction_only=True, include_bias=False), num_cols),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value"), cat_cols)
    ])

    ft = FeatureUnion([
        ("orig", ct),
        ("poly", pt)
    ])

    xtrain = ft.fit_transform(xtrain)
    xvalid = ft.transform(xvalid)
    xtest = ft.transform(test_df3[useful_features])
    
    model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
    preds_valid = model.predict(xvalid)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, preds_valid))}")
    
    final_valid_preds.update(dict(zip(valid_ids, preds_valid)))
    test_scores.append(model.predict(xtest))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds, orient="index").reset_index()
final_valid_preds.columns = ["Property_ID", "pred_poly"]
final_valid_preds.to_csv(results_path/"xgb_valid_pred_3.csv", index=False)

test_scores = np.mean(np.column_stack(test_scores), axis=1)
subm_df = make_sub_file(test_df3.Property_ID, test_scores)
subm_df.to_csv(results_path/"xgb_test_pred_3.csv", index=False)

### target encoding

## merging XGB Blends

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
train_pred_1 = pd.read_csv(results_path/"xgb_valid_pred_1.csv")
train_pred_1.columns = ["Property_ID", "pred_1"]
train_pred_2 = pd.read_csv(results_path/"xgb_valid_pred_2.csv")
train_pred_2.columns = ["Property_ID", "pred_2"]
train_pred_3 = pd.read_csv(results_path/"xgb_valid_pred_3.csv")
train_pred_3.columns = ["Property_ID", "pred_3"]

test_pred_1 = pd.read_csv(results_path/"xgb_test_pred_1.csv")
test_pred_1.columns = ["Property_ID", "pred_1"]
test_pred_2 = pd.read_csv(results_path/"xgb_test_pred_2.csv")
test_pred_2.columns = ["Property_ID", "pred_2"]
test_pred_3 = pd.read_csv(results_path/"xgb_test_pred_3.csv")
test_pred_3.columns = ["Property_ID", "pred_3"]

In [None]:
train_df_final = train_df.merge(train_pred_1, on="Property_ID", how="left")
train_df_final = train_df_final.merge(train_pred_2, on="Property_ID", how="left")
train_df_final = train_df_final.merge(train_pred_3, on="Property_ID", how="left")
train_df_final.head()

In [None]:
test_df_final = test_df.merge(test_pred_1, on="Property_ID", how="left")
test_df_final = test_df_final.merge(test_pred_2, on="Property_ID", how="left")
test_df_final = test_df_final.merge(test_pred_3, on="Property_ID", how="left")
test_df_final.head()

In [None]:
useful_cols = ["pred_1", "pred_2", "pred_3"]

In [None]:
test_scores = []
for fold in range(5):
    xtrain = train_df_final[train_df_final.kfold != fold].reset_index(drop=True)
    xvalid = train_df_final[train_df_final.kfold == fold].reset_index(drop=True)
    xtest = test_df_final[useful_cols]
    
    ytrain = xtrain.Habitability_score
    yvalid = xvalid.Habitability_score
    
    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]
    
    lin_reg = LinearRegression()
    lin_reg.fit(xtrain, ytrain)
    valid_preds = lin_reg.predict(xvalid)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, valid_preds))}")
    test_scores.append(lin_reg.predict(xtest))
test_scores = np.mean(np.column_stack(test_scores), axis=1)

In [None]:
lin_blend_df = make_sub_file(test_df_final.Property_ID, test_scores)
lin_blend_df.head()

In [None]:
lin_blend_df.to_csv(results_path/"xgb_blend_final_with_lr.csv", index=False)

In [None]:
train_df_final.to_csv(dataset_path/"train_xgb_blend_final.csv", index=False)
test_df_final.to_csv(dataset_path/"test_xgb_blend_final.csv", index=False)