# Habitability score prediction: XGBoost feature-engineering blends

Competition page: https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Directory holding the competition CSVs. The location can be overridden with the
# HE_HABITABILITY_DATA_DIR environment variable so the notebook is not tied to one
# machine; the original absolute path is kept as the default.
dataset_path = Path(
    os.environ.get(
        "HE_HABITABILITY_DATA_DIR",
        "/home/tharun/projects/data_science_competitions/datasets/he_habitability/",
    )
)
print([x.name for x in dataset_path.iterdir()])

['train_v2.csv', 'train.csv', 'train_v1.csv', 'subm_v1.csv', 'subm_v2.csv', 'sample_submission.csv', 'subm_v3.csv', 'test.csv', 'test_v1.csv']


In [6]:
# Directory where per-model predictions and blends are read from / written to.
# Overridable through HE_HABITABILITY_RESULTS_DIR; the original absolute path is
# kept as the default so existing runs behave the same.
results_path = Path(
    os.environ.get(
        "HE_HABITABILITY_RESULTS_DIR",
        "/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results",
    )
)
results_path.as_posix()

'/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results'

In [3]:
# Read the training set, the test set and the sample submission from dataset_path.
train_df = pd.read_csv(dataset_path.joinpath("train_v2.csv"))
test_df = pd.read_csv(dataset_path.joinpath("test_v1.csv"))
sample_submission_df = pd.read_csv(dataset_path.joinpath("sample_submission.csv"))

# Quick sanity check on the row/column counts of both frames.
print(train_df.shape, test_df.shape)

(39500, 16) (10500, 14)


## utils

In [4]:
def make_sub_file(test_ids, preds):
    """Build a submission frame pairing each property id with its predicted score.

    Args:
        test_ids: property identifiers, one per prediction.
        preds: predicted scores, in the same order as ``test_ids``.

    Returns:
        A DataFrame with the columns ``Property_ID`` and ``Habitability_score``.
    """
    submission_columns = {"Property_ID": test_ids, "Habitability_score": preds}
    return pd.DataFrame(submission_columns)

## data preprocessing

In [5]:
# Model inputs: every column except the identifier, the target and the fold label.
excluded_cols = ("Property_ID", "Habitability_score", "kfold")
useful_features = [name for name in train_df.columns if name not in excluded_cols]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

In [8]:
# Columns routed to the categorical ("cat") branch of the column transformers.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]
# Columns routed to the numeric ("num") branch of the column transformers.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

## Feature engineering

1. log transformation
2. polynomial features
3. target encoding

### log transformation

In [21]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

In [14]:
# Deep copies for the log-transform section (DataFrame.copy() is deep by default).
train_df2, test_df2 = train_df.copy(), test_df.copy()

# Guard against accidentally aliasing the loaded frames.
assert train_df2 is not train_df
assert test_df2 is not test_df

In [83]:
def objective(trial):
    """Optuna objective for XGBoost on log1p-transformed numeric features.

    For each of the 5 folds in ``train_df2.kfold`` the numeric columns are passed
    through ``np.log1p`` and the categorical columns are one-hot encoded (fitted on
    the training part of the fold only), an XGBoost regressor is trained with early
    stopping on the validation part, and the validation R^2 is recorded.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.

    Returns:
        Mean over the folds of ``max(0, 100 * r2_score)``.
    """
    # NOTE(review): ColumnTransformer, XGBRegressor and r2_score are not imported in
    # any cell visible in this notebook - confirm they are in scope before running.

    # Hyperparameters belong to the trial, not to a fold: sample them once up front
    # instead of re-requesting the same named values inside every fold iteration.
    xgb_params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.3, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0),
        "subsample": trial.suggest_float("subsample", 0.1, 1.),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.),
        "max_depth": trial.suggest_int("max_depth", 2, 9),
    }

    preds_scores = []
    for fold in range(5):
        xtrain = train_df2[train_df2.kfold != fold].reset_index(drop=True)
        xvalid = train_df2[train_df2.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        # transform features to log
        ct = ColumnTransformer([
            ("num", FunctionTransformer(np.log1p, validate=True), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.transform(xvalid)

        model = XGBRegressor(random_state=13,
                             n_estimators=7000,
                             gpu_id=0,
                             predictor="gpu_predictor",
                             tree_method="gpu_hist",
                             **xgb_params)
        # Early stopping monitors the same validation fold that is scored below.
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
        preds_valid = model.predict(xvalid)
        # Score is R^2 as a percentage, floored at 0.
        preds_scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))

    return np.mean(preds_scores)

In [84]:
# Seed the sampler so the sequence of sampled hyperparameters is repeatable across
# kernel restarts (an unseeded sampler gives different best_params on every run).
# 13 matches the random_state used for the XGBoost models.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-08-13 21:24:16,328][0m A new study created in memory with name: no-name-0712eb68-4a28-4c9e-b547-dd9b6ab1c508[0m
[32m[I 2022-08-13 21:24:38,950][0m Trial 0 finished with value: 79.80766549668762 and parameters: {'learning_rate': 0.06366361872507531, 'reg_lambda': 77.71767078844377, 'reg_alpha': 86.62167394331276, 'subsample': 0.5412682954403389, 'colsample_bytree': 0.5001003206109763, 'max_depth': 4}. Best is trial 0 with value: 79.80766549668762.[0m
[32m[I 2022-08-13 21:24:55,085][0m Trial 1 finished with value: 78.81030936069513 and parameters: {'learning_rate': 0.11074205900227456, 'reg_lambda': 84.51596166954033, 'reg_alpha': 63.006637392806944, 'subsample': 0.6335608288311084, 'colsample_bytree': 0.5736665250988356, 'max_depth': 3}. Best is trial 0 with value: 79.80766549668762.[0m
[32m[I 2022-08-13 21:25:02,230][0m Trial 2 finished with value: 74.05604672507695 and parameters: {'learning_rate': 0.2937990275877612, 'reg_lambda': 62.77457332682678, 'reg_alpha'

In [85]:
# Show the best hyperparameters found by the study.
print(study.best_params)

{'learning_rate': 0.010475389437198215, 'reg_lambda': 2.311778762993713, 'reg_alpha': 5.706880743706972, 'subsample': 0.950259109988838, 'colsample_bytree': 0.7644084809019013, 'max_depth': 9}


In [86]:
# Refit XGBoost with the tuned hyperparameters on the log-transformed features,
# one model per fold, then save the out-of-fold predictions and the fold-averaged
# test predictions for blending.
# NOTE(review): ColumnTransformer, XGBRegressor and r2_score are not imported in
# any cell visible in this notebook - confirm they are in scope before running.

final_valid_preds = {}
test_scores = []

for fold in range(5):
    fold_train = train_df2[train_df2.kfold != fold].reset_index(drop=True)
    fold_valid = train_df2[train_df2.kfold == fold].reset_index(drop=True)

    # log1p on the numeric columns, one-hot on the categorical ones; the
    # transformer is fitted on the training part of the fold only.
    log_ct = ColumnTransformer([
        ("num", FunctionTransformer(np.log1p, validate=True), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
    ])
    x_fit = log_ct.fit_transform(fold_train[useful_features])
    x_val = log_ct.transform(fold_valid[useful_features])
    x_tst = log_ct.transform(test_df2[useful_features])

    y_fit = fold_train.Habitability_score
    y_val = fold_valid.Habitability_score

    model = XGBRegressor(random_state=13, n_estimators=7000, **study.best_params)
    model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=300, verbose=False)

    preds_valid = model.predict(x_val)
    fold_score = max(0, 100 * r2_score(y_val, preds_valid))
    print(f"fold: {fold}, score: {fold_score}")

    # Keyed by Property_ID so the predictions can be joined back onto the train set.
    for prop_id, pred in zip(fold_valid.Property_ID, preds_valid):
        final_valid_preds[prop_id] = pred
    test_scores.append(model.predict(x_tst))

# One row per Property_ID with its out-of-fold prediction.
final_valid_preds = (
    pd.DataFrame.from_dict(final_valid_preds, orient="index")
    .reset_index()
    .set_axis(["Property_ID", "pred_log"], axis=1)
)
final_valid_preds.to_csv(results_path / "xgb_valid_pred_2.csv", index=False)

# Average the five per-fold test predictions row by row.
test_scores = np.column_stack(test_scores).mean(axis=1)
subm_df = make_sub_file(test_df2.Property_ID, test_scores)
subm_df.to_csv(results_path / "xgb_test_pred_2.csv", index=False)

fold: 0, score: 82.66720585636159
fold: 1, score: 82.33683516952252
fold: 2, score: 83.3705788274427
fold: 3, score: 81.84743390976598
fold: 4, score: 82.36448382448015


### polynomial features

In [87]:
# Deep copies for the polynomial-feature section (DataFrame.copy() is deep by default).
train_df3, test_df3 = train_df.copy(), test_df.copy()

# Guard against accidentally aliasing the loaded frames.
assert train_df3 is not train_df
assert test_df3 is not test_df

In [88]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion

In [89]:
def objective(trial):
    """Optuna objective for XGBoost on polynomial-augmented features.

    For each of the 5 folds in ``train_df3.kfold`` the feature matrix is the union
    of (a) standard-scaled numeric columns and (b) interaction-only polynomial
    terms up to degree 3 of the numeric columns plus one-hot encoded categorical
    columns. An XGBoost regressor is trained with early stopping on the validation
    part and the validation R^2 is recorded.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.

    Returns:
        Mean over the folds of ``max(0, 100 * r2_score)``.
    """
    # NOTE(review): ColumnTransformer, StandardScaler, XGBRegressor and r2_score are
    # not imported in any cell visible in this notebook - confirm they are in scope.

    # Hyperparameters belong to the trial, not to a fold: sample them once up front
    # instead of re-requesting the same named values inside every fold iteration.
    xgb_params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.3, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0),
        "subsample": trial.suggest_float("subsample", 0.1, 1.),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.),
        "max_depth": trial.suggest_int("max_depth", 2, 9),
    }

    preds_scores = []
    for fold in range(5):
        # Use this section's own copy (train_df3), matching the final fit for the
        # polynomial features, rather than the log section's train_df2.
        xtrain = train_df3[train_df3.kfold != fold].reset_index(drop=True)
        xvalid = train_df3[train_df3.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        # transform features to poly and add them to original
        ct = ColumnTransformer([
            ("num", StandardScaler(), num_cols)
        ])
        pt = ColumnTransformer([
            ("num", PolynomialFeatures(degree=3, interaction_only=True, include_bias=False), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
        ])

        ft = FeatureUnion([
            ("orig", ct),
            ("poly", pt)
        ])

        xtrain = ft.fit_transform(xtrain)
        xvalid = ft.transform(xvalid)

        model = XGBRegressor(random_state=13,
                             n_estimators=7000,
                             gpu_id=0,
                             predictor="gpu_predictor",
                             tree_method="gpu_hist",
                             **xgb_params)
        # Early stopping monitors the same validation fold that is scored below.
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
        preds_valid = model.predict(xvalid)
        # Score is R^2 as a percentage, floored at 0.
        preds_scores.append(max(0, 100 * r2_score(yvalid, preds_valid)))

    return np.mean(preds_scores)

In [90]:
# Seed the sampler so the sequence of sampled hyperparameters is repeatable across
# kernel restarts (an unseeded sampler gives different best_params on every run).
# 13 matches the random_state used for the XGBoost models.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-08-13 22:56:53,074][0m A new study created in memory with name: no-name-3c14fb91-b0de-4593-b693-c16fe92c7c30[0m
[32m[I 2022-08-13 22:57:15,686][0m Trial 0 finished with value: 80.94711065848135 and parameters: {'learning_rate': 0.06858367300306163, 'reg_lambda': 86.7297640740148, 'reg_alpha': 26.334570216689706, 'subsample': 0.773558586955264, 'colsample_bytree': 0.9474614317053142, 'max_depth': 7}. Best is trial 0 with value: 80.94711065848135.[0m
[32m[I 2022-08-13 22:57:26,233][0m Trial 1 finished with value: 77.37442214398723 and parameters: {'learning_rate': 0.0661886974509071, 'reg_lambda': 52.10186797205989, 'reg_alpha': 26.948743488097143, 'subsample': 0.13782744166103456, 'colsample_bytree': 0.8624842392897142, 'max_depth': 4}. Best is trial 0 with value: 80.94711065848135.[0m
[32m[I 2022-08-13 22:57:45,084][0m Trial 2 finished with value: 78.99702847311946 and parameters: {'learning_rate': 0.027968751475905978, 'reg_lambda': 73.709844198776, 'reg_alpha':

In [91]:
# Refit XGBoost with the tuned hyperparameters on the polynomial feature set,
# one model per fold, then save the out-of-fold predictions and the fold-averaged
# test predictions for blending.
# NOTE(review): ColumnTransformer, StandardScaler, XGBRegressor and r2_score are
# not imported in any cell visible in this notebook - confirm they are in scope.

final_valid_preds = {}
test_scores = []

for fold in range(5):
    fold_train = train_df3[train_df3.kfold != fold].reset_index(drop=True)
    fold_valid = train_df3[train_df3.kfold == fold].reset_index(drop=True)

    # Standard-scaled numerics side by side with interaction-only polynomial
    # numerics (up to degree 3) and one-hot encoded categoricals; everything is
    # fitted on the training part of the fold only.
    scaled_numeric = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
    ])
    poly_and_onehot = ColumnTransformer([
        ("num", PolynomialFeatures(degree=3, interaction_only=True, include_bias=False), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
    ])
    feature_union = FeatureUnion([
        ("orig", scaled_numeric),
        ("poly", poly_and_onehot),
    ])

    x_fit = feature_union.fit_transform(fold_train[useful_features])
    x_val = feature_union.transform(fold_valid[useful_features])
    x_tst = feature_union.transform(test_df3[useful_features])

    y_fit = fold_train.Habitability_score
    y_val = fold_valid.Habitability_score

    model = XGBRegressor(random_state=13, n_estimators=7000, **study.best_params)
    model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=300, verbose=False)

    preds_valid = model.predict(x_val)
    fold_score = max(0, 100 * r2_score(y_val, preds_valid))
    print(f"fold: {fold}, score: {fold_score}")

    # Keyed by Property_ID so the predictions can be joined back onto the train set.
    for prop_id, pred in zip(fold_valid.Property_ID, preds_valid):
        final_valid_preds[prop_id] = pred
    test_scores.append(model.predict(x_tst))

# One row per Property_ID with its out-of-fold prediction.
final_valid_preds = (
    pd.DataFrame.from_dict(final_valid_preds, orient="index")
    .reset_index()
    .set_axis(["Property_ID", "pred_poly"], axis=1)
)
final_valid_preds.to_csv(results_path / "xgb_valid_pred_3.csv", index=False)

# Average the five per-fold test predictions row by row.
test_scores = np.column_stack(test_scores).mean(axis=1)
subm_df = make_sub_file(test_df3.Property_ID, test_scores)
subm_df.to_csv(results_path / "xgb_test_pred_3.csv", index=False)

fold: 0, score: 82.5248713971776
fold: 1, score: 82.20347020699417
fold: 2, score: 83.33574620316186
fold: 3, score: 81.70825269624538
fold: 4, score: 82.27810324733395


### target encoding

_TODO: target encoding has not been implemented in this notebook yet._

## merging XGB Blends

In [92]:
from sklearn.linear_model import LinearRegression

In [93]:
# Out-of-fold (validation) predictions of the three XGBoost runs, with the
# prediction column renamed to pred_1 / pred_2 / pred_3.
train_pred_1 = pd.read_csv(results_path / "xgb_valid_pred_1.csv").set_axis(["Property_ID", "pred_1"], axis=1)
train_pred_2 = pd.read_csv(results_path / "xgb_valid_pred_2.csv").set_axis(["Property_ID", "pred_2"], axis=1)
train_pred_3 = pd.read_csv(results_path / "xgb_valid_pred_3.csv").set_axis(["Property_ID", "pred_3"], axis=1)

# Matching test-set predictions, renamed the same way.
test_pred_1 = pd.read_csv(results_path / "xgb_test_pred_1.csv").set_axis(["Property_ID", "pred_1"], axis=1)
test_pred_2 = pd.read_csv(results_path / "xgb_test_pred_2.csv").set_axis(["Property_ID", "pred_2"], axis=1)
test_pred_3 = pd.read_csv(results_path / "xgb_test_pred_3.csv").set_axis(["Property_ID", "pred_3"], axis=1)

In [94]:
# Attach the three out-of-fold prediction columns to the training frame.
# Left joins keep every training row.
train_df_final = (
    train_df
    .merge(train_pred_1, on="Property_ID", how="left")
    .merge(train_pred_2, on="Property_ID", how="left")
    .merge(train_pred_3, on="Property_ID", how="left")
)
train_df_final.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score,kfold,pred_1,pred_2,pred_3
0,0x21e3,Apartment,106,4.0,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86,71.98,4,73.04447,72.62665,72.73324
1,0x68d4,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55,71.2,4,62.92619,64.262856,65.53581
2,0x7d81,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81,71.39,4,75.65016,74.56163,74.587006
3,0x7a57,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34,31.46,2,40.68893,40.25178,40.432903
4,0x9409,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77,93.7,3,92.74415,93.0251,91.78649


In [95]:
# Attach the three test prediction columns to the test frame.
# Left joins keep every test row.
test_df_final = (
    test_df
    .merge(test_pred_1, on="Property_ID", how="left")
    .merge(test_pred_2, on="Property_ID", how="left")
    .merge(test_pred_3, on="Property_ID", how="left")
)
test_df_final.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,pred_1,pred_2,pred_3
0,0x6e93,Apartment,293,3.0,1,Unfurnished,0.0,No,Once in a day - Morning,7.28,Well above average,Medium,152.0,2.52,28.60104,28.989716,29.206625
1,0x8787,Apartment,586,4.0,1,Semi_Furnished,0.0,No,Once in a day - Evening,7.63,Well below average,Medium,92.0,4.16,80.304886,80.272766,80.04182
2,0x6c17,Container Home,305,1.0,2,Semi_Furnished,1.0,No,All time,5.39,Slightly above average,Medium,90.0,2.92,67.08022,67.36132,67.10294
3,0x9dbd,Apartment,258,2.0,1,Semi_Furnished,1.0,No,All time,7.53,Slightly below average,Medium,158.0,3.45,72.995995,71.91322,71.27663
4,0xbfde,Bungalow,3031,12.0,4,Fully Furnished,0.0,No,All time,8.79,Well above average,High,186.0,2.72,78.20897,77.1357,77.96796


In [96]:
# Inputs of the blender: the three XGBoost prediction columns.
useful_cols = [
    "pred_1",
    "pred_2",
    "pred_3",
]

In [97]:
# Blend the three XGBoost predictions with a linear regression fitted per fold,
# then average the per-fold test predictions.
# NOTE(review): r2_score is not imported in any cell visible in this notebook -
# confirm it is in scope before running.
test_scores = []
xtest = test_df_final[useful_cols]  # identical for every fold, so select it once

for fold in range(5):
    fold_train = train_df_final[train_df_final.kfold != fold].reset_index(drop=True)
    fold_valid = train_df_final[train_df_final.kfold == fold].reset_index(drop=True)

    lin_reg = LinearRegression().fit(fold_train[useful_cols], fold_train.Habitability_score)

    valid_preds = lin_reg.predict(fold_valid[useful_cols])
    fold_score = max(0, 100 * r2_score(fold_valid.Habitability_score, valid_preds))
    print(f"fold: {fold}, score: {fold_score}")
    test_scores.append(lin_reg.predict(xtest))

# Row-wise mean of the five per-fold test predictions.
test_scores = np.column_stack(test_scores).mean(axis=1)

fold: 0, score: 82.79427483265572
fold: 1, score: 82.52678763373781
fold: 2, score: 83.59708793312673
fold: 3, score: 82.06025054362807
fold: 4, score: 82.57622009884649


In [98]:
# Put the blended test predictions into the submission layout and preview them.
lin_blend_df = make_sub_file(test_df_final.Property_ID, test_scores)
lin_blend_df.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,28.36939
1,0x8787,80.332705
2,0x6c17,67.216178
3,0x9dbd,71.449529
4,0xbfde,77.479452


In [99]:
# Save the linear-regression blend to the results directory (index column omitted).
lin_blend_df.to_csv(results_path/"xgb_blend_final_with_lr.csv", index=False)

In [100]:
# Save the train/test frames with the pred_1..pred_3 columns attached; note these
# are written into the dataset directory, not the results directory.
train_df_final.to_csv(dataset_path/"train_xgb_blend_final.csv", index=False)
test_df_final.to_csv(dataset_path/"test_xgb_blend_final.csv", index=False)