https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Directory holding the competition CSVs. Set the HE_HABITABILITY_DATA_DIR
# environment variable to point at another location; when it is unset the
# original local path is used, so existing runs behave as before.
dataset_path = Path(
    os.environ.get(
        "HE_HABITABILITY_DATA_DIR",
        "/home/tharun/projects/data_science_competitions/datasets/he_habitability/",
    )
)
# List the file names found in the dataset directory.
print([x.name for x in dataset_path.iterdir()])

['train.csv', 'train_v1.csv', 'subm_v1.csv', 'sample_submission.csv', 'test.csv', 'test_v1.csv']


In [37]:
# Read the three CSVs from the dataset directory, in order:
# training data, test data, and the sample submission.
train_df, test_df, sample_submission_df = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in ("train_v1.csv", "test_v1.csv", "sample_submission.csv")
)

print(train_df.shape, test_df.shape)

(39499, 15) (10500, 14)


## Data preprocessing

In [46]:
# Append one seeded, randomly sampled training row so the row count splits
# evenly across the k folds, then renumber the index from 0.
# NOTE(review): this rebinds train_df from itself, so every re-run of the cell
# appends one more row.
extra_row = train_df.sample(1, random_state=13)
train_df = pd.concat([train_df, extra_row]).reset_index(drop=True)
train_df.shape

(39500, 16)

In [47]:
# Display the column labels currently present in the training frame.
train_df.columns

Index(['Property_ID', 'Property_Type', 'Property_Area', 'Number_of_Windows',
       'Number_of_Doors', 'Furnishing', 'Frequency_of_Powercuts',
       'Power_Backup', 'Water_Supply', 'Traffic_Density_Score', 'Crime_Rate',
       'Dust_and_Noise', 'Air_Quality_Index', 'Neighborhood_Review',
       'Habitability_score', 'kfold'],
      dtype='object')

### Create folds

In [48]:
from sklearn.model_selection import KFold

In [49]:
# Seeded, shuffled 5-way splitter; displayed so its configuration is visible.
kf = KFold(shuffle=True, n_splits=5, random_state=13)
kf

KFold(n_splits=5, random_state=13, shuffle=True)

In [50]:
# Fill the "kfold" column with -1 for every row as a placeholder, then preview
# the first rows of the frame.
train_df.loc[:, "kfold"] = -1
train_df.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,Habitability_score,kfold
0,0x21e3,Apartment,106,4.0,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86,71.98,-1
1,0x68d4,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55,71.2,-1
2,0x7d81,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81,71.39,-1
3,0x7a57,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34,31.46,-1
4,0x9409,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77,93.7,-1


In [51]:
# Tag each row with the number of the split in which it is held out.
# kf.split yields (train, held-out) index arrays; only the held-out half is needed.
for fold_id, (_, holdout_idx) in enumerate(kf.split(train_df)):
    train_df.loc[holdout_idx, "kfold"] = fold_id

# Rows per fold.
train_df.kfold.value_counts()

4    7900
2    7900
3    7900
0    7900
1    7900
Name: kfold, dtype: int64

In [52]:
# Write the training frame (including the "kfold" column) to train_v2.csv in the
# dataset directory, without the index.
train_df.to_csv(dataset_path/"train_v2.csv", index=False)

In [23]:
# Model inputs: every training column except the row id, the target and the fold tag.
useful_features = [
    name
    for name in train_df.columns
    if name not in ("Property_ID", "Habitability_score", "kfold")
]
useful_features

['Property_Type',
 'Property_Area',
 'Number_of_Windows',
 'Number_of_Doors',
 'Furnishing',
 'Frequency_of_Powercuts',
 'Power_Backup',
 'Water_Supply',
 'Traffic_Density_Score',
 'Crime_Rate',
 'Dust_and_Noise',
 'Air_Quality_Index',
 'Neighborhood_Review']

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score 

  from pandas import MultiIndex, Int64Index


In [25]:
# Columns routed to the one-hot encoder.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]
# Columns routed to the standard scaler.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

In [26]:
def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the model inputs.

    Columns in `num_cols` go through StandardScaler. Columns in `cat_cols` go
    through OneHotEncoder configured with handle_unknown="ignore" and
    drop="first".
    """
    return ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
    ])


# Kept under its original name because a later cell builds a pipeline from `ct`.
ct = make_preprocessor()

# Each pipeline gets its own preprocessor instance. Pipeline.fit fits its steps
# in place, so a single shared transformer would be silently refitted by
# whichever pipeline was trained last.
linear_pipe = Pipeline([
    ("preprocess", make_preprocessor()),
    ("regression", LinearRegression())
])

regressor_pipe = Pipeline([
    ("preprocess", make_preprocessor()),
    ("regressor", GradientBoostingRegressor())
])

xgb_pipe = Pipeline([
    ("preprocess", make_preprocessor()),
    ("regressor", XGBRegressor())
])

In [34]:
# One array of test-set predictions per fold; they are averaged in a later cell.
test_preds_collection = []

# Select the model inputs once, under a new name. Rebinding `test_df` itself
# would drop Property_ID, which the submission cell still reads from `test_df`.
test_features = test_df[useful_features]

for fold in range(5):  # fold ids 0-4, matching KFold(n_splits=5)
    # Rows tagged with the current fold are held out; all other rows are used for fitting.
    xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
    xtest = train_df[train_df.kfold == fold].reset_index(drop=True)
    ytrain = xtrain.Habitability_score
    ytest = xtest.Habitability_score

    xtrain = xtrain[useful_features]
    xtest = xtest[useful_features]

    # model: a new pipeline per fold; fit() refits the preprocessor and the
    # regressor on this fold's training rows.
    xgb_pipe = Pipeline([
        ("preprocess", ct),
        ("regressor", XGBRegressor())
    ])
    xgb_pipe.fit(xtrain, ytrain)
    xtest_preds = xgb_pipe.predict(xtest)

    # predict for test
    test_preds = xgb_pipe.predict(test_features)
    test_preds_collection.append(test_preds)

    # The reported value is 100 * R^2 floored at 0, not a mean squared error,
    # so it is labelled "score".
    score = max(0, 100 * r2_score(ytest, xtest_preds))
    print(f"fold: {fold}, score: {score}")

fold: 0, mse: 80.80481384722755
fold: 1, mse: 80.8356447857457
fold: 2, mse: 81.58155208207451
fold: 3, mse: 80.10486734340392
fold: 4, mse: 80.29031688002533


In [35]:
# Blend the per-fold test predictions: one column per fold, averaged across each row.
stacked_preds = np.column_stack(test_preds_collection)
test_preds_avg = stacked_preds.mean(axis=1)
test_preds_avg.shape

(10500,)

In [80]:
def make_sub_file(test_ids, preds):
    """Build a submission table from ids and predictions.

    Args:
        test_ids: values for the "Property_ID" column.
        preds: values for the "Habitability_score" column.

    Returns:
        A DataFrame with the columns "Property_ID" and "Habitability_score",
        in that order.
    """
    submission_columns = {
        "Property_ID": test_ids,
        "Habitability_score": preds,
    }
    return pd.DataFrame.from_dict(submission_columns)

In [43]:
# Build the submission with the make_sub_file helper instead of repeating its
# body: test-set property ids paired with the fold-averaged predictions.
submission_df = make_sub_file(test_df.Property_ID.values, test_preds_avg)
submission_df.head()

Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,27.849039
1,0x8787,80.22966
2,0x6c17,66.846313
3,0x9dbd,72.142227
4,0xbfde,78.448624


In [44]:
# Write the submission table to subm_v2.csv in the dataset directory, without the index.
submission_df.to_csv(dataset_path/"subm_v2.csv", index=False)