In [4]:
from pathlib import Path

import pandas as pd
import numpy as np

In [3]:
# Folder holding the competition CSV files; print the file names found in it.
dataset_path = Path("./dataset/")
dataset_file_names = [entry.name for entry in dataset_path.iterdir()]
print(dataset_file_names)

['train.csv', 'train_v1.csv', 'test_v1.csv', 'test.csv', 'sample_submission.csv']


In [83]:
# Read the training and test tables from the dataset folder.
train_df, test_df = (
    pd.read_csv(dataset_path / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Total count of missing cells in each table (train first, then test).
for frame in (train_df, test_df):
    print(frame.isna().sum().sum())

# Displayed as the cell result: (rows, columns) of both tables.
train_df.shape, test_df.shape

6085
1610


((39499, 15), (10500, 14))

In [84]:
# Feature columns handed to the tabular preprocessing as categorical inputs.
cat_cols = [
    "Property_Type",
    "Furnishing",
    "Power_Backup",
    "Water_Supply",
    "Crime_Rate",
    "Dust_and_Noise",
]

# Feature columns handed to the tabular preprocessing as continuous inputs.
num_cols = [
    "Property_Area",
    "Number_of_Windows",
    "Number_of_Doors",
    "Frequency_of_Powercuts",
    "Traffic_Density_Score",
    "Air_Quality_Index",
    "Neighborhood_Review",
]

In [124]:
def make_sub_file(test_ids, preds):
    """Build the submission table pairing each property id with its prediction.

    Args:
        test_ids: Sequence of property identifiers, one per prediction.
        preds: Sequence of predicted scores, aligned element-wise with
            ``test_ids``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Property_ID`` and
        ``Habitability_score``, in that order.
    """
    submission_columns = {"Property_ID": test_ids, "Habitability_score": preds}
    return pd.DataFrame(submission_columns)

In [None]:
# Folder that receives the generated submission files; created on demand,
# including any missing parents, and left alone when it already exists.
results_path = Path("./results")
results_path.mkdir(parents=True, exist_ok=True)

In [85]:
from fastai.tabular.all import *

In [86]:
# Hold out 20% of the training rows for validation. The fixed seed pins the
# shuffle so that a fresh "Restart & Run All" reproduces the same split and
# therefore comparable validation metrics.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df))

In [87]:
# Wrap the training frame for the tabular model: apply the listed preprocessing
# procs to the categorical/continuous feature columns, use the train/validation
# indices from `splits`, and treat "Habitability_score" as a regression target.
to = TabularPandas(
    train_df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_cols,
    cont_names=num_cols,
    y_names="Habitability_score",
    y_block=RegressionBlock,
    splits=splits,
)

In [89]:
# Build the data loaders with 512 rows per batch, then preview up to 10 rows
# of one batch as a sanity check.
batch_size = 512
preview_rows = 10
dls = to.dataloaders(bs=batch_size)
dls.show_batch(max_n=preview_rows)

Unnamed: 0,Property_Type,Furnishing,Power_Backup,Water_Supply,Crime_Rate,Dust_and_Noise,Number_of_Windows_na,Frequency_of_Powercuts_na,Property_Area,Number_of_Windows,Number_of_Doors,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score
0,Apartment,Unfurnished,Yes,All time,Slightly below average,Medium,False,False,488.999996,4.0,1.0,1.0,7.98,128.0,3.49,81.449997
1,Apartment,Semi_Furnished,Yes,All time,Slightly below average,Medium,False,False,19865.999646,2.0,3.0,1.0,6.62,116.0,3.51,70.830002
2,Bungalow,Fully Furnished,No,Once in a day - Evening,Slightly below average,Low,False,False,2086.00002,12.0,6.0,2.0,3.41,39.000002,2.44,74.650002
3,Container Home,Unfurnished,No,Once in a day - Morning,Slightly below average,Medium,False,False,477.999974,-1.529412e-07,1.0,1.0,5.04,109.0,3.22,34.639999
4,Single-family home,Semi_Furnished,No,Once in a day - Morning,Well below average,Medium,False,False,2118.000025,4.0,1.0,-1.395369e-08,7.91,114.0,4.45,68.5
5,Apartment,Fully Furnished,No,Once in a day - Morning,Well above average,Medium,False,False,119.999963,1.0,2.0,-1.395369e-08,7.46,159.999998,2.51,64.519997
6,Apartment,Semi_Furnished,No,Once in two days,Slightly above average,Medium,False,False,224.000053,2.0,3.0,-1.395369e-08,5.23,115.0,2.58,39.650002
7,Bungalow,Semi_Furnished,No,Once in a day - Morning,Slightly below average,High,False,False,3392.999991,6.0,4.0,-1.395369e-08,4.1,81.000001,3.82,81.190002
8,Apartment,Semi_Furnished,No,All time,Well below average,High,False,False,659.999986,3.0,1.0,-1.395369e-08,8.1,176.0,4.62,76.870003
9,Bungalow,Unfurnished,No,All time,Well below average,Medium,False,False,4020.000012,11.0,6.0,-1.395369e-08,5.93,83.999999,4.81,75.760002


In [134]:
# Tabular learner with hidden layers of 75, 25 and 5 units, reporting RMSE
# as the metric. (A bare `learner` expression that used to sit here was
# dropped: only a cell's last expression is displayed, so it had no effect.)
learner = tabular_learner(dls, layers=[75, 25, 5], metrics=rmse)

# Train for 75 epochs.
learner.fit(75)

epoch,train_loss,valid_loss,_rmse,time
0,5504.658691,5497.942871,74.148117,00:00
1,5472.104004,5425.95752,73.661095,00:00
2,5430.859863,5354.849121,73.176834,00:00
3,5377.464355,5327.147461,72.987312,00:00
4,5315.945312,5262.071777,72.540138,00:00
5,5248.758789,5193.952148,72.069077,00:00
6,5178.068359,5144.094727,71.722343,00:00
7,5097.199707,5019.062012,70.845337,00:00
8,5014.4375,4935.086914,70.250175,00:00
9,4924.179688,4840.87793,69.576424,00:00


In [135]:
# Show a few rows with their target and predicted score.
learner.show_results()

# Build a data loader for the test frame from the learner's data loaders
# and collect the model's predictions for it.
dl = learner.dls.test_dl(test_df)
test_preds = learner.get_preds(dl=dl)

# The first element of the get_preds result is used as the predictions;
# squeeze() drops the singleton dimension so they form a flat column.
subm_df = make_sub_file(test_df.Property_ID.values, test_preds[0].squeeze())

# index=False keeps the pandas row index out of the file; without it the CSV
# gains an extra unnamed first column next to Property_ID / Habitability_score.
subm_df.to_csv(results_path/"fastai_subm_v1.csv", index=False)

# Last expression of the cell so the preview is actually displayed.
subm_df.head()

Unnamed: 0,Property_Type,Furnishing,Power_Backup,Water_Supply,Crime_Rate,Dust_and_Noise,Number_of_Windows_na,Frequency_of_Powercuts_na,Property_Area,Number_of_Windows,Number_of_Doors,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Habitability_score_pred
0,3.0,2.0,2.0,1.0,4.0,3.0,1.0,1.0,0.708601,2.686808,1.418088,-0.626228,0.987535,-0.008605,1.271302,78.589996,78.488327
1,5.0,2.0,2.0,5.0,4.0,3.0,1.0,1.0,-0.463825,0.027709,-0.276945,3.408459,0.239957,-0.389631,-2.219125,71.550003,56.152367
2,2.0,3.0,2.0,4.0,4.0,3.0,1.0,1.0,-0.533328,-1.111905,-1.124461,-0.626228,-0.047573,0.689943,0.819304,73.610001,69.173897
3,6.0,2.0,2.0,4.0,4.0,3.0,1.0,1.0,0.286,-0.352162,-0.276945,-0.626228,0.478196,-0.580144,0.944858,70.459999,72.171013
4,6.0,2.0,3.0,1.0,4.0,3.0,1.0,1.0,0.158154,-0.352162,-1.124461,0.718668,0.683574,0.13428,0.480305,86.75,84.211685
5,3.0,3.0,2.0,5.0,2.0,3.0,1.0,1.0,1.312316,2.686808,2.265604,0.718668,-1.600235,-0.627772,-1.390462,37.16,45.592392
6,3.0,1.0,3.0,1.0,4.0,1.0,2.0,1.0,0.542199,0.027709,-0.276945,0.718668,2.334818,1.197978,0.279418,92.379997,86.649612
7,6.0,2.0,2.0,4.0,2.0,2.0,1.0,1.0,-0.073693,0.40758,-1.124461,-0.626228,-0.047573,0.293041,0.153863,77.269997,82.388168
8,6.0,1.0,2.0,4.0,1.0,1.0,1.0,1.0,0.260127,0.40758,0.570571,-0.626228,1.644746,1.213854,-0.900798,83.610001,80.611053


In [136]:
# 84.83 (presumably the score obtained with fastai_subm_v1.csv; TODO confirm)