# Workspace initialisation

The Kaggle datasets are not in version control, but they should be placed in the `data/` subdirectory:
``` 
    data/
    ├── auxiliary-data/...
    ├── sample_submission.csv
    ├── test.csv
    └── train.csv
```


In [74]:
import os
from pathlib import Path

# Assume that if data/train.csv exists then all the other files also exist.
# (A missing data/ directory implies a missing data/train.csv, so one check suffices.)
DATA_DIR = Path("data")
should_download = not (DATA_DIR / "train.csv").exists()

if should_download:
    # Imported lazily: these are only needed when the datasets are actually missing
    import shutil

    import opendatasets as od

    PROJECT_URL = "https://www.kaggle.com/competitions/cs5228-202223-s2-location-location-location"
    SAMPLE_SUBMISSION_CSV = "https://bhooi.github.io/teaching/cs5228/kaggle/sample-submission.csv"
    # Directory that the competition download is expected to land in
    DOWNLOAD_DIR = Path("cs5228-202223-s2-location-location-location")
    print("Downloading datasets")

    od.download(PROJECT_URL)
    if DATA_DIR.exists():
        # data/ is present but incomplete. os.rename onto an existing directory fails
        # (always on Windows, and on POSIX unless the target is empty), so merge the
        # downloaded files into data/ and then remove the download directory.
        shutil.copytree(DOWNLOAD_DIR, DATA_DIR, dirs_exist_ok=True)
        shutil.rmtree(DOWNLOAD_DIR)
    else:
        os.rename(DOWNLOAD_DIR, DATA_DIR)
    od.download(SAMPLE_SUBMISSION_CSV, data_dir=str(DATA_DIR))


In [75]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

# Read the raw train and test splits from the data/ directory
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)



# Data cleaning

- **`month` can be coerced to a pandas `datetime` object**
- **`flat_type` needs cleaning (`4 room` vs `4-room`)**
- **`street_name` needs cleaning (lowercase vs Capitalised)**
- **`storey_range` needs cleaning (`01 to 03` vs `01 to 05`); we could use target encoding and take the median of the range**
- `town`, `floor_area_sqm`, `flat_model`, `lease_commence_date`, `latitude`, `longitude`, `subzone`, `planning_area`, `region`, `resale_price` look good
- `eco_category` can probably be dropped -- only a single value `uncategorized` -- is this data available elsewhere?
- **`elevation` looks bad, all values are filled with `0.0` -- drop or impute the land elevation based on `latitude` and `longitude`?**



`NaNs`: `df.isna().sum()` returns `0` for all columns

Q: Can `flat_type` be an ordinal variable? How do `executive` and `multi generation` fit into the ordering?



In [76]:
# Preview the first five rows of the raw training data
train_df.head(n=5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0


In [77]:
# Helper functions
def median_storey(r: str) -> int:
    """
    Return the middle storey of a range string such as "01 to 03".

    The string is split on " to ", both bounds are parsed as integers, and
    their midpoint is returned, rounded down by integer division.
    """
    low, high = (int(bound) for bound in r.split(" to "))
    return (low + high) // 2

# Sanity check: print the helper's result for every storey range seen in either split
all_ranges = set(train_df["storey_range"].unique()).union(test_df["storey_range"].unique())
for storey_range in sorted(all_ranges):
    print(f"{storey_range} -> {median_storey(storey_range)}")

01 to 03 -> 2
01 to 05 -> 3
04 to 06 -> 5
06 to 10 -> 8
07 to 09 -> 8
10 to 12 -> 11
11 to 15 -> 13
13 to 15 -> 14
16 to 18 -> 17
16 to 20 -> 18
19 to 21 -> 20
21 to 25 -> 23
22 to 24 -> 23
25 to 27 -> 26
26 to 30 -> 28
28 to 30 -> 29
31 to 33 -> 32
31 to 35 -> 33
34 to 36 -> 35
36 to 40 -> 38
37 to 39 -> 38
40 to 42 -> 41
43 to 45 -> 44
46 to 48 -> 47
49 to 51 -> 50


In [78]:
def clean_df(df: pd.DataFrame) -> None:
    """
    Normalise a raw train/test dataframe. Modifies the dataframe in place.

    Steps, in order:
      1. Parse `month` ("YYYY-MM" strings) into datetimes.
      2. Lowercase every object-dtype column, then uppercase `block`.
      3. Replace hyphens in `flat_type` with spaces ("4-room" -> "4 room").
      4. Add `median_storey`, computed from `storey_range` via `median_storey()`,
         and drop `storey_range`.

    Nothing is returned. Because `storey_range` is dropped, this is meant to be
    called once per frame (call it on a copy to keep the raw data around).
    """
    # Convert month to datetime
    df["month"] = pd.to_datetime(df["month"], format="%Y-%m")

    # Convert all string categorical columns to lowercase
    for col in df.select_dtypes("object").columns:
        df[col] = df[col].str.lower()

    # Block is a special case, uppercase this
    df["block"] = df["block"].str.upper()

    # Normalize flat_type. This is a literal replacement; regex=False is passed
    # explicitly because the default for `regex` differs between pandas 1.x and
    # 2.x (and 1.x warns about single-character patterns).
    df["flat_type"] = df["flat_type"].str.replace("-", " ", regex=False)

    # Normalize storey_range to be the median storey in the range
    df["median_storey"] = df["storey_range"].apply(median_storey)

    # Eyeballing results from the above, we can see that median_storey is correct, so we can drop storey_range
    df.drop(columns=["storey_range"], inplace=True)


In [79]:
# Clean copies so the raw frames stay untouched (clean_df mutates its argument)
train_df_cleaned, test_df_cleaned = train_df.copy(), test_df.copy()
for frame in (train_df_cleaned, test_df_cleaned):
    clean_df(frame)

In [80]:
# Preview the first five rows of the cleaned training data
train_df_cleaned.head(n=5)

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,median_storey
0,2001-08-01,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,2
1,2014-10-01,punggol,5 room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,11
2,2020-09-01,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,2
3,2000-10-01,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,8
4,2013-01-01,bukit batok,3 room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,8


In [81]:
# Check the distinct flat_type values after normalization (no hyphenated variants should remain)
train_df_cleaned["flat_type"].unique()

array(['4 room', '5 room', '3 room', 'executive', '2 room',
       'multi generation', '1 room'], dtype=object)

In [84]:
# Data consistency checks: every categorical column must take the same set of
# values in the train and test splits.
# Note: Elements in `block` and `street_name` do not completely overlap between train and test
columns_to_check = [
    name
    for name in train_df_cleaned.select_dtypes("object").columns
    if name not in ("block", "street_name")
]
for col in columns_to_check:
    train_values = set(train_df_cleaned[col].unique())
    test_values = set(test_df_cleaned[col].unique())
    assert train_values == test_values, f"{col} is not consistent"

In [85]:
# NaN check: for each split, list only the columns that contain missing values
for label, frame in (("Train NaNs", train_df_cleaned), ("Test NaNs", test_df_cleaned)):
    nan_counts = frame.isna().sum()
    print(label)
    print(nan_counts[nan_counts > 0])

Train NaNs
Series([], dtype: int64)
Test NaNs
Series([], dtype: int64)
