In [1]:
import pandas as pd
from sklearn.impute import KNNImputer

import warnings
# Silence all warnings from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [2]:
# Read the training set into a DataFrame
price = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [9]:
# Dimensions of the dataset as a (rows, columns) tuple
price.shape

(150634, 78)

In [10]:
# Data type of each column
price.dtypes

id                     int64
Tranc_YearMonth       object
town                  object
flat_type             object
block                 object
                      ...   
sec_sch_name          object
cutoff_point           int64
affiliation            int64
sec_sch_latitude     float64
sec_sch_longitude    float64
Length: 78, dtype: object

In [8]:
# Count missing values per column and show the columns at positions 40-59
missing_per_column = price.isna().sum()
missing_per_column.iloc[40:60]

3room_rental                   0
other_room_rental              0
postal                         0
Latitude                       0
Longitude                      0
planning_area                  0
Mall_Nearest_Distance        829
Mall_Within_500m           92789
Mall_Within_1km            25426
Mall_Within_2km             1940
Hawker_Nearest_Distance        0
Hawker_Within_500m         97390
Hawker_Within_1km          60868
Hawker_Within_2km          29202
hawker_food_stalls             0
hawker_market_stalls           0
mrt_nearest_distance           0
mrt_name                       0
bus_interchange                0
mrt_interchange                0
dtype: int64

### Imputation 

With about 150k rows in the dataset, a significant proportion of the data is missing in columns such as `Hawker_Within_500m`.

In order to perform data analysis, imputation of missing values via k-Nearest Neighbour method was chosen. 

After imputation, if any values are still missing and the affected rows make up less than 5% of the dataset, those rows will be dropped.

In [3]:
# KNNImputer with k=5
knn_imputer = KNNImputer(n_neighbors=5)
columns_with_missing_values = ['Hawker_Within_1km', 'Mall_Nearest_Distance', 'Mall_Within_1km']

# Fit on every row so the donor pool is the same one fit_transform would use.
features = price[columns_with_missing_values]
knn_imputer.fit(features)

# The imputer only changes rows that contain a NaN, and it fills each of those
# rows from the fitted data alone. Rows with identical values therefore get the
# same fill, so each distinct incomplete row is transformed once and the result
# is copied to its duplicates. This reduces the number of neighbour searches;
# the plain fit_transform call over all rows did not finish in this notebook.
# NOTE(review): the gain depends on how many incomplete rows are duplicates, and
# the fills should match fit_transform -- spot-check a sample if that matters.
needs_imputation = features.isna().any(axis=1)
incomplete_rows = features[needs_imputation]
if not incomplete_rows.empty:
    # drop_duplicates treats NaN as equal to NaN, so rows missing the same
    # column with the same remaining values collapse into one.
    distinct_rows = incomplete_rows.drop_duplicates().reset_index(drop=True)
    imputed_columns = [f"{column}_imputed" for column in columns_with_missing_values]
    imputed_values = pd.DataFrame(knn_imputer.transform(distinct_rows), columns=imputed_columns)
    lookup = pd.concat([distinct_rows, imputed_values], axis=1)

    # merge matches NaN keys with each other, and a left merge keeps the row
    # order of incomplete_rows, so `filled` lines up with the masked rows.
    filled = incomplete_rows.merge(lookup, on=columns_with_missing_values, how="left")
    assert len(filled) == len(incomplete_rows), "lookup keys must be unique"
    price.loc[needs_imputation, columns_with_missing_values] = filled[imputed_columns].to_numpy()

# Check if there are any missing values left
missing_after_imputation = price.isnull().sum()
print("Missing values after imputation:", missing_after_imputation[missing_after_imputation > 0])

KeyboardInterrupt: 

In [None]:
# Write the DataFrame to CSV without the index column.
# NOTE(review): the path below looks like a placeholder -- set it before running.
imputed_csv_local_path = "input_filepath_here"
price.to_csv(path_or_buf=imputed_csv_local_path, index=False)

In [None]:
# The KNN imputation took a very long time to run. It is part of EDA, so it is kept separate from the main code.
# The main code will read the newly saved file directly.