In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('raw_data_v2.csv')

print(df.columns)

Index(['Unnamed: 0', 'title', 'price', 'n_rooms', 'area', 'year',
       'price_per_sqft', 'size_sqft', 'road_name', 'n_bedrooms', 'n_bathrooms',
       'district', 'region', 'scraped_date'],
      dtype='object')


In [3]:
df

Unnamed: 0.1,Unnamed: 0,title,price,n_rooms,area,year,price_per_sqft,size_sqft,road_name,n_bedrooms,n_bathrooms,district,region,scraped_date
0,0,335A SMITH STREET,"S$ 558,000",HDB 3-Room,Central Area,1983,S$ 865 psf,645 sqft,Smith Street,2 beds,2 baths,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
1,1,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1978,S$ 961 psf,645 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
2,2,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1978,S$ 960 psf,646 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
3,3,533 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1979,S$ 895 psf,693 sqft,Upper Cross Street,2 beds,2 baths,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
4,4,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,1,99 years,"S$ 9,688 psf",64 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,1720,458A SENGKANG WEST ROAD,"S$ 660,000",HDB 4-Room,Sengkang,2019,S$ 659 psf,"1,001 sqft",Sengkang West Road,3 beds,2 baths,Seletar,North East,2025-03-25
1721,1721,441D FERNVALE ROAD,"S$ 738,000",HDB 4-Room,Sengkang,2010,S$ 737 psf,"1,001 sqft",Fernvale Road,3 beds,2 baths,Seletar,North East,2025-03-25
1722,1722,456A SENGKANG WEST ROAD,"S$ 399,000",HDB 2-Room,Sengkang,2019,S$ 772 psf,517 sqft,Sengkang West Road,1 bed,1 bath,Seletar,North East,2025-03-25
1723,1723,409B FERNVALE ROAD,"S$ 699,000",HDB 5PA (Premium Apartment),Sengkang,2004,S$ 565 psf,"1,238 sqft",Fernvale Road,3 beds,2 baths,Seletar,North East,2025-03-25


In [4]:
#Common case 1: values are in the wrong columns, such as road name in sqft column, size sqft in psf column -> move all to the right by 1 step (column)

def is_price_per_sqft(val):
    return isinstance(val, str) and 'psf' in val

def is_size_sqft(val):
    return isinstance(val, str) and 'sqft' in val

def is_road_name(val):
    return isinstance(val, str) and any(word in val.lower() for word in ['street', 'road', 'avenue', 'drive', 'boulevard', 'lorong'])

def is_year(val):
    return isinstance(val,str) and val.isdigit() and len(val)==4

for idx, row in df.iterrows():
    
    if is_price_per_sqft(row['year']): # check if price_per_sqft is mistakenly in year column
        df.at[idx, 'road_name'] = row['size_sqft'] #update road_name column as the val in size_sqft column
        df.at[idx, 'size_sqft'] = row['price_per_sqft']
        df.at[idx, 'price_per_sqft'] = row['year']
        if is_year(row['area']): # check if year is mistakenly in area column
            df.at[idx, 'year'] = row['area']  
        df.at[idx, 'year'] = "" # or fillna if needed

In [5]:
#Common case 2: area replaced with district code (numbers) -> replace with region

for idx, row in df.iterrows():
    if row['area'].isdigit():
        df.at[idx, 'area'] = row['region'].capitalize()
        

In [6]:
#Common case 3: year column is either empty string/non-year strings -> replace with median year of properties from the region

all_regions = set(df['region'])
for r in all_regions:
    df_a = df[df['region'] == r].copy()

    # Get the median year, ignoring non-numeric values
    median_year = pd.to_numeric(df_a['year'], errors='coerce').dropna().median()

    # Replace all non-digit year values with the median year
    df_a['year'] = df_a['year'].apply(
        lambda x: str(median_year) if not str(x).isdigit() else x
    )

    # Write the cleaned column back to the original dataframe
    df.loc[df['region'] == r, 'year'] = df_a['year']

print('no. of null rows of year:', len(df[df['year'] == ""]))

no. of null rows of year: 0


In [7]:
# Case 4: size sqft has 1 nan value -- drop row
# Drop rows where 'size_sqft' has NaN values
df = df.dropna(subset=['size_sqft'])
print("Rows with NaN values in 'size_sqft' column have been dropped.")


Rows with NaN values in 'size_sqft' column have been dropped.


In [8]:
# Count the number of empty strings in the 'size_sqft' column
empty_strings_count = (df['size_sqft'] == '').sum()

print(f"Number of empty strings in 'size_sqft': {empty_strings_count}")

Number of empty strings in 'size_sqft': 0


In [9]:
df = df[df['size_sqft'] != '']

In [10]:
df 

Unnamed: 0.1,Unnamed: 0,title,price,n_rooms,area,year,price_per_sqft,size_sqft,road_name,n_bedrooms,n_bathrooms,district,region,scraped_date
0,0,335A SMITH STREET,"S$ 558,000",HDB 3-Room,Central Area,1983,S$ 865 psf,645 sqft,Smith Street,2 beds,2 baths,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
1,1,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1978,S$ 961 psf,645 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
2,2,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1978,S$ 960 psf,646 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
3,3,533 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central Area,1979,S$ 895 psf,693 sqft,Upper Cross Street,2 beds,2 baths,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
4,4,538 UPPER CROSS STREET,"S$ 620,000",HDB 3-Room,Central,1984.0,"S$ 9,688 psf",64 sqft,Upper Cross Street,2 beds,1 bath,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,1720,458A SENGKANG WEST ROAD,"S$ 660,000",HDB 4-Room,Sengkang,2019,S$ 659 psf,"1,001 sqft",Sengkang West Road,3 beds,2 baths,Seletar,North East,2025-03-25
1721,1721,441D FERNVALE ROAD,"S$ 738,000",HDB 4-Room,Sengkang,2010,S$ 737 psf,"1,001 sqft",Fernvale Road,3 beds,2 baths,Seletar,North East,2025-03-25
1722,1722,456A SENGKANG WEST ROAD,"S$ 399,000",HDB 2-Room,Sengkang,2019,S$ 772 psf,517 sqft,Sengkang West Road,1 bed,1 bath,Seletar,North East,2025-03-25
1723,1723,409B FERNVALE ROAD,"S$ 699,000",HDB 5PA (Premium Apartment),Sengkang,2004,S$ 565 psf,"1,238 sqft",Fernvale Road,3 beds,2 baths,Seletar,North East,2025-03-25


In [11]:
# length and null checks on cleaned df 
print(len(df[df['price_per_sqft'].str.contains('psf')]))
print(len(df[df['size_sqft'].str.contains('sqft')]))
print(len(df[df['n_bathrooms'].str.contains('bath')]))
print(len(df[df['n_bedrooms'].str.contains('bed')]))
print('no. of null rows of year:', len(df[df['year'] == ""]))
print('no. of null rows of road:', len(df[df['road_name'] == ""]))


1722
1722
1724
1724
no. of null rows of year: 0
no. of null rows of road: 0


In [12]:
# Count the number of empty strings in the 'size_sqft' column
empty_strings_count = (df['size_sqft'] == '').sum()

print(f"Number of empty strings in 'size_sqft': {empty_strings_count}")

Number of empty strings in 'size_sqft': 0


In [None]:
# Formatting
# Clean currency & psf strings
df['price'] = df['price'].replace(r'[^\d.]', '', regex=True).astype(float)
df['price_per_sqft'] = df['price_per_sqft'].replace(r'[^\d.]', '', regex=True).astype(float)

# Clean size sqft strings
df['size_sqft'] = df['size_sqft'].replace(r'[^\d,]', '', regex=True)  # remove non-digit, non-comma characters
df['size_sqft'] = df['size_sqft'].apply(lambda x: x.replace(',', ''))  # remove commas and convert to float


# Number of Bedrooms/bathrooms
df['n_bedrooms'] = df['n_bedrooms'].str.extract(r'(\d+)').astype(int)
df['n_bathrooms'] = df['n_bathrooms'].str.extract(r'(\d+)').astype(int)

# Time 
df['year'] = pd.to_numeric(df['year'], downcast="integer")
df['scraped_date'] = pd.to_datetime(df['scraped_date'], errors='coerce')

0        645
1        645
2        646
3        693
4         64
        ... 
1720    1001
1721    1001
1722     517
1723    1238
1724    1011
Name: size_sqft, Length: 1724, dtype: object


In [14]:
df

Unnamed: 0.1,Unnamed: 0,title,price,n_rooms,area,year,price_per_sqft,size_sqft,road_name,n_bedrooms,n_bathrooms,district,region,scraped_date
0,0,335A SMITH STREET,558000.0,HDB 3-Room,Central Area,1983,865.0,645,Smith Street,2,2,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
1,1,538 UPPER CROSS STREET,620000.0,HDB 3-Room,Central Area,1978,961.0,645,Upper Cross Street,2,1,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
2,2,538 UPPER CROSS STREET,620000.0,HDB 3-Room,Central Area,1978,960.0,646,Upper Cross Street,2,1,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
3,3,533 UPPER CROSS STREET,620000.0,HDB 3-Room,Central Area,1979,895.0,693,Upper Cross Street,2,2,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
4,4,538 UPPER CROSS STREET,620000.0,HDB 3-Room,Central,1984,9688.0,64,Upper Cross Street,2,1,"Raffles Place, Cecil, Marina, People's Park",Central,2025-03-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,1720,458A SENGKANG WEST ROAD,660000.0,HDB 4-Room,Sengkang,2019,659.0,1001,Sengkang West Road,3,2,Seletar,North East,2025-03-25
1721,1721,441D FERNVALE ROAD,738000.0,HDB 4-Room,Sengkang,2010,737.0,1001,Fernvale Road,3,2,Seletar,North East,2025-03-25
1722,1722,456A SENGKANG WEST ROAD,399000.0,HDB 2-Room,Sengkang,2019,772.0,517,Sengkang West Road,1,1,Seletar,North East,2025-03-25
1723,1723,409B FERNVALE ROAD,699000.0,HDB 5PA (Premium Apartment),Sengkang,2004,565.0,1238,Fernvale Road,3,2,Seletar,North East,2025-03-25


In [15]:
df.to_csv('cleaned_data_v2.csv') 