# Clean Raw Data

> Let's do the bare minimum to clean the data

In [None]:
import pandas as pd
import re
import math
import os

In [None]:
# Location of the raw resale flat prices export
resale_flat_prices_file_path = os.path.join('Raw Data', 'resale flat prices.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-types DtypeWarning and
# gives every column a single, consistent inferred type.
resale_flat_prices_df = pd.read_csv(resale_flat_prices_file_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


## Drop useless columns

> - _id does not provide any meaningful information
> - street_name and block, while meaningful, are categorical data with too many distinct values and would introduce unneeded complexity (too many dimensions) into our model. We will explore how to turn them into more meaningful information later!

In [None]:
# Remove, in place, the columns that are not useful to the model:
# _id carries no information, while street_name and block have too many categories.
resale_flat_prices_df.drop(
    ['_id', 'street_name', 'block'],
    axis='columns',
    inplace=True,
)

## remaining_lease to only be in years

> There are quite a few variations in how this is expressed. I will standardise this column so that it only holds the number of whole years left on the lease.

In [None]:
# Captures the year count in values such as "61 years 04 months" or "1 year"
years_regex = r'(\d+) years?'


def get_remaining_lease(row, lease_length_years=99):
    """Return the remaining lease of one resale record in whole years.

    The ``remaining_lease`` value can be missing, a number, a numeric string
    such as ``"70"``, or text such as ``"61 years 04 months"``. All of them
    are reduced to a year count; any months component is discarded.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``remaining_lease``, ``lease_commence_date`` and ``month``.
            ``month`` must start with the 4-digit sale year.
        lease_length_years: Total lease length assumed when
            ``remaining_lease`` is missing. Defaults to 99.

    Returns:
        The number of years left on the lease at the time of sale.

    Raises:
        ValueError: If a string value contains neither an "N years" part
            nor a plain integer.
    """
    remaining_lease = row['remaining_lease']

    # Missing value: derive it from how much of the assumed lease has already
    # elapsed between the lease commencement year and the year of sale.
    if remaining_lease is None or (
        isinstance(remaining_lease, float) and math.isnan(remaining_lease)
    ):
        sale_year = int(row['month'][:4])
        years_elapsed = sale_year - row['lease_commence_date']
        return lease_length_years - years_elapsed

    if isinstance(remaining_lease, str):
        years_res = re.search(years_regex, remaining_lease)

        # Rows where this is expressed as "x years y months"
        if years_res is not None:
            return int(years_res.group(1))

        # Rows where the year count is stored as a plain numeric string
        return int(remaining_lease)

    # Any remaining numeric value (float or integer, including numpy scalars)
    return int(remaining_lease)

In [None]:
# Recompute remaining_lease row by row, since the helper needs several columns of each record
resale_flat_prices_df['remaining_lease'] = resale_flat_prices_df.apply(
    get_remaining_lease,
    axis='columns',
)

## Save cleaned data frame

In [None]:
file_path = os.path.join('Cleaned Data', 'resale flat prices.csv')
# to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# index=False keeps the DataFrame's row index out of the saved file
resale_flat_prices_df.to_csv(file_path, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8affb79e-2cc6-4186-86e6-a88dc9ffb80e' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>