# Data Cleaning 
Purpose: to get the actual resale prices by removing inflation rates using Resale Price Index

In [1]:
# Import python libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Reading csv file and getting the first 5 observations
df = pd.read_csv('resale_flat_prices.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,1,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,3,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,4,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,5,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0


In [3]:
# Define functions to be used in this notebook

# Function to extract the year section and convert to INT
def getyear(text):
    return int(text[0:4])

# Function to extract the quarter section
def getquarter(text):
    return text[5:7]

# Function to calculate HDB age range
def calculate_age_range(age):
    if age in range(0,10):
        return "0-9"
    elif age in range(10,20):
        return "10-19"
    elif age in range(20,30):
        return "20-29"
    elif age in range(30,40):
        return "30-39"
    else:
        return "40 and above"
    
# Function to calculate HDB year range
def calculate_year_range(year):
    if year in range(1990,1994):
        return "1990-1994"
    elif year in range(1995,1999):
        return "1995-1999"
    elif year in range(2000,2004):
        return "2000-2004"
    elif year in range(2005,2009):
        return "2005-2009"
    elif year in range(2010,2014):
        return "2010-2014"
    else:
        return "2015-2019"

In [4]:
# Extract the year value from month column
df['year'] = df['month'].apply(getyear)

# Calculate the year range of flat during the transaction year
df['year_range'] = df['year'].apply(calculate_year_range)

# Calculate the age value of flat during the transaction year
df['age'] = df['year'] - df['lease_commence_date']

# Calculate the age range of flat during the transaction year
df['age_range'] = df['age'].apply(calculate_age_range)

df.head()

Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,year_range,age,age_range
0,1,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0,2017,2015-2019,38,30-39
1,2,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0,2017,2015-2019,39,30-39
2,3,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0,2017,2015-2019,37,30-39
3,4,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0,2017,2015-2019,37,30-39
4,5,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0,2017,2015-2019,37,30-39


In [5]:
# Reading Resale Price Index file
rpi = pd.read_csv("housing-and-development-board-resale-price-index-1q2009-100-quarterly.csv")
rpi.head()

Unnamed: 0,quarter,index
0,1990-Q1,24.3
1,1990-Q2,24.4
2,1990-Q3,25.0
3,1990-Q4,24.7
4,1991-Q1,24.9


In [6]:
# Understanding the data types of each columns
rpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 2 columns):
quarter    117 non-null object
index      117 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.0+ KB


'quarter' column has an object data type and 'index' column has a float data type

In [7]:
# Function to extract the year section and convert to INT
def getyear(text):
    return int(text[0:4])

# Extract the year value from month column
rpi['year'] = rpi['quarter'].apply(getyear)

# Replace 'quarter' column with 'year'
rpi = rpi[['year','index']]
rpi.head()

Unnamed: 0,year,index
0,1990,24.3
1,1990,24.4
2,1990,25.0
3,1990,24.7
4,1991,24.9


In [8]:
year = df.sort_values('year')['year'].unique()
index = rpi.sort_values('year').groupby(['year'])['index'].mean().values

# Create new dataframe to merge columns from both rpi and df datasets
new_df = pd.DataFrame()
new_df.loc[:,'year'] = year
new_df.loc[:,'index'] = index
new_df.head()

Unnamed: 0,year,index
0,1990,24.6
1,1991,25.175
2,1992,27.45
3,1993,41.625
4,1994,52.875


In [10]:
# Formula to get actual resale price: new_df[real_price] = new_df['resale_price'] * (100 / new_df['index'])
new_df['RPI'] = 100 / new_df['index']
new_df = new_df[['year','RPI']]
new_df.head()

Unnamed: 0,year,RPI
0,1990,4.065041
1,1991,3.972195
2,1992,3.642987
3,1993,2.402402
4,1994,1.891253


In [17]:
# Merging resale flat prices and resale price index datasets
merged_df = pd.merge(df, new_df, how='left', on='year')
merged_df.tail()

Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,year_range,age,age_range,RPI
1225233,1184931,1999-12,YISHUN,EXECUTIVE,611,YISHUN ST 61,10 TO 12,142.0,APARTMENT,1987,,456000.0,1999,2015-2019,12,10-19,1.32758
1225234,1184932,1999-12,YISHUN,EXECUTIVE,324,YISHUN CTRL,01 TO 03,142.0,APARTMENT,1988,,408000.0,1999,2015-2019,11,10-19,1.32758
1225235,1184933,1999-12,YISHUN,EXECUTIVE,392,YISHUN AVE 6,07 TO 09,146.0,MAISONETTE,1988,,469000.0,1999,2015-2019,11,10-19,1.32758
1225236,1184934,1999-12,YISHUN,EXECUTIVE,356,YISHUN RING RD,04 TO 06,146.0,MAISONETTE,1988,,440000.0,1999,2015-2019,11,10-19,1.32758
1225237,1184935,1999-12,YISHUN,EXECUTIVE,358,YISHUN RING RD,01 TO 03,145.0,MAISONETTE,1988,,484000.0,1999,2015-2019,11,10-19,1.32758


In [21]:
# Creating a new 'real_price' column for more accurate price of the resale flat prices
merged_df['real_price'] = merged_df['resale_price'] * merged_df['RPI']

In [22]:
merged_df

Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,year_range,age,age_range,RPI,real_price
0,1,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0,2017,2015-2019,38,30-39,0.750469,174108.818011
1,2,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0,2017,2015-2019,39,30-39,0.750469,187617.260788
2,3,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0,2017,2015-2019,37,30-39,0.750469,196622.889306
3,4,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0,2017,2015-2019,37,30-39,0.750469,198874.296435
4,5,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0,2017,2015-2019,37,30-39,0.750469,198874.296435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225233,1184931,1999-12,YISHUN,EXECUTIVE,611,YISHUN ST 61,10 TO 12,142.0,APARTMENT,1987,,456000.0,1999,2015-2019,12,10-19,1.327580,605376.700962
1225234,1184932,1999-12,YISHUN,EXECUTIVE,324,YISHUN CTRL,01 TO 03,142.0,APARTMENT,1988,,408000.0,1999,2015-2019,11,10-19,1.327580,541652.837703
1225235,1184933,1999-12,YISHUN,EXECUTIVE,392,YISHUN AVE 6,07 TO 09,146.0,MAISONETTE,1988,,469000.0,1999,2015-2019,11,10-19,1.327580,622635.247262
1225236,1184934,1999-12,YISHUN,EXECUTIVE,356,YISHUN RING RD,04 TO 06,146.0,MAISONETTE,1988,,440000.0,1999,2015-2019,11,10-19,1.327580,584135.413209


In [23]:
merged_df.to_csv('edited_resale_flat_prices.csv', encoding='utf-8', index=False)