## Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
import warnings
from IPython.display import display
from sklearn.model_selection import train_test_split 
sb.set() 
warnings.filterwarnings('ignore')



## Data Import

In [2]:
data = pd.read_csv("Housing.csv")
data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,3/1/2012,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45,Improved,1986,250000.0
1,3/1/2012,ANG MO KIO,2 ROOM,510,ANG MO KIO AVE 8,01 TO 05,44,Improved,1980,265000.0
2,3/1/2012,ANG MO KIO,3 ROOM,610,ANG MO KIO AVE 4,06 TO 10,68,New Generation,1980,315000.0
3,3/1/2012,ANG MO KIO,3 ROOM,474,ANG MO KIO AVE 10,01 TO 05,67,New Generation,1984,320000.0
4,3/1/2012,ANG MO KIO,3 ROOM,604,ANG MO KIO AVE 5,06 TO 10,67,New Generation,1980,321000.0


## Data cleaning

In [12]:
# Removing empty spaces from the columns
data.columns = data.columns.str.strip()

# Removing leading and trailing whitespaces
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Removing duplicate rows
data.drop_duplicates(inplace=True)

# Data types of columns
print("\nData types of columns:")
print(data.dtypes)

# Missing values
print("\nMissing values:")
print(data.isnull().sum())

# Dropping the rows with resale_price and floor_area_sqm
data = data.dropna(subset=['floor_area_sqm'])
data = data.dropna(subset=['resale_price'])


Data types of columns:
month                  datetime64[ns]
town                           object
flat_type                      object
block                          object
street_name                    object
storey_range                   object
floor_area_sqm                  int64
flat_model                     object
lease_commence_date             int64
resale_price                  float64
year                            int64
dtype: object

Missing values:
month                  0
town                   0
flat_type              0
block                  0
street_name            0
storey_range           0
floor_area_sqm         0
flat_model             0
lease_commence_date    0
resale_price           0
year                   0
dtype: int64


In [4]:
# Convert the 'month' column to datetime format
data['month'] = pd.to_datetime(data['month'], format='%d/%m/%Y')

# Extract the year from the 'month' column and create a new column named 'year'
data['year'] = data['month'].dt.year

# Save the DataFrame back to a CSV file
data.to_csv('cleaned-Housing.csv', index=False)  # Set index=False to avoid saving the DataFrame index as a column

In [5]:
# Define the lower and upper quantile thresholds
lower_quartile = data['resale_price'].quantile(0.25)
upper_quartile = data['resale_price'].quantile(0.75)
interquartile_range = upper_quartile - lower_quartile
lower_threshold = lower_quartile - 1.5*interquartile_range
upper_threshold = upper_quartile + 1.5*interquartile_range

# Filter out the outliers based on the resale price column
data = data[(data['resale_price'] >= lower_threshold) & (data['resale_price'] <= upper_threshold)]

In [6]:
pd.options.display.max_columns = None
display(data)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,year
0,2012-01-03,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45,Improved,1986,250000.0,2012
1,2012-01-03,ANG MO KIO,2 ROOM,510,ANG MO KIO AVE 8,01 TO 05,44,Improved,1980,265000.0,2012
2,2012-01-03,ANG MO KIO,3 ROOM,610,ANG MO KIO AVE 4,06 TO 10,68,New Generation,1980,315000.0,2012
3,2012-01-03,ANG MO KIO,3 ROOM,474,ANG MO KIO AVE 10,01 TO 05,67,New Generation,1984,320000.0,2012
4,2012-01-03,ANG MO KIO,3 ROOM,604,ANG MO KIO AVE 5,06 TO 10,67,New Generation,1980,321000.0,2012
...,...,...,...,...,...,...,...,...,...,...,...
52198,2014-01-12,YISHUN,5 ROOM,816,YISHUN ST 81,10 TO 12,122,Improved,1988,580000.0,2014
52199,2014-01-12,YISHUN,EXECUTIVE,325,YISHUN CTRL,10 TO 12,146,Maisonette,1988,540000.0,2014
52200,2014-01-12,YISHUN,EXECUTIVE,618,YISHUN RING RD,07 TO 09,164,Apartment,1992,738000.0,2014
52201,2014-01-12,YISHUN,EXECUTIVE,277,YISHUN ST 22,07 TO 09,152,Maisonette,1985,592000.0,2014


## Train-test split

In [7]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
train_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,year
2680,2012-01-04,BUKIT BATOK,4 ROOM,236,BT BATOK EAST AVE 5,01 TO 05,98,New Generation,1985,420000.0,2012
32722,2013-01-10,BUKIT BATOK,4 ROOM,410,BT BATOK WEST AVE 4,10 TO 12,99,New Generation,1988,390000.0,2013
33631,2013-01-10,TOA PAYOH,3 ROOM,232,LOR 8 TOA PAYOH,04 TO 06,76,Improved,1976,370000.0,2013
50323,2014-01-11,PUNGGOL,5 ROOM,638B,PUNGGOL DR,07 TO 09,112,Premium Apartment,2005,470000.0,2014
51497,2014-01-12,JURONG WEST,3 ROOM,464,JURONG WEST ST 41,04 TO 06,73,Model A,1984,300000.0,2014


In [9]:
test_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,year
13560,2012-01-09,CHOA CHU KANG,4 ROOM,425,CHOA CHU KANG AVE 4,04 TO 06,104,Model A,1992,430000.0,2012
14833,2012-01-09,YISHUN,4 ROOM,829,YISHUN ST 81,07 TO 09,103,Model A,1988,500000.0,2012
27701,2013-01-06,JURONG WEST,3 ROOM,186,BOON LAY AVE,19 TO 21,68,Improved,1975,315000.0,2013
28140,2013-01-06,TAMPINES,4 ROOM,338,TAMPINES ST 33,01 TO 03,106,Model A,1995,445000.0,2013
50620,2014-01-11,TAMPINES,5 ROOM,864A,TAMPINES ST 83,07 TO 09,122,Improved,1988,515000.0,2014


## Export

In [10]:
data.to_csv("cleaned-Housing.csv", index=False)
train_data.to_csv("train-Housing.csv", index=False)
test_data.to_csv("test-Housing.csv", index=False)