# Introduction
In this notebook, we will do the following:
1. one-hot encode flat_type
2. one-hot encode town
3. one-hot encode flat_model
4. combine the DataFrames containing the one-hot encoded categorical values with the original DataFrame
5. export the large DataFrame as CSV

In [1]:
import pandas as pd

In [2]:
# Load your CSV
raw_df = pd.read_csv('part2_result.csv')
raw_df.head(3)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,year,real_month,remaining_lease
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,1990,1,86
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,1990,1,86
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,1990,1,86


In [3]:
# Declare a variable and store the dummified/one-hot encoded values from 'flat_type'
flat_type_df = pd.get_dummies(raw_df['flat_type'], prefix='Dummy',drop_first=True) #Drop 1 category to avoid dummy variable trap
flat_type_df.sample(5)

Unnamed: 0,Dummy_2 ROOM,Dummy_3 ROOM,Dummy_4 ROOM,Dummy_5 ROOM,Dummy_EXECUTIVE,Dummy_MULTI GENERATION
496048,0,0,0,1,0,0
15469,0,1,0,0,0,0
729231,0,0,0,1,0,0
733689,0,0,0,1,0,0
520506,0,1,0,0,0,0


In [5]:
# Declare a variable and store the dummified/one-hot encoded values from 'town'
town_df = pd.get_dummies(raw_df['town'], prefix='Dummy',drop_first=True)
town_df.sample(3)

Unnamed: 0,Dummy_BEDOK,Dummy_BISHAN,Dummy_BUKIT BATOK,Dummy_BUKIT MERAH,Dummy_BUKIT PANJANG,Dummy_BUKIT TIMAH,Dummy_CENTRAL AREA,Dummy_CHOA CHU KANG,Dummy_CLEMENTI,Dummy_GEYLANG,...,Dummy_PASIR RIS,Dummy_PUNGGOL,Dummy_QUEENSTOWN,Dummy_SEMBAWANG,Dummy_SENGKANG,Dummy_SERANGOON,Dummy_TAMPINES,Dummy_TOA PAYOH,Dummy_WOODLANDS,Dummy_YISHUN
691946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
533751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
246119,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
town_df.shape

(829008, 26)

In [7]:
# Declare a variable and store the dummified/one-hot encoded values from 'flat_model'
flat_model_df = pd.get_dummies(raw_df['flat_model'], prefix='Dummy', drop_first=True)
flat_model_df.sample(3)

Unnamed: 0,Dummy_ADJOINED FLAT,Dummy_APARTMENT,Dummy_DBSS,Dummy_IMPROVED,Dummy_IMPROVED-MAISONETTE,Dummy_MAISONETTE,Dummy_MODEL A,Dummy_MODEL A-MAISONETTE,Dummy_MODEL A2,Dummy_MULTI GENERATION,Dummy_NEW GENERATION,Dummy_PREMIUM APARTMENT,Dummy_PREMIUM APARTMENT LOFT,Dummy_PREMIUM MAISONETTE,Dummy_SIMPLIFIED,Dummy_STANDARD,Dummy_TERRACE,Dummy_TYPE S1,Dummy_TYPE S2
137406,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41688,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
165108,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [17]:
# Concatenate all of the DataFrames together
df_list = [raw_df,flat_type_df,town_df,flat_model_df]
concat_df = pd.concat(df_list,axis=1,sort=False)
concat_df.sample(3)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,Dummy_MULTI GENERATION,Dummy_NEW GENERATION,Dummy_PREMIUM APARTMENT,Dummy_PREMIUM APARTMENT LOFT,Dummy_PREMIUM MAISONETTE,Dummy_SIMPLIFIED,Dummy_STANDARD,Dummy_TERRACE,Dummy_TYPE S1,Dummy_TYPE S2
587659,2009-10,BEDOK,3 ROOM,44,CHAI CHEE ST,07 TO 09,68.0,NEW GENERATION,1980,265000.0,...,0,1,0,0,0,0,0,0,0,0
167255,1997-08,KALLANG/WHAMPOA,4 ROOM,9,GLOUCESTER RD,07 TO 09,92.0,NEW GENERATION,1976,335000.0,...,0,1,0,0,0,0,0,0,0,0
317292,2000-11,HOUGANG,4 ROOM,617,HOUGANG AVE 8,01 TO 03,104.0,MODEL A,1986,257000.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
concat_df.shape

(829008, 64)

In [23]:
# Export the expanded DataFrame as CSV
concat_df.to_csv('part4_result.csv', index=False)