In [1]:
import sys
# Put the parent directory on the module search path. Presumably this is what lets the
# `Functions.Helpers` import in the next cell resolve -- confirm against the repository layout.
sys.path.append('../')

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from Functions.Helpers import classify_broker

# Lift the column display limit so dataframe previews show every column.
pd.set_option('display.max_columns', None)

In [3]:
# Load the raw housing data and translate the Norwegian column headers to English.

NORWEGIAN_TO_ENGLISH = {
    'Adresse': 'Address',
    'Boligtype': 'Housing type',
    'Prom': 'Primary room',
    'BRA': 'Usable area',
    'BTA': 'Gross area',
    'Byggeår': 'Year of construction',
    'Etasje': 'Floor',
    'Balkong': 'Balcony',
    'Parkering': 'Parking',
    'Tomtestørrelse': 'Plot size',
    'Sist solgt': 'Last sold',
    'Pris': 'Price',
    'Fellesgjeld ved siste salg': 'Joint debt at last sale',
    'Totalpris ved siste salg': 'Total price at last sale',
    'Antall rom': 'Number of rooms',
    'Antall soverom': 'Number of bedrooms',
    'Hjemmelshaver': 'Owner',
    'Registrert dato': 'Registered date',
    'Omsetningshastighet (dager)': 'Turnover rate',
    'Megler': 'Broker',
}

df = pd.read_excel('../Felles/TrondheimHousingData.xlsx').rename(columns=NORWEGIAN_TO_ENGLISH)

In [4]:
# The Owner column carries no information relevant to the analysis, so discard it.

df = df.drop(columns='Owner')

In [5]:
# Create separate columns for the postal code and the area name of every residence.
#
# Addresses end in "<4-digit postal code> <AREA NAME>", e.g. "Skogvegen 32, 7058 CHARLOTTENLUND".
# The pattern is anchored at the end of the string instead of relying on fixed positions after
# splitting on single spaces, so multi-word area names and stray or trailing whitespace do not
# shift the postal code out of place.
# An address that does not match yields NaN and makes the integer cast below fail loudly.

address_tail = df['Address'].str.extract(r'\b(\d{4})\s+(\D+?)\s*$')
df['Postal code'] = address_tail[0].astype(int)
df['Area name'] = address_tail[1]

In [6]:
# These columns hold whole numbers but are read as floats because of missing values.
# Replace the missing values with 0 and cast the columns to integers.

columns_to_modify = [
    'Number of rooms', 'Turnover rate', 'Number of bedrooms', 'Floor',
    'Year of construction', 'Usable area', 'Gross area', 'Primary room',
]
df = (
    df
    .fillna({column: 0 for column in columns_to_modify})
    .astype({column: int for column in columns_to_modify})
)

In [7]:
# Keep only the datapoints that satisfy all of the following criteria:
# -     The Last sold column holds a date rather than the "-" placeholder (a "-" there
#       corresponds to a missing date in the Registered date column).
# -     The Housing type is not "Annet" (other), because we cannot be sure what kind of
#       residence those are.
# -     The Year of construction is known (missing years were set to 0 above).

has_sale_date = df['Last sold'].ne('-')
has_known_housing_type = df['Housing type'].ne('Annet')
has_construction_year = df['Year of construction'].ne(0)

mask = has_sale_date & has_known_housing_type & has_construction_year
df = df.loc[mask].reset_index(drop=True)

In [8]:
# Parse both date columns into proper datetime values (the day comes first in the raw data).

for date_column in ('Registered date', 'Last sold'):
    df[date_column] = pd.to_datetime(df[date_column], dayfirst=True)

In [9]:
# Map every broker to its parent company, ignoring the individual branches.
# Companies that are not large enough are collected into an "Other" category. See Helpers.py.
# Missing brokers are given a placeholder name before classification.

broker_names = df['Broker'].fillna('Placeholder')
df['Broker'] = broker_names.apply(classify_broker)

In [10]:
# Encode the brokers and housing types as integer labels for further use.
#
# Each column gets its own encoder: fit_transform refits the encoder it is called on, so
# sharing one instance would overwrite the broker classes and make it impossible to map
# the broker codes back to names with inverse_transform.

broker_encoder = LabelEncoder()
housing_type_encoder = LabelEncoder()

df['Broker encoded'] = broker_encoder.fit_transform(df['Broker'])
df['Housing type encoded'] = housing_type_encoder.fit_transform(df['Housing type'])

# Keep the original name available; as before, it ends up fitted on the housing types.
encoder = housing_type_encoder

In [11]:
# Create dummy variables for Parking and Balcony Yes/No.
#
# Only the "Ja" (yes) indicator of each feature is kept; the "Nei" (no) indicator is redundant.
# Both "Nei" columns are dropped with errors='ignore' so the cell neither fails when a
# category is absent from the data nor leaves a stray Balcony_Nei column when it is present.

df = pd.get_dummies(df, columns=['Balcony', 'Parking'], dtype=int)
df = df.drop(columns=['Balcony_Nei', 'Parking_Nei'], errors='ignore')
df = df.rename(columns={'Balcony_Ja': 'Balcony_Yes', 'Parking_Ja': 'Parking_Yes'})

In [12]:
# Order the residences chronologically by sale date and renumber the index from 0.

df = df.sort_values('Last sold', ignore_index=True)

In [13]:
# Add coordinates to the dataset based on the postal code in each datapoint.
#
# A left merge keeps every residence even when its postal code has no coordinates.
# validate='many_to_one' makes pandas raise a MergeError if a postal code occurs more than
# once in the coordinates file, instead of silently duplicating the matching residences.

dfCoordinates = pd.read_csv('../Felles/PostalCodesWithCoordinates.csv')
df = df.merge(dfCoordinates, how='left', on='Postal code', validate='many_to_one')

In [14]:
# TEMPORARY (?)
# Treat missing values in Plot size and Joint debt at last sale as 0.

df = df.fillna({'Plot size': 0, 'Joint debt at last sale': 0})

In [15]:
# Display the cleaned dataframe as a final visual check before it is exported.
df

Unnamed: 0,Matrikkel / Org-Anr,Address,Housing type,Primary room,Usable area,Gross area,Year of construction,Floor,Plot size,Last sold,Price,Joint debt at last sale,Total price at last sale,Number of rooms,Number of bedrooms,Registered date,Turnover rate,Broker,Postal code,Area name,Broker encoded,Housing type encoded,Balcony_Yes,Parking_Yes,Lat,Lon
0,5001-20/88/0/0,"Skogvegen 32, 7058 CHARLOTTENLUND",Selveier enebolig,441,517,567,2007,0,970.8,2021-02-28,16900000,0.0,16900000,0,8,2021-02-22,6,Eiendomsmegler 1,7058,CHARLOTTENLUND,3,4,0,0,63.4239,10.4927
1,5001-404/418/0/69,"Tellefsens gate 8, 7030 TRONDHEIM",Selveierleilighet,135,144,0,2019,5,3673.0,2021-03-01,12500000,0.0,12500000,4,3,2021-01-28,32,Eiendomsmegler 1,7030,TRONDHEIM,3,7,1,0,63.4206,10.3973
2,5001-76/267/0/15,"Siriusvegen 10 A, 7037 TRONDHEIM",Selveierleilighet,74,74,81,1995,3,9300.8,2021-03-01,3450000,50690.0,3500690,3,2,2021-02-23,6,Eiendomsmegler 1,7037,TRONDHEIM,3,7,1,0,63.3941,10.4113
3,5001-52/224/0/128,"Heggdalsringen 65, 7049 TRONDHEIM",Selveierleilighet,69,73,0,2019,5,4572.6,2021-03-01,4950000,0.0,4950000,3,2,2021-03-01,0,Other,7049,TRONDHEIM,8,7,0,0,63.4044,10.4535
4,5001-410/683/0/34,"Dyre Halses gate 11, 7042 TRONDHEIM",Selveierleilighet,55,55,60,2004,7,1043.2,2021-03-01,4380000,0.0,4380000,2,2,2021-02-25,4,DNB Eiendom,7042,TRONDHEIM,1,7,1,0,63.4364,10.4134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20583,948656132/1052,"Stallmestervegen 4 B, 7046 TRONDHEIM",Borettslagsleilighet,60,60,66,1973,2,27938.2,2024-02-28,3100000,93250.0,3193250,2,1,2024-02-15,13,DNB Eiendom,7046,TRONDHEIM,1,3,0,0,63.4247,10.4474
20584,950373520/388,"Othilienborgvegen 51 C, 7033 TRONDHEIM",Borettslagsleilighet,72,81,0,1967,2,996.4,2024-02-28,3050000,584112.0,3634112,3,2,2024-02-20,8,Eiendomsmegler 1,7033,TRONDHEIM,3,3,0,0,63.4022,10.4305
20585,5001-316/723/0/9,"Industriveien 1 K, 7072 HEIMDAL",Selveierleilighet,47,52,0,2020,2,2017.8,2024-02-28,2975000,0.0,2975000,2,1,2024-02-16,12,Eiendomsmegler 1,7072,HEIMDAL,3,7,1,0,63.3517,10.3650
20586,5001-49/277/0/29,"Stokkbekken 134, 7048 TRONDHEIM",Selveierleilighet,71,87,96,2014,2,5289.2,2024-02-28,4090000,0.0,4090000,3,2,2024-02-17,11,Heimdal Eiendomsmegling,7048,TRONDHEIM,4,7,1,0,63.4122,10.4696


In [16]:
# Save the cleaned dataframe as a CSV file, leaving out the row index.

df.to_csv(path_or_buf='../Felles/TrondheimHousingDataCleaned.csv', index=False)