In [25]:
import pandas as pd
import numpy as np

# Data restoration and encoding
Housing data has been scraped from web and contains many null values.
Those are tried to retrive using logical relationships between exisisting values,
and plain reasoning.

In [26]:
'''
All row without price, date, and travel time information are definetally useless
'''
df = pd.read_csv('raw_housing_data.csv')
df = df[df['Paivamaara'].notna()]
df = df[df['Velaton_hinta'].notna()]
df = df[df['TravelTime'].notna()]
df = df.reset_index(drop=True)

'''
We are only interested about normal ownership houses
'''
df = df[df['Asumistyyppi'] == 'Omistus']

'''
Show fill rate of values
'''
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Velaton_hinta,0.0
6,Asuinpinta_ala,0.0
7,TravelTime,0.0
8,Kaupunki,0.0
9,Asumistyyppi,0.0


In [27]:
'''
Condominium payment should be as sum of all other ownership related payments
by definition, thus missin values may be adjusted by summing all other found fees together
'''

df['Yhtiovastike'].fillna(df['Hoitovastike'].fillna(0) + 
                          df['Rahoitusvastike'].fillna(0) + 
                          df['Tontin_vuokravastike'].fillna(0), inplace = True)

df.loc[df['Yhtiovastike'] < 10, 'Yhtiovastike'] = 0

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Yhtiovastike,0.0
6,Velaton_hinta,0.0
7,Asuinpinta_ala,0.0
8,TravelTime,0.0
9,Kaupunki,0.0


In [28]:
'''
Similar idea for running costs
'''

df['Hoitovastike'].fillna(df['Yhtiovastike'].fillna(0) - 
                          df['Rahoitusvastike'].fillna(0) - 
                          df['Tontin_vuokravastike'].fillna(0), inplace = True)

df.loc[df['Hoitovastike'] < 10, 'Hoitovastike'] = 0

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Hoitovastike,0.0
6,Yhtiovastike,0.0
7,Velaton_hinta,0.0
8,Asuinpinta_ala,0.0
9,TravelTime,0.0


In [29]:
'''
And for financial costs
'''

df['Rahoitusvastike'].fillna(df['Yhtiovastike'].fillna(0) - 
                          df['Hoitovastike'].fillna(0) - 
                          df['Tontin_vuokravastike'].fillna(0), inplace = True)

df.loc[df['Rahoitusvastike'] < 10, 'Rahoitusvastike'] = 0

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Rahoitusvastike,0.0
6,Hoitovastike,0.0
7,Yhtiovastike,0.0
8,Velaton_hinta,0.0
9,Asuinpinta_ala,0.0


In [30]:
'''
And for land rent fee
'''

df['Tontin_vuokravastike'].fillna(df['Yhtiovastike'].fillna(0) - 
                          df['Hoitovastike'].fillna(0) - 
                          df['Rahoitusvastike'].fillna(0), inplace = True)

def transformer(row):
    if row['Tontin_omistus'] != np.nan:
        if row['Tontin_omistus'] == 'Oma':
            return 0.0
        else:
            return row['Tontin_vuokravastike']
    else:
       return row['Tontin_vuokravastike']
    
df['Tontin_vuokravastike'] = df.apply(lambda x: transformer(x), axis=1)

df.loc[df['Tontin_vuokravastike'] < 10, 'Tontin_vuokravastike'] = 0


percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Tontin_vuokravastike,0.0
6,Rahoitusvastike,0.0
7,Hoitovastike,0.0
8,Yhtiovastike,0.0
9,Velaton_hinta,0.0


In [31]:
'''
If apartemt has a land rent fee of 0€, then it's located on its own land
Or if it's possible to byu the land, then its on rent land

Encoding:
3 'Oma' - own land
2 'Valinnainen vuokratontti' - possibility to buy your part of the land
1 'Vuokralla' - rental land
'''
print(f'Unique values of landownership: {df.Tontin_omistus.unique()}')

df.loc[df['Tontin_omistus'] == 'Oma', 'Tontin_omistus'] = 3
df.loc[df['Tontin_omistus'] == 'Valinnainen vuokratontti', 'Tontin_omistus'] = 2
df.loc[df['Tontin_omistus'] == 'Vuokralla', 'Tontin_omistus'] = 1

def transformer(row):
    if row['Tontin_omistus'] == np.nan:
        if row['Tontin_vuokravastike'] == 0:
            return 3
        elif row['Vuokratontin_lunastusosuus'] != 0:
            return 2
        else:
            return np.nan
    else:
       return row['Tontin_omistus']
    
df['Tontin_omistus'] = df.apply(lambda x: transformer(x), axis=1)


'''
If land ownership is not 2 'Valinnainen vuokratontti', then land buyng price is 0
'''
def transformer(row):
    if row['Tontin_omistus'] == 2:
        return row['Vuokratontin_lunastusosuus']
    else:
       return 0
    
df['Vuokratontin_lunastusosuus'] = df.apply(lambda x: transformer(x), axis=1)

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

print(f'Unique values of landownership: {df.Tontin_omistus.unique()}')

Unique values of landownership: ['Oma' 'Vuokralla' nan 'Valinnainen vuokratontti']


Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Tontin_vuokravastike,0.0
6,Rahoitusvastike,0.0
7,Hoitovastike,0.0
8,Yhtiovastike,0.0
9,Velaton_hinta,0.0


Unique values of landownership: [ 3.  1. nan  2.]


In [32]:
'''
If the building year is larger than the date,
then its a new building.
Also if the condition says its new, then building has just finished.

Encoding:
1 'Kylla' - its a brand new building
0 'Ei' - its NOT a brand new building

1 'Välttävä' - in bad shape
2 'Tyydyttävä' - beareble shape
3 'Hyvä' - ok
4 'Erinomainen' - excellent
5 'Uusi' - new
'''
print(f'Unique values of new building flag: {df.Uudiskohde.unique()}')
print(f'Unique values of condition: {df.Kunto.unique()}')

df.loc[df['Uudiskohde'] == 'Kyllä', 'Uudiskohde'] = 1
df.loc[df['Uudiskohde'] == 'Ei', 'Uudiskohde'] = 0

def transformer(row):
    if row['Rakennusvuosi'] > float(row['Paivamaara'][:4]):
        return 1
    elif row['Kunto'] == 'Uusi':
        return 1
    else:
       return row['Uudiskohde']
    
df['Uudiskohde'] = df.apply(lambda x: transformer(x), axis=1)


df.loc[df['Kunto'] == 'Välttävä', 'Kunto'] = 1
df.loc[df['Kunto'] == 'Tyydyttävä', 'Kunto'] = 2
df.loc[df['Kunto'] == 'Hyvä', 'Kunto'] = 3
df.loc[df['Kunto'] == 'Erinomainen', 'Kunto'] = 4
df.loc[df['Kunto'] == 'Uusi', 'Kunto'] = 5


percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

print(f'Unique values of new building flag: {df.Uudiskohde.unique()}')
print(f'Unique values of condition: {df.Kunto.unique()}')

Unique values of new building flag: ['Ei' 'Kyllä' nan]
Unique values of condition: ['Hyvä' 'Tyydyttävä' 'Uusi' 'Erinomainen' nan 'Välttävä']


Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Huoneita,0.0
4,Coordinates,0.0
5,Tontin_vuokravastike,0.0
6,Rahoitusvastike,0.0
7,Hoitovastike,0.0
8,Yhtiovastike,0.0
9,Velaton_hinta,0.0


Unique values of new building flag: [ 0.  1. nan]
Unique values of condition: [3 2 5 4 nan 1]


In [33]:
'''
Balcony, sauna and beach are goof additions,
and I assume that if those would be included. it woudl have been stated.
Thus Nans and nopes are both 0.

Encoding:
1 'Kylla' - incuded
0 'Ei' - not included
'''

print(f'Unique values of sauna flag: {df.Sauna.unique()}')
print(f'Unique values of balcony: {df.Parveke.unique()}')
print(f'Unique values of beach: {df.Ranta.unique()}')


df.loc[df['Parveke'] == 'Kyllä', 'Parveke'] = 1
df.loc[df['Parveke'] != 1, 'Parveke'] = 0

df.loc[df['Sauna'] == 'Kyllä', 'Sauna'] = 1
df.loc[df['Sauna'] != 1, 'Sauna'] = 0

df['Ranta'].fillna(0, inplace=True)
df.loc[df['Ranta'] == 'Ei rantaa', 'Ranta'] = 0
df.loc[df['Ranta'] != 0, 'Ranta'] = 1

df.loc[df['Kerros'] != df['Korkein_kerros'], 'Korkein_kerros'] = 0
df.loc[df['Korkein_kerros'] != 0, 'Korkein_kerros'] = 1

df.loc[df['Rakennusvuosi'] < 1700, 'Rakennusvuosi'] = np.nan


percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

print(f'Unique values of sauna flag: {df.Sauna.unique()}')
print(f'Unique values of balcony: {df.Parveke.unique()}')
print(f'Unique values of beach: {df.Ranta.unique()}')

Unique values of sauna flag: ['Kyllä' nan 'Ei']
Unique values of balcony: ['Kyllä' nan 'Ei']
Unique values of beach: [nan 'Ei rantaa' 'Muu ranta' 'Rantaoikeus']


Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Ranta,0.0
4,Sauna,0.0
5,Parveke,0.0
6,Korkein_kerros,0.0
7,Huoneita,0.0
8,Coordinates,0.0
9,Tontin_vuokravastike,0.0


Unique values of sauna flag: [1 0]
Unique values of balcony: [1 0]
Unique values of beach: [0 1]


In [34]:
'''
It's hard to process string information of energy level,
but it seems that if there is only 6 characters,
its in form of A2022 or A/2022.
This should give at least some information.

Encoding:
1 'A' - the best
2 'B' - second best
...
...
..
'''

print(f'Unique values of energy class: {df.Energialuokka.unique()}')


def transformer(row):
    if row['Energialuokka'] is not np.nan and len(row['Energialuokka']) <= 6:
        return row['Energialuokka'][0]
    else:
       return np.nan
    
df['Energialuokka'] = df.apply(lambda x: transformer(x), axis=1)

df['Energialuokka'] = df['Energialuokka'].apply(lambda x: ord(x.lower()) - 96 if x is not np.nan else np.nan)

df.loc[df['Energialuokka'] < 0, 'Energialuokka'] = np.nan


percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

print(f'Unique values of energy class: {df.Energialuokka.unique()}')

Unique values of energy class: ['Energialuokka: E2007' 'E2018' 'G2013'
 'Energialuokka: F2013, Energiatodistuksen voimassaoloaika: 21.02.2026'
 'D2007' 'A2018' 'E2013' 'F2018' 'C2013' 'B2018'
 'Energialuokka: G2013, Energiatodistuksen voimassaoloaika: 14.01.2025'
 'F/2013' 'F2013' 'E2007' 'G2013 Viimeinen voimassaolopäivä 4.11.2024.'
 'F/2007' 'C2007' 'E2007 Viimeinen voimassaolo 30.1.2023.' 'G/2007'
 'F2007'
 'Energialuokka: D2018, Energiatodistuksen voimassaoloaika: 28.12.2032'
 'E/2013' 'G2013 Voimassa 4.9.2024' 'D2007 Voimassa 19.5.2023 asti' 'A'
 'Energialuokka: E2013, Energiatodistuksen voimassaoloaika: 22.03.2025'
 'D2018' 'C2018'
 'B2018 Todistuksen laatimispäivä 28.5.2020.\nViimeinen voimassaolopäivä 28.5.2030.'
 'D2013' 'C/2013'
 'C2007 Taloyhtiön energiatodistuksen voimassaoloaika päättyy 24.11.2022'
 'F2013 Todistus laadittu 22.12.2015.' 'A/2018' 'G/2013' 'C, 2013' 'G2007'
 'D2018 Viimeinen voimassaolopäivä 23.08.2028' 'D/2007' 'G2018' 'B'
 'Energialuokka: A2018' 'E2007 Voi

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Ranta,0.0
4,Sauna,0.0
5,Parveke,0.0
6,Korkein_kerros,0.0
7,Huoneita,0.0
8,Coordinates,0.0
9,Tontin_vuokravastike,0.0


Unique values of energy class: [nan  5.  7.  4.  1.  6.  3.  2.]


In [35]:
'''
If renovation info contains LVI (very expensive bluming and electricity renovation)
flag it.

Encoding:
1 contains LVI
0 does not contain LVI

'''

def transformer(row):
    if row['Tulevat_remontit'] is not np.nan:
        if 'LVI' in row['Tulevat_remontit'] or 'lvi' in row['Tulevat_remontit']:
            return 1
        else:
            return 0
    else:
       return 0
    
df['Tulevat_remontit'] = df.apply(lambda x: transformer(x), axis=1)


def transformer(row):
    if row['Tehdyt_remontit'] is not np.nan:
        if 'LVI' in row['Tehdyt_remontit'] or 'lvi' in row['Tehdyt_remontit']:
            return 1
        else:
            return 0
    else:
       return 0
    
df['Tehdyt_remontit'] = df.apply(lambda x: transformer(x), axis=1)


percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

print(f'Unique values of upcoming renovations: {df.Tulevat_remontit.unique()}')
print(f'Unique values of past renovations: {df.Tehdyt_remontit.unique()}')

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Tehdyt_remontit,0.0
4,Tulevat_remontit,0.0
5,Ranta,0.0
6,Sauna,0.0
7,Parveke,0.0
8,Korkein_kerros,0.0
9,Huoneita,0.0


Unique values of upcoming renovations: [1 0]
Unique values of past renovations: [1 0]


In [36]:
'''
Encoding:
1 Vantaa
2 Espoo
3 Helsinki

'''

print(f'Unique values of city: {df.Kaupunki.unique()}')

df.loc[df['Kaupunki'] == 'Vantaa', 'Kaupunki'] = 1
df.loc[df['Kaupunki'] == 'Espoo', 'Kaupunki'] = 2
df.loc[df['Kaupunki'] == 'ESPOO', 'Kaupunki'] = 2
df.loc[df['Kaupunki'] == 'Helsinki', 'Kaupunki'] = 3
df.loc[df['Kaupunki'] == 'HELSINKI', 'Kaupunki'] = 3

print(f'Unique values of city: {df.Kaupunki.unique()}')

Unique values of city: ['Espoo' 'Helsinki' 'Vantaa' 'ESPOO']
Unique values of city: [2 3 1]


In [37]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

'''
Encoding:
Districts are dummified randomly
'''

print(f'Unique values of district: {df.Kaupunginosa.unique()}')

df['Kaupunginosa'] = label_encoder.fit_transform(df['Kaupunginosa'])

print(f'Unique values of district: {df.Kaupunginosa.unique()}')

Unique values of district: ['Leppävaara' 'Herttoniemi' 'Ullanlinna' 'Sörnäinen' 'Tapiola' 'Kallio'
 'Pakila' 'Tikkurila' 'Vallila' 'Pitäjänmäki' 'Myyrmäki' 'Pasila'
 'Hakunila' 'Espoon keskus' 'Vermonniitty' 'Kera' 'Kilo' 'Viherlaakso'
 'Kannelmäki' 'Tali' 'Munkkivuori' 'Pohjois-Haaga' 'Alppila' 'Järvenperä'
 'Vapaala' 'Saunalahti' 'Arabianranta' 'Kamppi' 'Hermanni'
 'Viikinranta/Vanhankaupunginkoski' 'Myllypuro' 'Pakkala' 'Kuitinmäki'
 'Haukilahti' 'Kartanonkoski' 'Herttoniemenranta' 'Taka-Töölö' 'Puotila'
 'Etelä-Haaga' 'Viikki' 'Tapaninkylä' 'Lauttasaari' 'Olari' 'Pisa'
 'Viikinmäki' 'Etu-Töölö' 'Pohjois-Tapiola' 'Ylästö' 'Niittykumpu' 'Malmi'
 'Harju' 'Aurinkolahti' 'Maunula' 'Matinkylä' 'Laajasalo' 'Matinkyä'
 'Simonmetsä' 'Roihuvuori' 'Länsi-Pasila' 'Punavuori' 'Pukinmäki'
 'Kumpula' 'Soukka' 'Kaivoksela' 'Kruunuvuorenranta' 'Meilahti' 'Yliskylä'
 'Tillinmäki' 'Tontunmäki' 'Pajamäki' 'Vuosaari'
 'Puolarmetsä Friisilä Olari' 'Tapaninvainio' 'Munkkisaari' 'Haaga'
 'Tapanila' 'Leine

In [38]:
display(df.head())

display(df.info())

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percentage_missing': percent_missing})
missing_value_df.sort_values('percentage_missing', inplace=True)
missing_value_df = missing_value_df.reset_index(drop=True)

display(missing_value_df)

Unnamed: 0,Paivamaara,Kohdenumero,Kaupunki,Kaupunginosa,Osoite,Kunto,Asumistyyppi,Uudiskohde,Tontin_omistus,Vuokratontin_lunastusosuus,...,Ranta,Rakennusvuosi,Energialuokka,Huoneistojen_lukumaara,Tulevat_remontit,Tehdyt_remontit,Latitude,Longitude,Coordinates,TravelTime
0,2023-03-02,70063685.0,2,112,"Kivenhakkaajankuja 3 A, 02650 Espoo",3,Omistus,0.0,3.0,0.0,...,0,1995.0,,63.0,1,1,60.222933,24.809138,"60.222933, 24.809138",25.0
1,2023-03-02,20989644.0,3,34,"Hiihtomäentie 36, 00800 Helsinki",3,Omistus,0.0,1.0,0.0,...,0,1956.0,5.0,26.0,0,0,60.203741,25.032671,"60.203741, 25.032671",24.0
2,2023-03-02,21125550.0,3,251,"Laivurinkatu 7 A, 00150 Helsinki",3,Omistus,0.0,3.0,0.0,...,0,1906.0,7.0,22.0,1,1,60.155891,24.942462,"60.155891, 24.942462",15.0
3,2023-03-02,67815.0,3,226,"Suvilahdenkatu 1 A, 00500 Helsinki",2,Omistus,0.0,3.0,0.0,...,0,1961.0,,,0,0,60.186996,24.966805,"60.186996, 24.966805",16.0
4,2023-03-02,70063245.0,2,236,"Jousenkaari 9 B, 02120 Espoo",3,Omistus,0.0,3.0,0.0,...,0,1959.0,,84.0,1,1,60.175354,24.788641,"60.175354, 24.788641",29.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3133 entries, 0 to 3238
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Paivamaara                  3133 non-null   object 
 1   Kohdenumero                 2703 non-null   float64
 2   Kaupunki                    3133 non-null   object 
 3   Kaupunginosa                3133 non-null   int64  
 4   Osoite                      3133 non-null   object 
 5   Kunto                       3097 non-null   object 
 6   Asumistyyppi                3133 non-null   object 
 7   Uudiskohde                  3013 non-null   float64
 8   Tontin_omistus              3052 non-null   float64
 9   Vuokratontin_lunastusosuus  3084 non-null   float64
 10  Velaton_hinta               3133 non-null   float64
 11  Yhtiovastike                3133 non-null   float64
 12  Hoitovastike                3133 non-null   float64
 13  Rahoitusvastike             3133 

None

Unnamed: 0,column_name,percentage_missing
0,Paivamaara,0.0
1,Longitude,0.0
2,Latitude,0.0
3,Tehdyt_remontit,0.0
4,Tulevat_remontit,0.0
5,Ranta,0.0
6,Sauna,0.0
7,Parveke,0.0
8,Korkein_kerros,0.0
9,Huoneita,0.0


In [39]:
final_df = pd.DataFrame()

final_df['date']                      = df['Paivamaara']
final_df['id']                        = df['Kohdenumero']
final_df['city']                      = df['Kaupunki']
final_df['district']                  = df['Kaupunginosa']
final_df['address']                   = df['Osoite']
final_df['coordinates']               = df['Coordinates']
final_df['travel_time']               = df['TravelTime']
final_df['condition']                 = df['Kunto']
final_df['new_building']              = df['Uudiskohde']
final_df['price']                     = df['Velaton_hinta']
final_df['condominium_fee']           = df['Yhtiovastike']
final_df['condominium_operation_fee'] = df['Hoitovastike']
final_df['condominium_financial_fee'] = df['Rahoitusvastike']
final_df['condominium_land_rent_fee'] = df['Tontin_vuokravastike']
final_df['area']                      = df['Asuinpinta_ala']
final_df['rooms']                     = df['Huoneita']
final_df['floor']                     = df['Kerros']
final_df['top_floor']                 = df['Korkein_kerros']
final_df['balcony']                   = df['Parveke']
final_df['sauna']                     = df['Sauna']
final_df['beach']                     = df['Ranta']
final_df['building_year']             = df['Rakennusvuosi']
final_df['apartments_in_building']    = df['Huoneistojen_lukumaara']
final_df['lvi_upcoming']              = df['Tulevat_remontit']
final_df['lvi_past']                  = df['Tehdyt_remontit']




final_df.to_csv('processed_housing_data.csv', index=False)