In [2]:
import os
from os.path import join
import pandas as pd
from tqdm import tqdm
from geopy.geocoders import Nominatim
from statistics import mean

In [3]:
def get_locations(address, zip_code):
    try:
        # This removes information about a flats storey
        address_field = address.split(', ')[0]
        # This one removes trailing letters on the city name
        # It seems as if Openstreetmap cannot handle København H
        # but it works with København
        zip_field = ' '.join(zip_code.split(' ')[:-1])
        search_address = ', '.join([address_field, zip_field])

        geolocator = Nominatim()
        print('here')
        location = geolocator.geocode(search_address)
        print('after')
        return location.latitude, location.longitude
    except:
        print('Skipped geocoding of {} {}'.format(address, zip_code))
        return None, None

In [4]:
def remove_city_name(zip_code):
    return ' '.join(zip_code.split(' ')[:-2])

In [5]:
def create_data_frame():
    li = []
    for filename in os.listdir('./boliga_stats'):
        #filename = os.path.join(os.getcwd(),filename)
        df = pd.read_csv(os.path.join('./boliga_stats',filename), index_col=None, header=0)
        li.append(df)

    return pd.concat(li, axis=0, ignore_index=True)

In [6]:
def create_city_csv(dataframe, year):
    cities = {
              'Odense': '5000',
              'København': '1049',
              'Aarhus': '8000',
              'Aalborg': '9000'}
    folder_path = join(os.getcwd(),year)          
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        
    for city in cities:
        mask = (dataframe['zip_code_num'] == cities[city])
        dataf = dataframe[mask]
        dataf.to_csv('./' + year + '/' + city + ".csv", index=False,encoding='utf-8')

In [7]:
df = pd.read_csv('./test.csv', index_col=None, header=0)

Converting the sell_date to date time.

In [8]:
df['sell_date'] = pd.to_datetime(df['sell_date'],format='%d-%m-%Y')

In [10]:
# Create a  new column which only contain the zip code.
df['zip_code_num'] = df.apply(lambda x: remove_city_name(x['zip_code']), axis=1)

In [12]:
#Generate the 2 masks we need. 
mask_1992 = ((df['sell_date'].dt.year == 1992) & 
     ((df['zip_code_num'] == '1050') | (df['zip_code_num'] == '1049') | (df['zip_code_num'] == '5000') | 
     (df['zip_code_num'] == '8000') | (df['zip_code_num'] == '9000'))
     )

mask_2016 = ((df['sell_date'].dt.year == 2016) & 
     ((df['zip_code_num'] == '1050') | (df['zip_code_num'] == '1049') | (['zip_code_num'] == '5000') | 
     (['zip_code_num'] == '8000') | (['zip_code_num'] == '9000'))
     )

In [13]:
#Calculate the average square meter prices.
print('Average price per square meter for the year 1992 is {} m\u00b2'.format(mean(df[mask_1992]['price_per_sq_m'])),)
print('Average price per square meter for the year 2016 is {} m\u00b2'.format(mean(df[mask_2016]['price_per_sq_m'])),)

Average price per square meter for the year 1992 is 45287.5 m²
Average price per square meter for the year 2016 is 1851 m²


In [16]:
# Create two folder one for each year containing a csv for each of the zip codes specified in the assignment.
create_city_csv(df[mask_1992], '1992')
create_city_csv(df[mask_2016], '2016')