# Preprocess data - scrubbing raw data
This code takes the raw data, scrubs it, and returns it in a nice tidy format.

Script by Lars Kjær

In [13]:
##################### Import libraries ################################

import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import tqdm

# The function below preprocesses raw data.
# It is applied to each CSV file found in the input folder ('data').
# The function takes a file name, scrubs the data, and returns a tidy dataframe.
# The calling loop stores each result as a new CSV file in the output folder ('tidy_data').

# Pre-compiled patterns shared by the scrubbing helpers below.
_FOUR_DIGITS_RE = re.compile(r'\d{4}')  # used to locate the zip code
_SALE_DATE_RE = re.compile(r'\d{1,2}-\d{1,2}-\d{4}')

# Columns of the tidy dataframe, in output order.
_TIDY_COLUMNS = [
    'municipality',
    'housing_type',
    'address',
    'zip_code',
    'town',
    'purchase_amount',
    'date_of_sale',
    'type_of_sale',
    'square_meters',
    'square_meters_price',
    'rooms',
    'year_of_construction',
    'The percentage difference between the latest bid price and the sale price',
]


def _housing_type(text):
    """Return the first whitespace-separated token, for example 'Ejerlejlighed'."""
    return text.split()[0]


def _address(text):
    """Return the street address, for example 'Egebo 16'."""
    # The first two tokens are skipped (the housing type and, presumably, a
    # separator token - confirm against the raw data); the address is whatever
    # precedes the first four-digit number.
    remainder = ' '.join(text.split()[2:])
    return _FOUR_DIGITS_RE.split(remainder)[0].strip()


def _zip_code(text):
    """Return the first four-digit number in the text, for example '2620'."""
    return _FOUR_DIGITS_RE.search(text).group()


def _town(text):
    """Return the text following the zip code, for example 'Albertslund'."""
    # Stripped so the value does not keep the space that follows the zip code.
    return _FOUR_DIGITS_RE.split(text)[1].strip()


def _purchase_amount(text):
    """Return the purchase sum as a digit string, for example '1500000'."""
    return text.replace('.', '').replace('kr', '').strip()


def _sale_date(text):
    """Return the sale date exactly as written in the text, for example '21-12-2022'."""
    return _SALE_DATE_RE.search(text).group()


def _type_of_sale(text):
    """Return the type of sale, for example 'Alm. salg'."""
    # Dropping every digit and dash removes the date and leaves the sale type.
    return re.sub(r'\d', '', text).replace('-', '').strip()


def _square_meters(text):
    """Return the text before the first 'm²', for example '114'."""
    return text.split('m²')[0].strip()


def _square_meter_price(text):
    """Return the price per square meter as an int (for example 13158), or NaN."""
    # Only the digits between the first and second 'm²' are kept, which also
    # discards the thousands separators.
    digits = ''.join(re.findall(r'\d', text.split('m²')[1]))
    try:
        return int(digits)
    except ValueError:
        # No digits at all: the price is missing for this row.
        return np.nan


def data_scrub(file, input_dir=None):
    """Read one raw sales CSV file and return it as a tidy dataframe.

    Args:
        file: Name of the raw CSV file. The municipality code is taken from
            the fourth underscore-separated part of the name (without '.csv').
        input_dir: Folder containing ``file``. When omitted, the module-level
            ``input_fp`` is used, as before.

    Returns:
        A dataframe holding only the tidy columns: municipality, housing_type,
        address, zip_code, town, purchase_amount, date_of_sale, type_of_sale,
        square_meters, square_meters_price, rooms, year_of_construction and
        the percentage difference between the latest bid price and the sale
        price.

    Raises:
        KeyError: If an expected raw column is missing from the file.
        IndexError: If the file name has fewer than four underscore-separated
            parts, or a text field lacks the expected structure.
        AttributeError: If a row holds no zip code or no sale date.
    """
    ########################## Get raw data ###########################

    if input_dir is None:
        input_dir = input_fp  # folder configured by the calling script
    df = pd.read_csv(Path(input_dir) / file, low_memory=False)

    ############### Get municipality code from file name ################

    # The municipality code is 'hidden' in the file name: it is the fourth
    # underscore-separated item, minus the '.csv' extension.
    df['municipality'] = file.split('_')[3].replace('.csv', '')

    ##################### Scrub data ###################################

    type_and_address = df['Boligtype & Adresse']
    df['housing_type'] = type_and_address.apply(_housing_type)
    df['address'] = type_and_address.apply(_address)
    df['zip_code'] = type_and_address.apply(_zip_code)
    df['town'] = type_and_address.apply(_town)

    df['purchase_amount'] = df['Købesum'].apply(_purchase_amount)

    date_and_type = df['Dato & Type']
    df['date_of_sale'] = date_and_type.apply(_sale_date)
    df['type_of_sale'] = date_and_type.apply(_type_of_sale)

    area_and_price = df['m² & Kr. / m²']
    df['square_meters'] = area_and_price.apply(_square_meters)
    df['square_meters_price'] = area_and_price.apply(_square_meter_price)

    df['rooms'] = df['Vær.']  # number of rooms, for example 5
    df['year_of_construction'] = df['Byggeår']  # for example 2008

    # Percentage difference between the latest bid price and the sale price
    df['The percentage difference between the latest bid price and the sale price'] = df[
        'Den procentuelle forskel mellem seneste udbudspris og salgsprisen %'
    ]

    ################ Build and output new dataframe ####################

    # Select the tidy columns by name rather than by position, so the result
    # does not depend on how many columns the raw file happens to contain.
    return df[_TIDY_COLUMNS]

In [14]:
# Folders, relative to the current working directory, holding the raw input
# files and receiving the generated tidy files.
input_fp = Path.cwd().joinpath('data')
output_fp = Path.cwd().joinpath('tidy_data')
# Create the output folder if it does not exist yet
output_fp.mkdir(exist_ok=True)

# Walk through every entry of the input folder in sorted order; the progress
# bar counts all entries, including the ones that are skipped.
for filename in tqdm.tqdm(sorted(os.listdir(input_fp))):
    is_raw_sales_file = filename.startswith('sales_1992_2022_') and filename.endswith('.csv')
    if not is_raw_sales_file:
        continue
    new_data = data_scrub(filename)  # scrub the raw file
    output_filename = f'tidy_{filename}'
    new_data.to_csv(output_fp / output_filename, index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:18<00:00,  1.27it/s]
