In [11]:
# Script by Lars Kjær

##################### Import libraries ################################

import pandas as pd
import os
import re


# The function below is built to preprocess raw data.
# It can be applied to each csv file in a folder named 'raw_data'.
# The function takes a file, scrubs the data, and saves it to a new csv file.
# The new csv files are stored in a folder named 'clean_data'.

##################### Helpers used by data_scrub ######################

def _housing_type(text):
    """Return the first whitespace-separated word, for example 'Ejerlejlighed'."""
    return text.split()[0]


def _street_address(text):
    """Return the address that precedes the zip code, for example 'Egebo 16'."""
    # The first two whitespace-separated tokens are dropped: the first is the
    # housing type; what the second one holds is not visible from this code
    # (TODO confirm against the raw data).
    remainder = ' '.join(text.split()[2:])
    return re.split(r'\d{4}', remainder)[0].strip()


def _zip_code(text):
    """Return the first run of four digits, for example '2620'."""
    return re.search(r'\d{4}', text).group()


def _town(text):
    """Return the text that follows the zip code, for example 'Albertslund'."""
    # strip() removes the blank that separates zip code and town name.
    return re.split(r'\d{4}', text)[1].strip()


def _purchase_amount(text):
    """Return the purchase amount without '.' and 'kr', for example '1500000'."""
    return text.replace('.', '').replace('kr', '').strip()


def _sale_date(text):
    """Return the day-month-year date found in the text, for example '21-12-2022'."""
    return re.search(r'[\d]{1,2}?-[\d]{1,2}?-[\d]{4}', text).group()


def _sale_type(text):
    """Return the text with all digits and '-' removed, for example 'Alm. salg'."""
    return re.sub(r'\d', '', text).replace('-', '').strip()


def _square_meters(text):
    """Return what stands before the first 'm²', for example '114'."""
    return text.split('m²')[0].strip()


def _square_meter_price(text):
    """Return the first word after the first 'm²' without '.', for example '13158'."""
    return text.split('m²')[1].replace('.', '').split()[0]


def data_scrub(file, output_dir=None):
    """Scrub one raw csv file and save the result as a new csv file.

    Parameters
    ----------
    file : str
        Name of, or path to, the raw csv file. The municipality code is taken
        from the fourth '_'-separated part of the file name (minus '.csv').
    output_dir : str, optional
        Folder the cleaned file is written to. It is created if it does not
        exist. When omitted, the notebook-level variable `destination` is used.

    Returns
    -------
    pandas.DataFrame
        A new dataframe that holds only the scrubbed columns. The same data is
        written to '<output_dir>/processed_data_<file name>'.

    Raises
    ------
    KeyError
        If one of the expected raw columns is missing.
    IndexError
        If the file name has fewer than four '_'-separated parts.
    AttributeError
        If a row holds no four-digit zip code or no date.

    Notes
    -----
    The working directory is left untouched.
    """

    ########################## Get raw data ###########################

    raw = pd.read_csv(file)  # read data

    ############### Get municipality code from file name ################
    # Only the last path component is used, so underscores in folder names
    # (for example 'raw_data') cannot disturb the split.
    base_name = os.path.basename(file)
    municipality_code = base_name.split('_')[3].replace('.csv', '')

    ##################### Scrub data ###################################
    # The result is built as a fresh dataframe. Nothing is assigned to a slice
    # of the raw data, and the output does not depend on how many columns the
    # raw file has.
    clean = pd.DataFrame(index=raw.index)
    clean['municipality'] = municipality_code

    housing_and_address = raw['Boligtype & Adresse']
    clean['housing_type'] = housing_and_address.apply(_housing_type)
    clean['address'] = housing_and_address.apply(_street_address)
    clean['zip_code'] = housing_and_address.apply(_zip_code).astype(int)
    clean['town'] = housing_and_address.apply(_town)

    clean['purchase_amount'] = raw['Købesum'].apply(_purchase_amount).astype(int)

    date_and_type = raw['Dato & Type']
    # The raw dates are written day first (for example 21-12-2022).
    clean['date_of_sale'] = pd.to_datetime(date_and_type.apply(_sale_date), dayfirst=True)
    clean['type_of_sale'] = date_and_type.apply(_sale_type)

    area_and_price = raw['m² & Kr. / m²']
    clean['square meters'] = area_and_price.apply(_square_meters).astype(int)
    clean['square meters_price'] = area_and_price.apply(_square_meter_price).astype(int)

    clean['rooms'] = raw['Vær.']  # number of rooms, for example 5

    clean['year_of_construction'] = raw['Byggeår']  # for example 2008

    # The percentage difference between the latest bid price and the sale price, for example '-54%'
    clean['The percentage difference between the latest bid price and the sale price'] = raw['Den procentuelle forskel mellem seneste udbudspris og salgsprisen %']

    ################## Save as csv #################################################

    if output_dir is None:
        output_dir = destination  # notebook-level variable holding the clean data folder
    os.makedirs(output_dir, exist_ok=True)
    new_file_name = 'processed_data_' + base_name
    clean.to_csv(os.path.join(output_dir, new_file_name), index=False, encoding='utf-8')

    ################ Output dataframe #############################################

    return clean

In [12]:
# Scrub the raw data and save it in new files

# NOTE: both folders are absolute, machine-specific paths; adjust them when
# the notebook is run on another computer.

# Raw data folder
departure = r'C:\Users\lakj\Documents\GitHub\isds2023\Lars\raw_data'

# Clean data folder
destination = r'C:\Users\lakj\Documents\GitHub\isds2023\Lars\clean_data'

# Move to raw data folder
os.chdir(departure)


data = [] # Collects one scrubbed dataframe per raw file
# Build list of file names. Only csv files are kept, because any other entry
# in the folder (for example '.ipynb_checkpoints') cannot be read by
# data_scrub. The list is sorted so the order of `data` is the same on every run.
files = sorted(name for name in os.listdir() if name.endswith('.csv'))
data_frames = [] # Prepare empty list for later use
for file in files: # iterate list of file names
    new_data = data_scrub(file) # apply function
    os.chdir(departure) # Make sure the next file is read from the raw data folder
    data.append(new_data)