# 2. Clean Data
This script prepares the data for fuzzy matching. It performs the following steps:
* Remove \_export_information
* Translate location to its official spelling (`Genf` to `Genève`)

In [34]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../data/lib/')
import consts
import re

## Import and Remove `_export_information`

In [35]:
# Read the raw list with explicit column names: the fixed columns followed by
# the export-information column. skiprows=1 skips the first row of the file.
all_columns = consts.fix_columns + [consts.column_export_information]
df_raw = pd.read_csv(
    '../../data/3. transformation/0. list_all.csv',
    names=all_columns,
    skiprows=1,
)

# Keep only the fixed columns.
df_raw = df_raw[consts.fix_columns]

# All cleaning steps work on a copy, so df_raw stays as loaded.
df_data = df_raw.copy()

## Copy Zip from Location into `PLZ` and remove it in `location`
If you get an error here, you probably have addresses without a location. Fix those first!

In [36]:
# Sanity check: list every row whose location is missing.
# If any rows show up here, fix them before continuing.
df_data.loc[df_data['location'].isna()]

Unnamed: 0,name,location,country,address,plz,uci,donations_grants,sponsorship,registration_fees,travel_accommodation,fees,related_expenses,total,type,source


In [37]:
#df_data = df_raw.copy() #REMOVE

def copy_plz(value):
    """Extract the first run of four digits (the postal code) from a text value.

    Parameters
    ----------
    value : object
        Cell content to search. Anything that is not a string (for example
        the float NaN of an empty cell, or None) yields "no postal code".

    Returns
    -------
    str or float
        The four digits as a string, or ``np.nan`` when nothing was found.
    """
    # Only strings can be searched; everything else counts as missing.
    if not isinstance(value, str):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # Raw string so that \s and \d reach the regex engine unchanged.
    match = re.search(r'\s?[\d]{4}\s?', value)
    if match:
        # The pattern may capture one surrounding whitespace character; drop it.
        return match.group(0).strip()
    return np.nan
# Fill missing postal codes: first from the location ...
missing_plz = df_data.plz.isna()
df_data.loc[missing_plz, 'plz'] = df_data.loc[missing_plz, 'location'].apply(copy_plz)
# ... then, for the rows that are still empty, from the address.
missing_plz = df_data.plz.isna()
df_data.loc[missing_plz, 'plz'] = df_data.loc[missing_plz, 'address'].apply(copy_plz)
# NOTE(review): if `plz` was parsed as a numeric column, the extracted codes
# are strings -- confirm that mixing both types is intended.

# For every row that now has a postal code, strip four-digit runs out of
# location and address (raw strings keep \s and \d intact for the regex engine).
has_plz = df_data.plz.notna()
df_data.loc[has_plz, 'location'] = df_data['location'].str.replace(r'\s?[\d]{4}\s?', '', regex=True)
df_data.loc[has_plz, 'address'] = df_data['address'].str.replace(r',?\s?[\d]{4}\s?', '', regex=True)

# Trim leading and trailing whitespace.
df_data['location'] = df_data['location'].str.strip()
df_data['address'] = df_data['address'].str.strip()

## Remove Location in Address

In [38]:
def _remove_location(row):
    """Return the row's address with every occurrence of its location removed.

    A missing address is returned unchanged, so it stays a missing value
    instead of becoming the literal text 'nan'.
    """
    address = row['address']
    if pd.isna(address):
        return address
    # A missing location raises TypeError here; fix such rows beforehand.
    return str(address).replace(row['location'], '')


df_data['address'] = df_data.apply(_remove_location, axis=1)

## Translate Location to «amtlich» (as the administration writes it)
For example, `Genf` becomes `Genève`, so that every place name is written consistently.  
Source `place_translations.csv`: `Amtliches Gemeindeverzeichnis der Schweiz` https://www.bfs.admin.ch/bfs/de/home/grundlagen/agvch.html

In [39]:
#df_copy = df_data.copy() # must be removed

In [40]:
#Copy dataset
#df_data = df_copy.copy() # MUST BE REMOVED

# Load the place-name translation table (semicolon-separated CSV).
# The loop further down reads its `de`, `fr`, `it` and `amtlich` columns.
df_places = pd.read_csv('sources/place_translations.csv', sep=';')

def translate_line(df_data, lng, translation):
    """Replace one place name in the `location` column, ignoring case.

    Every row whose lower-cased location equals the lower-cased `lng` gets
    `translation` as its new location. `df_data` is changed in place and also
    returned. If `lng` is not a string or is empty, nothing is changed.
    """
    # Nothing to look for: not a string, or an empty one.
    if not isinstance(lng, str) or lng == "":
        return df_data

    wanted = lng.lower()
    is_match = df_data['location'].str.lower() == wanted
    df_data.loc[is_match, 'location'] = translation
    return df_data
        
# For every entry of the translation table, replace the German, French and
# Italian spelling (in that order) by the official one.
for _index, place in df_places.iterrows():
    for language in ('de', 'fr', 'it'):
        df_data = translate_line(df_data, place[language], place['amtlich'])


## Remove `-` in address

In [41]:
# Addresses that consist only of '-' are replaced by a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_data.loc[df_data.address == '-', 'address'] = np.nan

## Remove Comma at the end

In [42]:
# Two passes over all string cells, in this order: first drop a trailing comma
# that is followed by one whitespace character, then a bare trailing comma.
# Raw strings keep \s intact for the regex engine.
df_data = df_data.replace(r",\s$", '', regex=True)
df_data = df_data.replace(r",$", '', regex=True)

## Save!

In [43]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
df_data.to_csv('../../data/3. transformation/1. list_all_cleaned.csv', index=False)