# 2. Clean Data
This notebook prepares the data for fuzzy matching. It:
* Removes the `_export_information` column
* Translates locations to their official spelling (e.g. `Genf` to `Genève`)

In [19]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../data/lib/')
import consts
import re

## Import

In [20]:
# Read the input CSV once; all cleaning below works on an independent copy
# so that `df_raw` keeps the data exactly as it was loaded.
df_raw = pd.read_csv(filepath_or_buffer='../../data/3. transformation/1_list_all.csv')
df_data = df_raw.copy(deep=True)

## Remove `_export_information`

In [21]:
# `columns=` already names the axis, so no separate `axis` argument is passed.
df_data.drop(columns=['_export_information'], inplace=True)

## Copy Zip from Location into `PLZ` and remove it in `location`
If this cell raises an error, you probably have addresses without a location. Fix those first!

In [22]:
#df_data = df_raw.copy() #REMOVE
#df_data = df_raw[(df_raw.year == 2017) & (df_raw.source == 'allergan')]

def copy_plz(value):
    """Return the first four-digit group found in ``value``.

    Args:
        value: Cell content of a ``location`` or ``address`` column.

    Returns:
        The four digits as a string with surrounding whitespace removed, or
        ``np.nan`` when ``value`` is not a string or contains no such group.
    """
    # Only strings can be searched; missing cells arrive as float NaN, but
    # None or numeric cells must not crash the regex search either.
    if not isinstance(value, str):
        return np.nan

    # NOTE: the pattern has no word boundaries, so the first four digits of a
    # longer number also match.
    match = re.search(r'\s?[\d]{4}\s?', value)
    if match is None:
        return np.nan
    return match.group(0).strip()
# Copy into PLZ: fill missing postal codes from `location` first, then from
# `address` for the rows that are still empty afterwards.
missing_plz = df_data['plz'].isna()
df_data.loc[missing_plz, 'plz'] = df_data.loc[missing_plz, 'location'].apply(copy_plz)
missing_plz = df_data['plz'].isna()  # recompute: the first pass filled some rows
df_data.loc[missing_plz, 'plz'] = df_data.loc[missing_plz, 'address'].apply(copy_plz)

# Clean: for rows that have a PLZ, remove the four-digit group from `location`
# and cut `address` off at the four-digit group (including a leading "," or "-").
# Raw strings keep `\s` / `\d` from being parsed as (invalid) string escapes.
has_plz = df_data['plz'].notna()
df_data.loc[has_plz, 'location'] = df_data['location'].str.replace(r'\s?[\d]{4}\s?', '', regex=True)
df_data.loc[has_plz, 'address'] = df_data['address'].str.replace(r'[,-]?\s?[\d]{4}.*', '', regex=True)

# Strip leftover whitespace at both ends
for column in ('location', 'address'):
    df_data[column] = df_data[column].str.strip()

## Remove Location in Address
Removed because of cases like:
`Kantonsspital Zürich`

In [23]:
# Make sure every address is a string: missing values become "".
df_data['address'] = df_data['address'].fillna("").astype(str)
# Disabled: stripping the location out of the address.
#df_data['address'] = df_data.apply(lambda row: row['address'].replace(row['location'], ''), axis=1)

## Translate Location to «amtlich» (as the administration writes it)
`Genf` changes to `Genève`. Consistent writing.  
Source `place_translations.csv`: `Amtliches Gemeindeverzeichnis der Schweiz` https://www.bfs.admin.ch/bfs/de/home/grundlagen/agvch.html

In [24]:
# Copy dataset
#df_data = df_copy.copy() # MUST BE REMOVED

# Import places: semicolon-separated translation table
df_places = pd.read_csv(filepath_or_buffer='sources/place_translations.csv', sep=';')

def translate_line(df_data, lng, translation):
    """Overwrite every location equal to ``lng`` (ignoring case) with ``translation``.

    Args:
        df_data: DataFrame with a ``location`` column; modified in place.
        lng: Spelling to look for. Non-strings and "" are ignored.
        translation: Value written into the matching ``location`` cells.

    Returns:
        The same ``df_data`` object that was passed in.
    """
    # Nothing to look for: leave the frame untouched.
    if not isinstance(lng, str) or lng == "":
        return df_data

    same_place = df_data['location'].str.lower() == lng.lower()
    df_data.loc[same_place, 'location'] = translation
    return df_data
        
# For every row of the translation table, map its German, French and Italian
# spelling (in that order) onto the `amtlich` spelling.
for _, place in df_places.iterrows():
    for language in ('de', 'fr', 'it'):
        df_data = translate_line(df_data, place[language], place['amtlich'])


## Remove `-` in address

In [25]:
# An address consisting only of "-" or "--" is a placeholder: mark it as missing.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0+.
df_data.loc[df_data['address'].isin(['-', '--']), 'address'] = np.nan

## Replace Hyphen (Viertelgeviertstrich) with minus

In [26]:
# Swap the typographic hyphen for the plain "-" wherever it occurs in a string cell.
df_data = df_data.replace(to_replace='‐', value='-', regex=True)

## Remove `Unknown`

In [27]:
# Cells matching the "Unknown / relocated" marker are set to missing.
df_data = df_data.replace(to_replace='Unknown / relocated', value=np.nan, regex=True)

## Replace NAN

In [28]:
# Turn the literal placeholder "NAN" into a real missing value.
# The pattern is anchored on purpose: with `regex=True` and a non-string
# replacement, an unanchored 'NAN' would blank every cell that merely
# contains those letters (e.g. "FERNANDEZ").
df_data = df_data.replace(r'^\s*NAN\s*$', np.nan, regex=True)

## Replace `str.` with `strasse`
Not needed anymore because of libPostal Expand

In [29]:
#df_data = df_data.replace('str\.', 'strasse', regex=True)

## Replace `stras se` with `strasse`
Otherwise, `stras se` will be expanded to `stras european company`

In [30]:
# Re-join "strasse" where a stray space split it, in both observed variants.
df_data = (
    df_data
    .replace('stras se', 'strasse', regex=True)
    .replace('stra sse', 'strasse', regex=True)
)

## Remove Comma at the end

In [31]:
# Drop a comma at the end of a cell: first the ", " form (comma plus one
# whitespace character), then a bare trailing comma. Raw strings keep `\s`
# from being parsed as an (invalid) string escape.
df_data = (
    df_data
    .replace(r",\s$", '', regex=True)
    .replace(r",$", '', regex=True)
)

## Replace `[space],` with `,`

In [35]:
# Remove the space in front of a comma.
df_data = df_data.replace(to_replace=" ,", value=',', regex=True)

## Trim Strings

In [36]:
# Strip leading/trailing whitespace from the text columns.
for column in ('name', 'location', 'address'):
    df_data[column] = df_data[column].str.strip()

## Save!

In [37]:
# Write the cleaned frame without the index column.
df_data.to_csv(path_or_buf='../../data/3. transformation/2_list_all_cleaned.csv', index=False)