# PREPROCESSING

This notebook processes the original data and extracts only the necessary information for further modeling.

## Libraries, Constants, Functions

In [None]:
from libraries import *
from constants import *
from functions import *

# Google Maps API client; used further down to geocode the AED addresses.
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

## Importing

In [None]:
# Load the original datasets.
# Every "<name>.parquet.gzip" file in the source directory is read into a
# notebook-level DataFrame variable called <name>.

os.chdir(data_path + original_path)

parquet_files = [entry for entry in os.listdir() if entry.endswith(".parquet.gzip")]
for filename in parquet_files:
    # The variable name is the part of the file name before the first dot.
    df_name = filename.partition('.')[0]
    globals()[df_name] = pd.read_parquet(filename)

In [None]:
# Importing the dataset with the polygons for the 15 Belgian cities.
# Keeping only selected cities (Antwerpen, Brugge, Brussels, Charleroi, Gent, Leuven, Liege, Oostende)

cities = gpd.read_file(data_path + belgium_polygons_path)

# OBJECTID values of the cities that are kept.
objectid_list = [1, 2, 3, 4, 5, 6, 8, 11]
# .copy() turns the filtered subset into an independent frame, so the CityName
# assignment below does not write into a slice of the full dataset
# (which would raise pandas' SettingWithCopyWarning).
cities = cities[cities['OBJECTID'].isin(objectid_list)].copy()

# Shorten the "(greater city)" labels; CityName is later used to build output file names.
city_name_mapping = {
    'Bruxelles / Brussel (greater city)': 'Brussels',
    'Charleroi (greater city)': 'Charleroi',
    'Liège (greater city)': 'Liege'
}

cities['CityName'] = cities['CityName'].replace(city_name_mapping)

# Reproject the geometries to EPSG:4326 (to_crs transforms the coordinates,
# it does not merely relabel the CRS).
cities = cities.to_crs(epsg=4326)

## Cleaning original data

### AED locations

In [None]:
# public

def _to_public_flag(answer):
    """Return 1 when the answer starts with 'o', 'j' or 'y' (case-insensitive), else 0."""
    # NOTE(review): the prefixes presumably stand for oui / ja / yes - confirm against the data.
    return 1 if answer.lower().startswith(('o', 'j', 'y')) else 0

# Missing answers are replaced by "0", which maps to the flag 0.
aed_locations['public'] = aed_locations['public'].fillna("0").apply(_to_public_flag)

In [None]:
# latitude, longitude

# One free-text address per AED: "address, number, postal_code, municipality, province".
addresses = (
    aed_locations['address'].astype(str) + ", " +
    aed_locations['number'].astype(str) + ", " +
    aed_locations['postal_code'].astype(str) + ", " +
    aed_locations['municipality'].astype(str) + ", " +
    aed_locations['province'].astype(str)
)

# Each address costs one geocoding request, so ask for confirmation first.
num = len(addresses)
confirmation = input(f"This will initialize {num} API requests. Are you sure? (yes/no): ")
# Accept "yes" regardless of letter case or surrounding whitespace.
if confirmation.strip().lower() == "yes":
    print("OK. Geocoding...")
    geocoded = addresses.apply(lambda x: gmaps.geocode(x))

    # Each result is indexed as a sequence of matches: use the first match,
    # or None when the result is empty.
    latitude = geocoded.apply(lambda x: x[0]['geometry']['location']['lat'] if x else None)
    longitude = geocoded.apply(lambda x: x[0]['geometry']['location']['lng'] if x else None)

    # Assign the columns directly (as the else branch does) instead of concatenating:
    # pd.concat(axis=1) would append duplicate 'latitude'/'longitude' columns if they
    # already exist, e.g. when this cell is executed a second time.
    aed_locations['latitude'] = latitude
    aed_locations['longitude'] = longitude
else:
    # Placeholder coordinates so that the following cells still find both columns.
    print("OK. All coordinates are set to 0.")
    aed_locations['latitude'] = 0
    aed_locations['longitude'] = 0

In [None]:
# city

# The municipality column is exposed as "city" from here on.
column_renames = {'municipality': 'city'}
aed_locations.rename(columns=column_renames, inplace=True)

In [None]:
# Independent copy holding only the AED columns used in the rest of the notebook.
aed_columns = ['public', 'latitude', 'longitude', 'city']
aeds = aed_locations.loc[:, aed_columns].copy()

### Interventions (Cards)

In [None]:
# Stack the three interventions extracts into a single frame with a fresh 0..n-1 index,
# then drop the now-redundant originals.
intervention_parts = [interventions1, interventions2, interventions3]
interventions = pd.concat(intervention_parts, ignore_index=True)
del intervention_parts, interventions1, interventions2, interventions3

In [None]:
# Filtering only observations on cardiac events

# Regex alternation that matches any one of the cardiac codes.
cardiac_codes_string = '|'.join(cardiac_codes)

# na=False: a missing event type counts as "no match". Without it str.contains yields
# NaN for missing values, and that NaN leaks into the OR-ed mask, so a row whose other
# column does match can be dropped (or the boolean indexing can fail).
interventions = interventions[
    interventions['EventType Firstcall'].str.contains(cardiac_codes_string, na=False) |
    interventions['EventType Trip'].str.contains(cardiac_codes_string, na=False)
]

# cad9 has a single event-type column; missing values are replaced by a string
# before matching.
cad9['EventType Trip'] = cad9['EventType Trip'].fillna("unknown")
cad9 = cad9[cad9['EventType Trip'].str.contains(cardiac_codes_string)]

# Same two-column rule as for `interventions`, with this dataset's column names.
interventions_bxl = interventions_bxl[
    interventions_bxl['eventtype_firstcall'].str.contains(cardiac_codes_string, na=False) |
    interventions_bxl['eventtype_trip'].str.contains(cardiac_codes_string, na=False)
]

# Single combined column; missing values are replaced by a string before matching.
interventions_bxl2['EventType and EventLevel'] = interventions_bxl2['EventType and EventLevel'].fillna("unknown")
interventions_bxl2 = interventions_bxl2[
    interventions_bxl2['EventType and EventLevel'].str.contains(cardiac_codes_string)
]

In [None]:
# Keeping only the columns that are actually needed: latitude, longitude and city name.
# The datasets spell these headers differently, so each spelling is listed explicitly.

# interventions and cad9 share the same headers.
selected_columns = ["Latitude intervention", "Longitude intervention", "CityName intervention"]
interventions, cad9 = (frame[selected_columns] for frame in (interventions, cad9))

# interventions_bxl uses snake_case headers.
selected_columns = ["latitude_intervention", "longitude_intervention", "cityname_intervention"]
interventions_bxl = interventions_bxl[selected_columns]

# interventions_bxl2 capitalises the city column differently.
selected_columns = ["Latitude intervention", "Longitude intervention", "Cityname Intervention"]
interventions_bxl2 = interventions_bxl2[selected_columns]

In [None]:
# Give the four intervention datasets identical column names and stack them
# into one dataset, `cards`, with a fresh 0..n-1 index.

colnames = ["latitude", "longitude", "city"]

cards = pd.concat(
    [
        frame.set_axis(colnames, axis=1)
        for frame in (interventions, cad9, interventions_bxl, interventions_bxl2)
    ],
    ignore_index=True,
)

# The individual datasets and the helper lists are no longer needed.
del interventions, cad9, interventions_bxl, interventions_bxl2, colnames, selected_columns

In [None]:
# Latitude, longitude

def _inside_belgium(frame):
    """Return a boolean mask that is True where both coordinates lie within the
    BELGIUM_SOUTH/NORTH/WEST/EAST bounds (bounds included)."""
    return (
        (frame['latitude'] >= BELGIUM_SOUTH) & (frame['latitude'] <= BELGIUM_NORTH) &
        (frame['longitude'] >= BELGIUM_WEST) & (frame['longitude'] <= BELGIUM_EAST)
    )

inside = _inside_belgium(cards)
has_coordinates = cards['latitude'].notna() & cards['longitude'].notna()

# 2 - Correct format of coordinates
cards2 = cards[inside]

# 3 - Wrong format of coordinates (but no NAs)
# .copy() gives an independent frame, so the corrections below are not written into
# a slice of `cards` (which would raise pandas' SettingWithCopyWarning).
cards3 = cards[~inside & has_coordinates].copy()

# Fixing cards3
# Latitude: values in [100, 1000) are divided by 10; values >= 1000 are handed to
# insert_decimal(x, 2).
# NOTE(review): insert_decimal is defined elsewhere - presumably it places the decimal
# point after the given number of digits; confirm in `functions`.
cards3['latitude'] = cards3['latitude'].apply(lambda x: x / 10 if 100 <= x < 1000 else x)
cards3['latitude'] = cards3['latitude'].apply(lambda x: insert_decimal(x, 2) if x >= 1000 else x)

# Longitude: values in [10, 100) are divided by 10, values in [100, 1000) by 100;
# values >= 1000 are handed to insert_decimal(x, 1).
cards3['longitude'] = cards3['longitude'].apply(lambda x: x / 10 if 10 <= x < 100 else (x / 100 if 100 <= x < 1000 else x))
cards3['longitude'] = cards3['longitude'].apply(lambda x: insert_decimal(x, 1) if x >= 1000 else x)

# Concatenate
cards = pd.concat([cards2, cards3])

# Filter outlying values: corrected rows that still fall outside the bounds are dropped.
cards = cards[_inside_belgium(cards)]

# Make sure both columns are numeric (anything unparsable becomes NaN), then keep
# only the last occurrence of every coordinate pair.
cards['latitude'] = pd.to_numeric(cards['latitude'], errors='coerce')
cards['longitude'] = pd.to_numeric(cards['longitude'], errors='coerce')
cards = cards.drop_duplicates(subset=['latitude', 'longitude'], keep='last')

## Segmenting by city

In [None]:
# Write one AED file and one train/test pair of cardiac-event files per city
# into the clean-data directory.
os.chdir(data_path + clean_path)

for city_name, city_polygon in zip(cities['CityName'], cities['geometry']):
    progress_message = "Segmenting " + city_name + "..."
    print(progress_message)

    # AEDs that filter_points_within_polygon returns for this city's polygon.
    city_aeds = filter_points_within_polygon(aeds, city_polygon)
    city_aeds.to_csv(f'{city_name}_aeds.csv', index=False)

    # Cardiac events for this city, split into train and test sets
    # (TEST_SIZE and SEED control the split).
    city_cards = filter_points_within_polygon(cards, city_polygon)
    cards_train, cards_test = train_test_split(
        city_cards,
        test_size=TEST_SIZE,
        random_state=SEED,
    )
    cards_train.to_csv(f'{city_name}_cards_train.csv', index=False)
    cards_test.to_csv(f'{city_name}_cards_test.csv', index=False)

## Calculating all possible new AED locations

In [None]:
# Write one file of candidate AED locations per city into the
# possible-locations directory.
os.chdir(data_path + possible_locations_path)

for city_name, city_polygon in zip(cities['CityName'], cities['geometry']):
    progress_message = "Calculating possible AED locations for " + city_name + "..."
    print(progress_message)

    # Candidate points are sampled on the streets found for the city polygon.
    streets = get_streets_within_polygon(city_polygon)
    points = sample_points_on_streets(streets, num_points=3)

    # Keep a reproducible random fraction (SAMPLE_SIZE, SEED) of the candidates,
    # then pass them through remove_close_points with MIN_DISTANCE.
    all_candidates = gpd.GeoDataFrame(geometry=points, crs=streets.crs)
    sampled_candidates = all_candidates.sample(frac=SAMPLE_SIZE, random_state=SEED)
    possible_locations = remove_close_points(sampled_candidates, min_distance=MIN_DISTANCE)

    possible_locations.to_csv(f'{city_name}_possible_locations.csv', index=False)