# Gather the data needed to train the model

In this notebook we combine all of the data from
step 2. The contents of this notebook are mirrored
in `utils.py` so that they can be used in other notebooks.

In [115]:
%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [116]:
from pathlib import Path
import os

import numpy as np
import pandas as pd

In [117]:
# Both directories live under the base path given by STORM_DATA_DIR.
# Check the variable explicitly: os.getenv returns None when it is unset,
# and Path(None) would otherwise fail with an opaque TypeError.
storm_data_dir = os.getenv("STORM_DATA_DIR")
if storm_data_dir is None:
    raise RuntimeError(
        "Environment variable STORM_DATA_DIR is not set; "
        "it must point to the base data directory."
    )

# Where the step-2 inputs are read from
input_dir = Path(storm_data_dir) / "analysis/02_new_model_input"
# Where the combined training dataset is written to
output_dir = Path(storm_data_dir) / "analysis/03_new_model_training"

## Read in building damage

In [118]:
# Path of the per-grid building damage CSV inside the input directory
filename = input_dir.joinpath(
    "02_housing_damage/output/percentage_building_damage_bygrid.csv"
)

# Load the table and list the columns it provides
df_damage = pd.read_csv(filename)
df_damage.columns

Index(['id', 'NUMPOINTS', 'left', 'top', 'right', 'bottom', 'Area', 'AreainKM',
       'Len', 'Longitude', 'Latitude', 'Centroid_x', 'Centroid_y',
       'ADM3_PCODE', 'ADM3_EN', 'Grid Completeness', 'Id', 'pcode', 'typhoon',
       'Year', 'Totally', 'Partially', 'total', 'Totally_Damaged_bygrid',
       'Partially_Damaged_bygrid', 'All_Damaged_bygrid',
       'Totally_Damaged_Perc_bygrid', 'Partially_Damaged_Perc_bygrid',
       'All_Damaged_Perc_bygrid'],
      dtype='object')

In [119]:
# Raw column name -> name used in the rest of the notebook.
# Only the columns listed here are kept.
columns_to_keep = {
    "id": "grid_point_id",
    "NUMPOINTS": "total_buildings",
    "typhoon": "typhoon_name",
    "Year": "typhoon_year",
    "Totally_Damaged_bygrid": "total_buildings_damaged",
}

# In one pass: discard rows without a typhoon name, keep and rename the
# mapped columns, upper-case the typhoon name and cast the year to int
df_damage = (
    df_damage.dropna(subset="typhoon")[list(columns_to_keep)]
    .rename(columns=columns_to_keep)
    .assign(
        typhoon_name=lambda df: df["typhoon_name"].str.upper(),
        typhoon_year=lambda df: df["typhoon_year"].astype(int),
    )
)

df_damage

Unnamed: 0,grid_point_id,total_buildings,typhoon_name,typhoon_year,total_buildings_damaged
6116,6097,0.0,LINFA,2015,0.000000
6120,6098,0.0,LINFA,2015,0.000000
6285,6262,57.0,LINFA,2015,0.000000
6287,6263,0.0,LINFA,2015,0.000000
6289,6264,379.0,LINFA,2015,0.000000
...,...,...,...,...,...
48370,20557,401.0,LINGLING,2014,0.179991
48371,20557,401.0,HAIYAN,2013,0.179991
48372,20558,38.0,BOPHA,2012,6.997777
48373,20558,38.0,LINGLING,2014,0.262417


In [120]:
# TODO: remove this step once damage data has been cleaned
# Some (typhoon, year, grid point) combinations occur more than once;
# keep only the first occurrence of each and return to a flat table
index = ["typhoon_name", "typhoon_year", "grid_point_id"]
df_damage = (
    df_damage.set_index(index)
    .loc[lambda df: ~df.index.duplicated(keep="first")]
    .reset_index()
)
df_damage

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,total_buildings,total_buildings_damaged
0,LINFA,2015,6097,0.0,0.000000
1,LINFA,2015,6098,0.0,0.000000
2,LINFA,2015,6262,57.0,0.000000
3,LINFA,2015,6263,0.0,0.000000
4,LINFA,2015,6264,379.0,0.000000
...,...,...,...,...,...
14113,LINGLING,2014,20557,401.0,11.122416
14114,HAIYAN,2013,20557,401.0,0.179991
14115,BOPHA,2012,20558,38.0,6.997777
14116,LINGLING,2014,20558,38.0,0.262417


## Read in windfield

In [121]:
# Path of the windfield CSV inside the input directory
filename = input_dir.joinpath("01_windfield/windfield_data.csv")

# Load the table and list the columns it provides
df_windfield = pd.read_csv(filename)
df_windfield.columns

Index(['Unnamed: 0', 'typhoon_id', 'typhoon_name', 'typhoon_year',
       'wind_speed', 'grid_point_id'],
      dtype='object')

In [123]:
# Keep the typhoon / grid point identifiers and the wind speed;
# every other column is dropped
columns_to_keep = [
    "typhoon_name",
    "typhoon_year",
    "grid_point_id",
    "wind_speed",
]
df_windfield = df_windfield[columns_to_keep]
df_windfield

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed
0,DURIAN,2006,101,0.0
1,DURIAN,2006,4449,0.0
2,DURIAN,2006,4612,0.0
3,DURIAN,2006,4613,0.0
4,DURIAN,2006,4614,0.0
...,...,...,...,...
145231,MOLAVE,2020,20554,0.0
145232,MOLAVE,2020,20555,0.0
145233,MOLAVE,2020,20556,0.0
145234,MOLAVE,2020,20557,0.0


## Merge the datasets

In [125]:
# Columns that the two tables are joined on
index = ["typhoon_name", "typhoon_year", "grid_point_id"]

# For now do a left join to the windfield, since it has the exact points we
# want. Windfield rows with no matching damage record get NaN in the damage
# columns. validate="many_to_one" makes pandas raise a MergeError if the
# damage keys are not unique, instead of silently duplicating windfield rows.
df_all = df_windfield.set_index(index).merge(
    df_damage.set_index(index),
    left_index=True,
    right_index=True,
    how="left",
    validate="many_to_one",
)

df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,total_buildings,total_buildings_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DURIAN,2006,101,0.0,,
DURIAN,2006,4449,0.0,,
DURIAN,2006,4612,0.0,,
DURIAN,2006,4613,0.0,,
DURIAN,2006,4614,0.0,,
...,...,...,...,...,...
MOLAVE,2020,20554,0.0,,
MOLAVE,2020,20555,0.0,,
MOLAVE,2020,20556,0.0,,
MOLAVE,2020,20557,0.0,,


## Clean the dataset

In [126]:
# List the data columns of the merged table (the join keys are in the index)
df_all.columns

Index(['wind_speed', 'total_buildings', 'total_buildings_damaged'], dtype='object')

In [127]:
# Assume that NAs are all 0s.
# After the left join, windfield rows with no matching damage record have
# NaN in total_buildings and total_buildings_damaged. fillna(0) is applied
# to every column of df_all, so any NaN in wind_speed becomes 0 as well.
df_all = df_all.fillna(0)

In [128]:
# TODO: Remove this if it's fixed in the data
# Flag the cells where the number of damaged buildings is greater than
# the number of buildings
too_few_buildings = (
    df_all["total_buildings"] < df_all["total_buildings_damaged"]
)
# Count the flagged cells. Series.sum() is vectorised, whereas the builtin
# sum() iterates over the Series one element at a time in Python.
too_few_buildings.sum()

1826

In [129]:
# TODO: Remove this if it's fixed in the data
# In the flagged cells (more damaged buildings than buildings) raise the
# building count to the damaged count; all other cells keep their value
df_all["total_buildings"] = df_all["total_buildings"].mask(
    too_few_buildings, df_all["total_buildings_damaged"]
)

In [130]:
# Damaged buildings as a percentage of all buildings in the cell.
# When both counts are 0 the division yields NaN; report those cells as 0.
df_all["percent_buildings_damaged"] = (
    df_all["total_buildings_damaged"]
    .div(df_all["total_buildings"])
    .mul(100)
    .fillna(0)
)

In [131]:
# Display the cleaned table, now including percent_buildings_damaged
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,total_buildings,total_buildings_damaged,percent_buildings_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DURIAN,2006,101,0.0,0.0,0.0,0.0
DURIAN,2006,4449,0.0,0.0,0.0,0.0
DURIAN,2006,4612,0.0,0.0,0.0,0.0
DURIAN,2006,4613,0.0,0.0,0.0,0.0
DURIAN,2006,4614,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
MOLAVE,2020,20554,0.0,0.0,0.0,0.0
MOLAVE,2020,20555,0.0,0.0,0.0,0.0
MOLAVE,2020,20556,0.0,0.0,0.0,0.0
MOLAVE,2020,20557,0.0,0.0,0.0,0.0


## Write out dataset

In [132]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing
output_dir.mkdir(parents=True, exist_ok=True)

# reset_index turns the typhoon_name / typhoon_year / grid_point_id index
# levels back into ordinary columns so that they are written to the CSV;
# index=False then omits the fresh integer index from the file
df_all.reset_index().to_csv(
    output_dir / "new_model_training_dataset.csv", index=False
)