# Gather the data needed to train the model

In this notebook we combine all of the data from
step 2. The contents of this notebook are mirrored
in `utils.py` so that they can be used in other notebooks.

In [1]:
%load_ext jupyter_black

In [2]:
from pathlib import Path
import os

import numpy as np
import pandas as pd

In [3]:
# Resolve the data root from the STORM_DATA_DIR environment variable.
# Path(None) raises an opaque TypeError, so fail early with an
# actionable message when the variable has not been set.
storm_data_dir = os.getenv("STORM_DATA_DIR")
if storm_data_dir is None:
    raise RuntimeError(
        "The STORM_DATA_DIR environment variable must be set to the "
        "root of the storm data directory before running this notebook."
    )

# Directory the input tables are read from, and directory the
# combined training dataset is written to
input_dir = Path(storm_data_dir) / "analysis/02_new_model_input"
output_dir = Path(storm_data_dir) / "analysis/03_new_model_training"

## Read in building damage

In [4]:
# Load the by-grid building damage table from the input directory
filename = input_dir.joinpath(
    "02_housing_damage/output/building_damage_bygrid.csv"
)
df_damage = pd.read_csv(filename)

# Show which columns the raw table provides
df_damage.columns

Index(['id', 'Centroid', 'typhoon', 'Year', 'weight', 'numbuildings_bygrid',
       'Totally_Damaged_bygrid', 'Totally_Damaged_Perc_bygrid'],
      dtype='object')

In [5]:
# Select and rename columns,
# drop any rows that don't have a typhoon name
columns_to_keep = {
    "id": "grid_point_id",
    "numbuildings_bygrid": "total_buildings",
    "typhoon": "typhoon_name",
    "Year": "typhoon_year",
    "Totally_Damaged_bygrid": "total_buildings_damaged",
}

# Rename first and then filter / select using the *new* names.
# rename() ignores keys that are no longer present, so this cell can be
# re-run after df_damage has already been overwritten without raising
# a KeyError on the original column names.
df_damage = (
    df_damage.rename(columns=columns_to_keep)
    .dropna(subset=["typhoon_name"])
    .loc[:, list(columns_to_keep.values())]
    .assign(
        # Normalise the typhoon names to upper case
        typhoon_name=lambda frame: frame["typhoon_name"].str.upper(),
        # Store the year as an integer
        typhoon_year=lambda frame: frame["typhoon_year"].astype(int),
    )
)

df_damage

Unnamed: 0,grid_point_id,total_buildings,typhoon_name,typhoon_year,total_buildings_damaged
0,6301.0,379.0,LINFA,2015,0.000000
1,6302.0,2.0,LINFA,2015,0.000000
2,6466.0,38.0,LINFA,2015,0.000000
3,6467.0,79.0,LINFA,2015,0.000000
4,6468.0,1.0,LINFA,2015,0.000000
...,...,...,...,...,...
12223,20680.0,7.0,HAIYAN,2013,0.053846
12224,20680.0,401.0,LINGLING,2014,27.021364
12225,20681.0,38.0,BOPHA,2012,7.794872
12226,20681.0,38.0,HAIYAN,2013,0.292308


## Read in windfield

In [6]:
# Load the windfield table from the input directory
filename = input_dir.joinpath("01_windfield/windfield_data.csv")
df_windfield = pd.read_csv(filename)

# Show which columns the raw table provides
df_windfield.columns

Index(['Unnamed: 0', 'typhoon_id', 'typhoon_name', 'typhoon_year',
       'grid_point_id', 'wind_speed', 'track_distance'],
      dtype='object')

In [7]:
# Keep the typhoon / grid point identifiers and the two windfield
# measurements; every other column of the raw file is discarded
columns_to_keep = [
    "typhoon_name",
    "typhoon_year",
    "grid_point_id",
    "wind_speed",
    "track_distance",
]
df_windfield = df_windfield[columns_to_keep]
df_windfield

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance
0,DURIAN,2006,101,0.0,303.180555
1,DURIAN,2006,4475,0.0,638.027502
2,DURIAN,2006,4639,0.0,603.631997
3,DURIAN,2006,4640,0.0,614.675270
4,DURIAN,2006,4641,0.0,625.720905
...,...,...,...,...,...
145309,MOLAVE,2020,20677,0.0,644.575831
145310,MOLAVE,2020,20678,0.0,655.685233
145311,MOLAVE,2020,20679,0.0,666.794635
145312,MOLAVE,2020,20680,0.0,677.904037


## Merge the datasets

In [8]:
# Columns that identify a row in both tables
index = ["typhoon_name", "typhoon_year", "grid_point_id"]
object_list = [df_damage, df_windfield]

# An outer concat of every table in object_list on the shared index was
# the earlier approach:
#   pd.concat(
#       objs=[df.set_index(index) for df in object_list],
#       axis=1, join="outer",
#   )
# For now do a left join to the windfield instead, since it has the
# exact points we want
windfield_indexed = df_windfield.set_index(index)
damage_indexed = df_damage.set_index(index)
df_all = pd.merge(
    left=windfield_indexed,
    right=damage_indexed,
    how="left",
    left_index=True,
    right_index=True,
)

df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,total_buildings,total_buildings_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DURIAN,2006,101,0.0,303.180555,,
DURIAN,2006,4475,0.0,638.027502,,
DURIAN,2006,4639,0.0,603.631997,,
DURIAN,2006,4640,0.0,614.675270,,
DURIAN,2006,4641,0.0,625.720905,,
...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,,
MOLAVE,2020,20678,0.0,655.685233,,
MOLAVE,2020,20679,0.0,666.794635,,
MOLAVE,2020,20680,0.0,677.904037,,


In [9]:
# TODO: remove this once the building dataset is fixed
# Build a grid point -> number of buildings lookup from the damage table.
# A grid point can appear on several rows; the dict keeps the value from
# the last such row.
building_number_dict = df_damage.set_index("grid_point_id")[
    "total_buildings"
].to_dict()

# Overwrite total_buildings on every row from the lookup, so rows that
# had no matching damage record also get a building count
grid_point_ids = df_all.index.get_level_values("grid_point_id").to_series()
df_all["total_buildings"] = grid_point_ids.map(
    building_number_dict
).to_numpy()
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,total_buildings,total_buildings_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DURIAN,2006,101,0.0,303.180555,,
DURIAN,2006,4475,0.0,638.027502,,
DURIAN,2006,4639,0.0,603.631997,,
DURIAN,2006,4640,0.0,614.675270,,
DURIAN,2006,4641,0.0,625.720905,,
...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,173.0,
MOLAVE,2020,20678,0.0,655.685233,44.0,
MOLAVE,2020,20679,0.0,666.794635,13.0,
MOLAVE,2020,20680,0.0,677.904037,401.0,


## Clean the dataset

In [10]:
# Copy of df_all with every missing value replaced by 0.
# NOTE(review): `df` is not referenced by any later cell in this notebook;
# the same fillna(0) is applied to df_all itself two cells below.
df = df_all.fillna(0)

In [11]:
# Check which columns are present before cleaning
df_all.columns

Index(['wind_speed', 'track_distance', 'total_buildings',
       'total_buildings_damaged'],
      dtype='object')

In [12]:
# Treat every missing value as 0, then keep only the rows whose
# grid point has a non-zero number of buildings
df_all = df_all.fillna(0).loc[
    lambda frame: frame["total_buildings"].ne(0)
]

In [13]:
# TODO: Remove this if it's fixed in the data
# Flag the rows where the number of damaged buildings is greater than
# the total number of buildings
too_few_buildings = (
    df_all["total_buildings"] < df_all["total_buildings_damaged"]
)

# Count the flagged rows with the vectorised Series.sum() instead of the
# builtin sum(), which iterates over the mask one element at a time;
# int() keeps the displayed value a plain Python integer
int(too_few_buildings.sum())

950

In [14]:
# TODO: Remove this if it's fixed in the data
# Some cells currently report more damaged buildings than buildings.
# For the flagged rows, raise the building total to the damaged count so
# that the damaged count never exceeds the total; other rows are kept.
df_all["total_buildings"] = np.where(
    too_few_buildings,
    df_all["total_buildings_damaged"],
    df_all["total_buildings"],
)

In [15]:
# Express the damage as a percentage of the buildings in each row,
# then drop the absolute damaged count
df_all = df_all.assign(
    percent_buildings_damaged=lambda frame: (
        frame["total_buildings_damaged"] / frame["total_buildings"] * 100
    )
).drop(columns="total_buildings_damaged")

In [16]:
# Preview the cleaned dataset
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,total_buildings,percent_buildings_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DURIAN,2006,6301,0.0,510.358125,379.0,0.0
DURIAN,2006,6302,0.0,521.466945,2.0,0.0
DURIAN,2006,6466,0.0,488.030454,38.0,0.0
DURIAN,2006,6467,0.0,499.140454,79.0,0.0
DURIAN,2006,6468,0.0,510.250454,1.0,0.0
...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,173.0,0.0
MOLAVE,2020,20678,0.0,655.685233,44.0,0.0
MOLAVE,2020,20679,0.0,666.794635,13.0,0.0
MOLAVE,2020,20680,0.0,677.904037,401.0,0.0


## Write out dataset

In [17]:
# Make sure the destination exists: to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
output_dir.mkdir(parents=True, exist_ok=True)

# Move the index levels back into regular columns so that they are
# written to the file, and skip the (now meaningless) integer row index
df_all.reset_index().to_csv(
    output_dir / "new_model_training_dataset.csv", index=False
)