# Gather the data needed to train the model

In this notebook we combine all of the data from
step 2. The contents of this notebook is mirrored
in `utils.py` so that it can be used in other notebooks.

In [1]:
%load_ext jupyter_black

In [2]:
from pathlib import Path
import os

import numpy as np
import pandas as pd

In [3]:
input_dir = Path(os.getenv("STORM_DATA_DIR")) / "analysis/02_new_model_input"
output_dir = (
    Path(os.getenv("STORM_DATA_DIR")) / "analysis/03_new_model_training"
)

## Read in number of houses

In [4]:
# Read in the building damage data
filename = (
    input_dir / "02_housing_damage/output/transformed_housingunits_bygrid.csv"
)

df_houses = pd.read_csv(filename)
df_houses.columns

Index(['id', 'Centroid', 'numbuildings', 'Number of Household', 'hu_bygrid'], dtype='object')

In [5]:
# Select and rename columns,
columns_to_keep = {
    "id": "grid_point_id",
    "hu_bygrid": "total_houses",
}

df_houses = df_houses.loc[:, list(columns_to_keep.keys())].rename(
    columns=columns_to_keep
)
df_houses

Unnamed: 0,grid_point_id,total_houses
0,101,31.000000
1,4475,3.301020
2,4639,12.103741
3,4640,645.899660
4,4641,1071.731293
...,...,...
3721,20677,4449.357133
3722,20678,1521.435795
3723,20679,930.647069
3724,20680,1800.666044


## Read in buliding damage

In [6]:
# Read in the building damage data
filename = (
    input_dir / "02_housing_damage/output/building_damage_bygrid_gglfpdata.csv"
)

df_damage = pd.read_csv(filename)
df_damage.columns

Index(['id', 'Centroid', 'typhoon', 'Year', 'Totally', 'Partially', 'total',
       'numbuildings_x', 'numbuildings', 'weight', 'damaged_bygrid'],
      dtype='object')

In [7]:
# Select and rename columns,
# drop any rows that don't have a typhoon name
columns_to_keep = {
    "id": "grid_point_id",
    "typhoon": "typhoon_name",
    "Year": "typhoon_year",
    "damaged_bygrid": "total_houses_damaged",
}

df_damage = (
    df_damage.dropna(subset="typhoon")
    .loc[:, list(columns_to_keep.keys())]
    .rename(columns=columns_to_keep)
)
df_damage["typhoon_name"] = df_damage["typhoon_name"].str.upper()
for column_name in ["typhoon_year", "grid_point_id"]:
    df_damage[column_name] = df_damage[column_name].astype(int)

df_damage

Unnamed: 0,grid_point_id,typhoon_name,typhoon_year,total_houses_damaged
0,6133,LINFA,2015,0.000000
1,6134,LINFA,2015,0.000000
2,6299,LINFA,2015,0.000000
3,6300,LINFA,2015,0.000000
4,6301,LINFA,2015,0.000000
...,...,...,...,...
14123,20680,HAIYAN,2013,0.022594
14124,20680,LINGLING,2014,11.239387
14125,20681,BOPHA,2012,3.133054
14126,20681,HAIYAN,2013,0.117490


## Read in windfield

In [8]:
# Read in the data file

filename = input_dir / "01_windfield/windfield_data.csv"

df_windfield = pd.read_csv(filename)
df_windfield.columns

Index(['Unnamed: 0', 'typhoon_id', 'typhoon_name', 'typhoon_year',
       'grid_point_id', 'wind_speed', 'track_distance'],
      dtype='object')

In [9]:
# Select columns
columns_to_keep = [
    "typhoon_name",
    "typhoon_year",
    "grid_point_id",
    "wind_speed",
    "track_distance",
]
df_windfield = df_windfield.loc[:, columns_to_keep]
df_windfield

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance
0,DURIAN,2006,101,0.0,303.180555
1,DURIAN,2006,4475,0.0,638.027502
2,DURIAN,2006,4639,0.0,603.631997
3,DURIAN,2006,4640,0.0,614.675270
4,DURIAN,2006,4641,0.0,625.720905
...,...,...,...,...,...
145309,MOLAVE,2020,20677,0.0,644.575831
145310,MOLAVE,2020,20678,0.0,655.685233
145311,MOLAVE,2020,20679,0.0,666.794635
145312,MOLAVE,2020,20680,0.0,677.904037


## Read in rainfall

In [10]:
filename = input_dir / "03_rainfall/output/rainfall_data_mean.csv"
df_rainfall = pd.read_csv(filename)
df_rainfall[["typhoon_name", "typhoon_year"]] = df_rainfall[
    "typhoon"
].str.split("(\d+)", expand=True)[[0, 1]]
df_rainfall["typhoon_name"] = df_rainfall["typhoon_name"].str.upper()
df_rainfall["typhoon_year"] = df_rainfall["typhoon_year"].astype(int)
df_rainfall = df_rainfall.rename(columns={"id": "grid_point_id"}).loc[
    :,
    [
        "typhoon_name",
        "typhoon_year",
        "grid_point_id",
        "rainfall_max_6h",
        "rainfall_max_24h",
    ],
]
df_rainfall

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,rainfall_max_6h,rainfall_max_24h
0,DURIAN,2006,101,0.122917,0.085417
1,DURIAN,2006,4475,0.091667,0.027083
2,DURIAN,2006,4639,0.535417,0.146354
3,DURIAN,2006,4640,0.356250,0.101562
4,DURIAN,2006,4641,0.202083,0.057812
...,...,...,...,...,...
219829,NOUL,2015,20677,0.793750,0.350000
219830,NOUL,2015,20678,0.779167,0.375000
219831,NOUL,2015,20679,1.383333,0.931944
219832,NOUL,2015,20680,2.397917,1.608333


## Read in the vulnerability

In [11]:
# Read in the building damage data
filename = input_dir / "05_vulnerablility/output/phl_rwi_bygrid.csv"

df_rwi = (
    pd.read_csv(filename)
    .rename(columns={"id": "grid_point_id"})
    .drop(columns=["Centroid"])
)
df_rwi.columns

Index(['grid_point_id', 'rwi'], dtype='object')

## Read in topography

In [12]:
# Read in the building damage data
filename = input_dir / "04_topography/output/topography_variables_bygrid.csv"

df_top = (
    pd.read_csv(filename)
    .rename(columns={"id": "grid_point_id"})
    .drop(columns=["Centroid"])
)
df_top.columns
df_top

Unnamed: 0,grid_point_id,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast
0,101,1.018526,0.481382,6.438706,2.699781,5.762712,3445.709753,1
1,4475,1.579400,1.060468,9.694848,4.585088,12.799127,8602.645832,1
2,4639,0.551764,0.298116,3.985103,1.527495,8.833333,5084.012925,1
3,4640,2.107949,2.638290,11.792592,11.677657,17.530431,55607.865950,1
4,4641,3.538881,3.981129,18.718779,17.074011,31.931338,35529.342507,1
...,...,...,...,...,...,...,...,...
3721,20677,3.790141,4.198243,20.390768,18.012771,36.304688,21559.003490,1
3722,20678,3.532580,3.041204,18.949623,13.163042,65.687266,12591.742022,1
3723,20679,4.444498,2.646619,22.290623,10.901755,37.414996,19740.596834,1
3724,20680,5.816195,4.268518,28.143405,17.917650,105.812452,26363.303778,1


## Read in urban / rural / pop

In [13]:
filename = input_dir / "06_settlement/output/ghs_rural_urban_pop.csv"
df_urban = (
    pd.read_csv(filename)
    .rename(columns={"id": "grid_point_id"})
    .drop(columns=["Centroid"])
)
df_urban

Unnamed: 0,grid_point_id,urban,rural,water,total_pop
0,101,0.000000,0.000000,1.000000,0.000000
1,4475,0.000000,0.024793,0.975207,0.000000
2,4639,0.000000,0.008264,0.991736,201.343014
3,4640,0.000000,0.338843,0.661157,6542.964245
4,4641,0.000000,0.793388,0.206612,13721.068653
...,...,...,...,...,...
3721,20677,0.074380,0.181818,0.743802,18985.240279
3722,20678,0.000000,0.479339,0.520661,5683.089689
3723,20679,0.000000,0.190083,0.809917,6707.771729
3724,20680,0.033058,0.297521,0.669421,13013.268260


## Merge the datasets

In [14]:
index = ["typhoon_name", "typhoon_year", "grid_point_id"]
object_list = [df_damage, df_rainfall]

# First merge all that are not the windfield, since
# windfield has all the gridpoints that we want
df_all = pd.concat(
    objs=[df.set_index(index) for df in object_list], axis=1, join="outer"
)

# For now do a left join to the windfield, since it has the exact points we want
df_all = df_windfield.set_index(index).merge(
    df_all, left_index=True, right_index=True, how="left"
)

# Finally, add the datasets that only have grid points, no associated typhoon
object_list = [df_houses, df_rwi, df_top, df_urban]
df_no_typhoon = pd.concat(
    objs=[df.set_index("grid_point_id") for df in object_list],
    axis=1,
    join="outer",
)

df_all = df_all.join(df_no_typhoon)
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,total_houses_damaged,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
DURIAN,2006,101,0.0,303.180555,,0.122917,0.085417,31.000000,,1.018526,0.481382,6.438706,2.699781,5.762712,3445.709753,1,0.000000,0.000000,1.000000,0.000000
DURIAN,2006,4475,0.0,638.027502,,0.091667,0.027083,3.301020,-0.527000,1.579400,1.060468,9.694848,4.585088,12.799127,8602.645832,1,0.000000,0.024793,0.975207,0.000000
DURIAN,2006,4639,0.0,603.631997,,0.535417,0.146354,12.103741,-0.283000,0.551764,0.298116,3.985103,1.527495,8.833333,5084.012925,1,0.000000,0.008264,0.991736,201.343014
DURIAN,2006,4640,0.0,614.675270,,0.356250,0.101562,645.899660,-0.358889,2.107949,2.638290,11.792592,11.677657,17.530431,55607.865950,1,0.000000,0.338843,0.661157,6542.964245
DURIAN,2006,4641,0.0,625.720905,,0.202083,0.057812,1071.731293,-0.462800,3.538881,3.981129,18.718779,17.074011,31.931338,35529.342507,1,0.000000,0.793388,0.206612,13721.068653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,,2.543750,0.778646,4449.357133,0.508167,3.790141,4.198243,20.390768,18.012771,36.304688,21559.003490,1,0.074380,0.181818,0.743802,18985.240279
MOLAVE,2020,20678,0.0,655.685233,,2.558333,0.861458,1521.435795,-0.174100,3.532580,3.041204,18.949623,13.163042,65.687266,12591.742022,1,0.000000,0.479339,0.520661,5683.089689
MOLAVE,2020,20679,0.0,666.794635,,2.975000,0.949479,930.647069,-0.244286,4.444498,2.646619,22.290623,10.901755,37.414996,19740.596834,1,0.000000,0.190083,0.809917,6707.771729
MOLAVE,2020,20680,0.0,677.904037,,2.889583,1.083333,1800.666044,0.038000,5.816195,4.268518,28.143405,17.917650,105.812452,26363.303778,1,0.033058,0.297521,0.669421,13013.268260


In [15]:
# TODO: remove this if the building dataset is fixed
# Get the number of buildings associated with a gridpoint,
# and fill in the missing values
building_number_dict = (
    df_houses.loc[
        :,
        ["grid_point_id", "total_houses"],
    ]
    .set_index("grid_point_id")["total_houses"]
    .to_dict()
)

df_all["total_houses"] = (
    df_all.reset_index()["grid_point_id"].map(building_number_dict).values
)
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,total_houses_damaged,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
DURIAN,2006,101,0.0,303.180555,,0.122917,0.085417,31.000000,,1.018526,0.481382,6.438706,2.699781,5.762712,3445.709753,1,0.000000,0.000000,1.000000,0.000000
DURIAN,2006,4475,0.0,638.027502,,0.091667,0.027083,3.301020,-0.527000,1.579400,1.060468,9.694848,4.585088,12.799127,8602.645832,1,0.000000,0.024793,0.975207,0.000000
DURIAN,2006,4639,0.0,603.631997,,0.535417,0.146354,12.103741,-0.283000,0.551764,0.298116,3.985103,1.527495,8.833333,5084.012925,1,0.000000,0.008264,0.991736,201.343014
DURIAN,2006,4640,0.0,614.675270,,0.356250,0.101562,645.899660,-0.358889,2.107949,2.638290,11.792592,11.677657,17.530431,55607.865950,1,0.000000,0.338843,0.661157,6542.964245
DURIAN,2006,4641,0.0,625.720905,,0.202083,0.057812,1071.731293,-0.462800,3.538881,3.981129,18.718779,17.074011,31.931338,35529.342507,1,0.000000,0.793388,0.206612,13721.068653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,,2.543750,0.778646,4449.357133,0.508167,3.790141,4.198243,20.390768,18.012771,36.304688,21559.003490,1,0.074380,0.181818,0.743802,18985.240279
MOLAVE,2020,20678,0.0,655.685233,,2.558333,0.861458,1521.435795,-0.174100,3.532580,3.041204,18.949623,13.163042,65.687266,12591.742022,1,0.000000,0.479339,0.520661,5683.089689
MOLAVE,2020,20679,0.0,666.794635,,2.975000,0.949479,930.647069,-0.244286,4.444498,2.646619,22.290623,10.901755,37.414996,19740.596834,1,0.000000,0.190083,0.809917,6707.771729
MOLAVE,2020,20680,0.0,677.904037,,2.889583,1.083333,1800.666044,0.038000,5.816195,4.268518,28.143405,17.917650,105.812452,26363.303778,1,0.033058,0.297521,0.669421,13013.268260


## Clean the dataset

In [16]:
df_all.columns.drop("rwi")

Index(['wind_speed', 'track_distance', 'total_houses_damaged',
       'rainfall_max_6h', 'rainfall_max_24h', 'total_houses', 'mean_slope',
       'std_slope', 'mean_tri', 'std_tri', 'mean_elev', 'coast_length',
       'with_coast', 'urban', 'rural', 'water', 'total_pop'],
      dtype='object')

In [17]:
# Assume that NAs are all 0s
columns_to_fillna = df_all.columns.drop("rwi")
df_all[columns_to_fillna] = df_all[columns_to_fillna].fillna(0)
# Drop rows with 0 buildings
df_all = df_all[df_all["total_houses"] != 0]

In [18]:
# TODO: Remove this if it's fixed in the data
# Create percentage damage column
# Check if total damaged buildings is greater than total buildings.
too_few_buildings = df_all["total_houses"] < df_all["total_houses_damaged"]
sum(too_few_buildings)

3

In [19]:
# Calculate percentage. Per the above, some percentages will be above 100
# but we wil leave it for now since it's all "relative".
df_all["percent_houses_damaged"] = (
    df_all["total_houses_damaged"] / df_all["total_houses"] * 100
)
df_all = df_all.drop(columns="total_houses_damaged")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all["percent_houses_damaged"] = (


In [20]:
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged
typhoon_name,typhoon_year,grid_point_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,1.018526,0.481382,6.438706,2.699781,5.762712,3445.709753,1,0.000000,0.000000,1.000000,0.000000,0.0
DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,1.579400,1.060468,9.694848,4.585088,12.799127,8602.645832,1,0.000000,0.024793,0.975207,0.000000,0.0
DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,0.551764,0.298116,3.985103,1.527495,8.833333,5084.012925,1,0.000000,0.008264,0.991736,201.343014,0.0
DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,2.107949,2.638290,11.792592,11.677657,17.530431,55607.865950,1,0.000000,0.338843,0.661157,6542.964245,0.0
DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,3.538881,3.981129,18.718779,17.074011,31.931338,35529.342507,1,0.000000,0.793388,0.206612,13721.068653,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,3.790141,4.198243,20.390768,18.012771,36.304688,21559.003490,1,0.074380,0.181818,0.743802,18985.240279,0.0
MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,3.532580,3.041204,18.949623,13.163042,65.687266,12591.742022,1,0.000000,0.479339,0.520661,5683.089689,0.0
MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,4.444498,2.646619,22.290623,10.901755,37.414996,19740.596834,1,0.000000,0.190083,0.809917,6707.771729,0.0
MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,5.816195,4.268518,28.143405,17.917650,105.812452,26363.303778,1,0.033058,0.297521,0.669421,13013.268260,0.0


## Write out dataset

In [21]:
df_all.reset_index().to_csv(
    output_dir / "new_model_training_dataset.csv", index=False
)