# Create a historical variable

In Historical Risk Model, the dataset is being updated with the addition of a new variable called "percent_houses_damaged_5years". 

This variable will be incorporated into the existing dataset. For every data point or location, an average of damages caused by typhoons over the past 5 years will be calculated and recorded as the value for this new variable.

In [1]:
%load_ext jupyter_black

In [2]:
import statistics
import numpy as np
import pandas as pd

from utils import get_training_dataset_primary

In [3]:
# Read csv file and import to df
df = get_training_dataset_primary()
df.head()

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,...,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.0,,1.018526,...,6.438706,2.699781,5.762712,3445.709753,1,0.0,0.0,1.0,0.0,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.30102,-0.527,1.5794,...,9.694848,4.585088,12.799127,8602.645832,1,0.0,0.0,1.0,0.0,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283,0.551764,...,3.985103,1.527495,8.833333,5084.012925,1,0.0,0.01,0.99,197.339034,0.0
3,DURIAN,2006,4640,0.0,614.67527,0.35625,0.101562,645.89966,-0.358889,2.107949,...,11.792592,11.677657,17.530431,55607.86595,1,0.0,0.31,0.69,4970.477311,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.4628,3.538881,...,18.718779,17.074011,31.931338,35529.342507,1,0.0,0.77,0.23,12408.594656,0.0


In [4]:
# Groupby df based on "typhoon_year" and "grid_point_id" and "percent_houses_damaged"
df_avgDmgCell_and_Year = df.groupby(["typhoon_year", "grid_point_id"], as_index=False)[
    "percent_houses_damaged"
].mean()
df_avgDmgCell_and_Year

Unnamed: 0,typhoon_year,grid_point_id,percent_houses_damaged
0,2006,101,0.0
1,2006,4475,0.0
2,2006,4639,0.0
3,2006,4640,0.0
4,2006,4641,0.0
...,...,...,...
47081,2020,20677,0.0
47082,2020,20678,0.0
47083,2020,20679,0.0
47084,2020,20680,0.0


In [6]:
# Calculate the average damaged of past 5 years for each point
df_res2 = (
    df_avgDmgCell_and_Year.groupby("grid_point_id")
    .rolling(5, min_periods=1)
    .agg({"percent_houses_damaged": "mean", "typhoon_year": "max"})
)

df_res2 = df_res2.rename(
    columns={"percent_houses_damaged": "percent_houses_damaged_5years"}
)

In [7]:
df_res2["typhoon_year"] = df_res2["typhoon_year"] + 1
df_res2

Unnamed: 0_level_0,Unnamed: 1_level_0,percent_houses_damaged_5years,typhoon_year
grid_point_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,0,0.000000,2007.0
101,3622,0.000000,2009.0
101,7244,0.000000,2010.0
101,10866,0.000000,2011.0
101,14488,0.000000,2012.0
...,...,...,...
20681,32597,0.170235,2016.0
20681,36219,0.170235,2017.0
20681,39841,0.002309,2019.0
20681,43463,0.001050,2020.0


In [8]:
# Join this new variable to the main df wrt "typhoon_year" and "grid_point_id"
df2 = df.merge(df_res2, on=["typhoon_year", "grid_point_id"], how="left")
df2["percent_houses_damaged_5years"] = df2["percent_houses_damaged_5years"].fillna(0)

df2

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,1.018526,...,2.699781,5.762712,3445.709753,1,0.00,0.000000,1.000000,0.000000,0.0,0.000000
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,1.579400,...,4.585088,12.799127,8602.645832,1,0.00,0.000000,1.000000,0.000000,0.0,0.000000
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,0.551764,...,1.527495,8.833333,5084.012925,1,0.00,0.010000,0.990000,197.339034,0.0,0.000000
3,DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,2.107949,...,11.677657,17.530431,55607.865950,1,0.00,0.310000,0.690000,4970.477311,0.0,0.000000
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,3.538881,...,17.074011,31.931338,35529.342507,1,0.00,0.770000,0.230000,12408.594656,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141253,MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,3.790141,...,18.012771,36.304688,21559.003490,1,0.08,0.080000,0.840000,17619.701390,0.0,0.000000
141254,MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,3.532580,...,13.163042,65.687266,12591.742022,1,0.00,0.420000,0.580000,5623.069564,0.0,0.000000
141255,MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,4.444498,...,10.901755,37.414996,19740.596834,1,0.00,0.109091,0.890909,5912.671746,0.0,0.015207
141256,MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,5.816195,...,17.917650,105.812452,26363.303778,1,0.03,0.250000,0.720000,11254.164413,0.0,0.020806


In [9]:
# Save this df to a CSV file
df2.to_csv("df2_housesdamaged5years.csv", index=False)