# Create a historical variable

#### In the Historical Risk Model, a new variable named “percent_houses_damaged_5years” is created and added to the dataset.
#### For each grid point, this variable is the average of the damage recorded at that grid point over the previous 5 years of typhoons in our data.



In [1]:
%load_ext jupyter_black

In [2]:
import statistics
import numpy as np
import pandas as pd

from utils import get_training_dataset

In [3]:
# Load the training dataset through the project helper and display it.
# NOTE(review): presumably get_training_dataset() reads the prepared CSV — confirm in utils.
df = get_training_dataset()
df

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,percent_houses_damaged
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,0.0
3,DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,0.0
...,...,...,...,...,...,...,...,...,...,...
141253,MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,0.0
141254,MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,0.0
141255,MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,0.0
141256,MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,0.0


In [4]:
# Collapse the typhoon-level rows into one row per (year, grid point) that holds
# the mean damage of that grid point across all of its rows in that year.
year_and_cell = ["typhoon_year", "grid_point_id"]
damage_by_year_and_cell = df.groupby(year_and_cell, as_index=False)[
    "percent_houses_damaged"
]
df_avgDmgCell_and_Year = damage_by_year_and_cell.mean()
df_avgDmgCell_and_Year

Unnamed: 0,typhoon_year,grid_point_id,percent_houses_damaged
0,2006,101,0.0
1,2006,4475,0.0
2,2006,4639,0.0
3,2006,4640,0.0
4,2006,4641,0.0
...,...,...,...
47081,2020,20677,0.0
47082,2020,20678,0.0
47083,2020,20679,0.0
47084,2020,20680,0.0


In [5]:
# For every grid point, slide a trailing window of up to 5 rows over its yearly
# averages (min_periods=1 lets the earliest rows use whatever history exists).
# Each window keeps its mean damage and the latest year it contains.
# Note that the window counts rows, not calendar years.
per_cell_windows = df_avgDmgCell_and_Year.groupby("grid_point_id").rolling(
    5, min_periods=1
)
window_stats = per_cell_windows.agg(
    {"percent_houses_damaged": "mean", "typhoon_year": "max"}
)
df_res2 = window_stats.rename(
    columns={"percent_houses_damaged": "percent_houses_damaged_5years"}
)

In [6]:
# Move every window one year forward, so that joining on typhoon_year gives a
# typhoon only the damage recorded in years before its own.
# NOTE: this cell is not idempotent — re-running it shifts the years again.
df_res2["typhoon_year"] = df_res2["typhoon_year"].add(1)
df_res2

Unnamed: 0_level_0,Unnamed: 1_level_0,percent_houses_damaged_5years,typhoon_year
grid_point_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,0,0.000000,2007.0
101,3622,0.000000,2009.0
101,7244,0.000000,2010.0
101,10866,0.000000,2011.0
101,14488,0.000000,2012.0
...,...,...,...
20681,32597,0.170235,2016.0
20681,36219,0.170235,2017.0
20681,39841,0.002309,2019.0
20681,43463,0.001050,2020.0


In [7]:
# Attach to every typhoon row the most recent historical window available for
# its grid point. df_res2["typhoon_year"] is the first year a window may be
# used for (last year inside the window + 1), so a backward as-of match never
# lets a typhoon see damage from its own year.
#
# An exact-year join dropped the history whenever the grid point had no record
# in the immediately preceding year (the window is then keyed to an earlier
# year), so real past damage was turned into 0 by the fillna below.
# NOTE(review): the matched window can be several years old when a grid point
# has gaps; pass `tolerance=` to merge_asof if stale history must be excluded.
history = (
    df_res2.reset_index(level="grid_point_id")
    .reset_index(drop=True)
    # merge_asof requires the `on` key to have the same dtype on both sides;
    # the rolling aggregation turned the years into floats.
    .astype({"typhoon_year": df["typhoon_year"].dtype})
    .sort_values("typhoon_year", kind="stable")
)

# merge_asof needs both sides sorted by the `on` key; keep the permutation so
# the original row order of df can be restored afterwards.
by_year = np.argsort(df["typhoon_year"].to_numpy(), kind="stable")
original_order = np.argsort(by_year, kind="stable")  # inverse permutation

df2 = (
    pd.merge_asof(
        df.iloc[by_year],
        history,
        on="typhoon_year",
        by="grid_point_id",
        direction="backward",
    )
    .iloc[original_order]
    .reset_index(drop=True)
)
# Rows with no earlier record for their grid point have no history: use 0.
df2["percent_houses_damaged_5years"] = df2["percent_houses_damaged_5years"].fillna(0)

df2

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,0.0,0.000000
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,0.0,0.000000
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,0.0,0.000000
3,DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,0.0,0.000000
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
141253,MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,0.0,0.000000
141254,MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,0.0,0.000000
141255,MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,0.0,0.015207
141256,MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,0.0,0.020806


In [8]:
# Save the dataset with the added percent_houses_damaged_5years column (row index omitted).

df2.to_csv("df2_housesdamaged5years.csv", index=False)