# Preprocessing

This notebook preprocesses the date-organized datasets (normalization, feature extraction, and related transformations) and writes the results as CSV files that are ready to be merged.

# Import Libraries

In [2]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

from datetime import datetime

import redshift_connector

# Settings

In [23]:
# Directory (relative to this notebook) that holds the preprocessed data files
data_file_path = Path("..") / "data"

# Load Files

In [45]:
# Read the date-organized US disaster records; the first CSV column is used as the index.
us_disasters_csv = f"{data_file_path}/date_organized_us_disasters.csv"
df_us_disasters = pd.read_csv(us_disasters_csv, index_col=0)

# Preprocessing for Natural Disaster

In [46]:
# Spread each disaster's totals evenly across the months it covers, then log-scale the costs.
# A disaster is identified by its (name, disaster_type) pair, and the number of non-null
# 'date' entries in that group is used as its duration in months.
df_us_disasters = (
    df_us_disasters
    .assign(
        event_months=lambda d: d.groupby(['name', 'disaster_type'])['date'].transform('count'),
        adjusted_cpi_cost=lambda d: d['cpi_adjusted_cost'] / d['event_months'],
        adjusted_unadjusted_cost=lambda d: d['unadjusted_cost'] / d['event_months'],
        adjusted_deaths=lambda d: d['deaths'] / d['event_months'],
        # log(1 + x) keeps zero-cost rows at 0 instead of hitting log(0)
        log_cpi_adjusted_cost=lambda d: np.log1p(d['adjusted_cpi_cost']),
        log_unadjusted_cost=lambda d: np.log1p(d['adjusted_unadjusted_cost']),
    )
    # Drop the raw totals and the intermediate per-month columns; only the per-month
    # deaths and the log-scaled per-month costs are kept.
    .drop(
        columns=[
            'cpi_adjusted_cost',
            'unadjusted_cost',
            'event_months',
            'adjusted_cpi_cost',
            'adjusted_unadjusted_cost',
            'deaths',
            'name',
        ]
    )
)

In [47]:
# One-hot encode the disaster type. drop_first=True omits the first category, which
# becomes the implicit baseline. dtype=int makes the indicator columns 0/1 integers
# directly, so no separate cast is needed and only the generated columns are affected.
df_us_disasters = pd.get_dummies(
    df_us_disasters,
    columns=['disaster_type'],
    drop_first=True,
    dtype=int,
)

# Names of the generated indicator columns (get_dummies prefixes them with 'disaster_type_')
one_hot_cols = df_us_disasters.filter(like='disaster_type_').columns

In [51]:
# Save the merge-ready disaster table; this is the file to join against the other datasets.
df_us_disasters.to_csv(f'{data_file_path}/df_us_disasters_for_merge.csv')

# Preview what was written. This has to be the last expression in the cell,
# otherwise the notebook does not render it.
df_us_disasters.head()

# Preprocessing for Avian Flu in Birds and Humans

In [9]:
# Read the date-organized avian flu datasets (birds and humans) from the configured
# data directory; the first CSV column is used as the index.
bird = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_bird.csv", index_col=0)
human = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_human.csv", index_col=0)

In [10]:
# Preview the first rows of the bird outbreak data
bird.head(n=5)

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12


In [11]:
# Preview the first rows of the human case data
human.head(n=5)

Unnamed: 0,Entity,Code,Day,Human cases with highly pathogenic avian influenza A/H5N1 (monthly),yyyy_mm
0,Africa,,1997-01-01,0,1997-01
1,Africa,,1997-01-02,0,1997-01
2,Africa,,1997-01-03,0,1997-01
3,Africa,,1997-01-04,0,1997-01
4,Africa,,1997-01-05,0,1997-01


In [12]:
# Number of bird outbreak records in each calendar month (yyyy_mm)
monthly_record_counts = bird.groupby('yyyy_mm').size()
flocks_impacted = monthly_record_counts.rename('Flock_Count').reset_index()
flocks_impacted.head(n=5)

Unnamed: 0,yyyy_mm,Flock_Count
0,2022-02,6
1,2022-03,31
2,2022-04,41
3,2022-05,30
4,2022-06,4


In [13]:
# Attach the monthly record count to every bird row of that month.
# Any Flock_Count column left over from an earlier run of this cell is dropped first,
# so re-running the cell does not produce Flock_Count_x / Flock_Count_y duplicates.
bird = (
    bird
    .drop(columns=['Flock_Count'], errors='ignore')
    .merge(flocks_impacted, on='yyyy_mm', how='left')
)
bird.head()

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82


In [14]:
# Sum of 'Flock Size' over all bird records in each calendar month (yyyy_mm)
flocks_impacted_size = bird.groupby('yyyy_mm')['Flock Size'].sum().reset_index(name='Total_Flock_Size')

# Attach the monthly total to every bird row of that month.
# Any Total_Flock_Size column left over from an earlier run is dropped first, so
# re-running the cell does not produce Total_Flock_Size_x / Total_Flock_Size_y duplicates.
bird = (
    bird
    .drop(columns=['Total_Flock_Size'], errors='ignore')
    .merge(flocks_impacted_size, on='yyyy_mm', how='left')
)
bird.head()

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count,Total_Flock_Size
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82,36390154
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82,36390154
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82,36390154
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82,36390154
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82,36390154


In [None]:
# Log-normalize the Total_Flock_Size column since the values are skewed into the millions
# (e.g. 10mil, 25mil, 35mil, 300k, 400k, <100k are the common values).
# The monthly total is recomputed from the raw 'Flock Size' column instead of being read
# back from Total_Flock_Size, so re-running this cell does not apply the log twice.
monthly_flock_total = bird.groupby('yyyy_mm')['Flock Size'].transform('sum')

# Non-positive or missing totals are replaced by 1 before the log, so they map to 0
bird['Total_Flock_Size'] = np.log(monthly_flock_total.where(monthly_flock_total > 0, 1))
bird.head()

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count,Total_Flock_Size
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82,17.409809
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82,17.409809
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82,17.409809
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82,17.409809
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82,17.409809


In [17]:
# Total of the human H5N1 case column for each calendar month (yyyy_mm).
# NOTE(review): the sum runs over every 'Entity' row; the preview shows a region-level
# entity ('Africa', no Code). If the file also holds country rows or a world total,
# cases would be counted more than once. TODO confirm.
# NOTE(review): the preview shows one row per 'Day' while the column is labelled
# "(monthly)"; verify that summing the daily rows does not inflate the count.
cases_column = 'Human cases with highly pathogenic avian influenza A/H5N1 (monthly)'
monthly_cases = human.groupby('yyyy_mm')[cases_column].sum()
people_impacted = monthly_cases.rename('People_Count').reset_index()
people_impacted.head(n=5)

Unnamed: 0,yyyy_mm,People_Count
0,1997-01,54
1,1998-01,0
2,1999-01,0
3,2000-01,0
4,2001-01,0


In [19]:
# Save the merge-ready avian flu tables to the configured data directory:
# the row-level bird data (with monthly aggregates attached) and the monthly human case counts.
bird.to_csv(f'{data_file_path}/df_avian_flu_bird_for_merge.csv')
people_impacted.to_csv(f'{data_file_path}/df_avian_flu_human_for_merge.csv')