# Preprocessing

Use this preprocessing file for normalization, feature extraction, and similar transformations.
The files it produces are the ones to use for merging (joining with the egg price data).

### Naming
- Name each output file "...._for_merge.csv" (for example, "df_us_disasters_for_merge.csv")
- Use the Path to the data folder (`data_file_path`) when storing the file

# Import Files

In [1]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

from datetime import datetime

# Settings

In [2]:
# Folder that holds the preprocessed data files.
data_file_path = Path("..") / "data"

# Load Files

In [3]:
# Read the date-organized inputs from the data folder.
# index_col=0 makes the first CSV column the DataFrame index.
df_us_disasters = pd.read_csv(
    f"{data_file_path}/date_organized_us_disasters.csv",
    index_col=0,
)
df_us_diseases = pd.read_csv(
    f"{data_file_path}/date_organized_us_diseases.csv",
    index_col=0,
)
df_us_population = pd.read_csv(
    f"{data_file_path}/date_organized_us_population.csv",
    index_col=0,
)
df_us_covid = pd.read_csv(
    f"{data_file_path}/date_organized_us_covid.csv",
    index_col=0,
)
df_us_weather = pd.read_csv(
    f"{data_file_path}/date_organized_weather_agg.csv",
    index_col=0,
)

# Preprocessing for Natural Disasters

In [4]:
# Spread each disaster's totals evenly across the months it spans.
# 'event_months' is the number of non-null 'date' rows sharing the same (name, disaster_type).
# NOTE(review): presumes the input has one row per event per month and that
# (name, disaster_type) identifies a single event -- confirm against the date-organized file.
df_us_disasters['event_months'] = (
    df_us_disasters
    .groupby(['name', 'disaster_type'])['date']
    .transform('count')
)

# Per-month share of each total: total / number of months of the event.
df_us_disasters['adjusted_cpi_cost'] = df_us_disasters['cpi_adjusted_cost'].div(df_us_disasters['event_months'])
df_us_disasters['adjusted_unadjusted_cost'] = df_us_disasters['unadjusted_cost'].div(df_us_disasters['event_months'])
df_us_disasters['adjusted_deaths'] = df_us_disasters['deaths'].div(df_us_disasters['event_months'])

In [6]:
# Collapse to one row per date by summing the per-month shares of all events on that date.
df_us_disasters = (
    df_us_disasters
    .groupby('date')[['adjusted_cpi_cost', 'adjusted_unadjusted_cost', 'adjusted_deaths']]
    .sum()
    .reset_index()
)

In [7]:
# Preview the first ten aggregated rows.
df_us_disasters.head(n=10)

Unnamed: 0,date,adjusted_cpi_cost,adjusted_unadjusted_cost,adjusted_deaths
0,1980-04,2749.399902,706.799988,7.0
1,1980-06,1130.033312,278.333333,35.0
2,1980-07,1130.033312,278.333333,35.0
3,1980-08,3366.233263,868.333333,48.0
4,1980-09,1130.033312,278.333333,35.0
5,1980-10,1130.033312,278.333333,35.0
6,1980-11,1130.033312,278.333333,35.0
7,1981-01,2076.399902,572.0,0.0
8,1981-05,1409.099976,401.399994,20.0
9,1982-01,2217.800049,662.0,85.0


In [8]:
# Put the cost columns on a log scale to tame their spread.
# log1p computes log(1 + x), so a cost of 0 maps to 0 instead of -inf.
df_us_disasters['log_cpi_adjusted_cost'] = np.log1p(df_us_disasters['adjusted_cpi_cost'])
df_us_disasters['log_unadjusted_cost'] = np.log1p(df_us_disasters['adjusted_unadjusted_cost'])

# Only the log-scaled costs are kept; the raw per-month costs are removed.
df_us_disasters = df_us_disasters.drop(columns=['adjusted_cpi_cost', 'adjusted_unadjusted_cost'])
df_us_disasters.head(n=10)

Unnamed: 0,date,adjusted_deaths,log_cpi_adjusted_cost,log_unadjusted_cost
0,1980-04,7.0,7.919502,6.562162
1,1980-06,35.0,7.030887,5.632406
2,1980-07,35.0,7.030887,5.632406
3,1980-08,48.0,8.121847,6.767727
4,1980-09,35.0,7.030887,5.632406
5,1980-10,35.0,7.030887,5.632406
6,1980-11,35.0,7.030887,5.632406
7,1981-01,0.0,7.638872,6.350886
8,1981-05,20.0,7.251416,5.997447
9,1982-01,85.0,7.704722,6.496775


In [9]:
# Save the disaster features; this is the file to use for the join.
# The DataFrame index is written as the first (unnamed) CSV column.
df_us_disasters.to_csv(f'{data_file_path}/df_us_disasters_for_merge.csv', index=True)

# Preprocessing for Avian Flu in Birds and Humans

In [9]:
# Load the date-organized avian flu data for birds and humans.
# The location comes from data_file_path (not a hard-coded "../data"), so these
# reads follow the same data-folder setting as every other load in this file.
bird = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_bird.csv", index_col=0)
human = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_human.csv", index_col=0)

In [10]:
# Preview the first five bird outbreak rows.
bird.head(n=5)

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12


In [11]:
# Preview the first five human case rows.
human.head(n=5)

Unnamed: 0,Entity,Code,Day,Human cases with highly pathogenic avian influenza A/H5N1 (monthly),yyyy_mm
0,Africa,,1997-01-01,0,1997-01
1,Africa,,1997-01-02,0,1997-01
2,Africa,,1997-01-03,0,1997-01
3,Africa,,1997-01-04,0,1997-01
4,Africa,,1997-01-05,0,1997-01


In [12]:
# Flock_Count = number of rows in `bird` for each month (yyyy_mm).
flocks_impacted = (
    bird.groupby('yyyy_mm')
    .size()
    .rename('Flock_Count')
    .reset_index()
)
flocks_impacted.head(n=5)

Unnamed: 0,yyyy_mm,Flock_Count
0,2022-02,6
1,2022-03,31
2,2022-04,41
3,2022-05,30
4,2022-06,4


In [13]:
# Attach the monthly Flock_Count to every bird row; the left join keeps all bird rows.
bird = pd.merge(bird, flocks_impacted, how='left', on='yyyy_mm')
bird.head(n=5)

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82


In [14]:
# Total_Flock_Size = sum of 'Flock Size' over all rows of the same month (yyyy_mm).
flocks_impacted_size = (
    bird.groupby('yyyy_mm')['Flock Size']
    .sum()
    .rename('Total_Flock_Size')
    .reset_index()
)

# Repeat the monthly total on every bird row of that month (left join keeps all rows).
bird = pd.merge(bird, flocks_impacted_size, how='left', on='yyyy_mm')
bird.head(n=5)

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count,Total_Flock_Size
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82,36390154
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82,36390154
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82,36390154
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82,36390154
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82,36390154


In [None]:
# Log-transform Total_Flock_Size because the monthly totals are heavily skewed
# (common values: 10mil, 25mil, 35mil, 300k, 400k, <100k).
# Vectorized: non-positive or missing totals are first replaced by 1, so they
# come out as log(1) = 0 instead of -inf/NaN; the result is always a float column.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# log a second time -- re-run the cells that build Total_Flock_Size first.
bird['Total_Flock_Size'] = np.log(
    bird['Total_Flock_Size'].where(bird['Total_Flock_Size'] > 0, 1)
)
bird.head()

Unnamed: 0,FullGeoName,FIPS Codes,County Name,State,Outbreak Date,Flock Type,Flock Size,State Count,Outbreaks,Counties,yyyy_mm,Flock_Count,Total_Flock_Size
0,"MI, Jackson",26075,Jackson,Michigan,2024-12-31,WOAH Non-Poultry,9,1,1,1,2024-12,82,17.409809
1,"CA, Riverside",6065,Riverside,California,2024-12-31,Commercial Table Egg Layer; WOAH Non-Poultry; ...,1503370,1,5,1,2024-12,82,17.409809
2,"CA, Butte",6007,Butte,California,2024-12-31,WOAH Non-Poultry; Commercial Raised for Releas...,45890,0,4,1,2024-12,82,17.409809
3,"SD, Miner",46097,Miner,South Dakota,2024-12-31,WOAH Poultry,1500,1,1,1,2024-12,82,17.409809
4,"MI, Ottawa",26139,Ottawa,Michigan,2024-12-31,Commercial Turkey Meat Bird,447700,0,7,1,2024-12,82,17.409809


In [17]:
# People_Count = sum of the human H5N1 case column over all rows of the same month (yyyy_mm).
# NOTE(review): the sum runs over every Entity and every daily row of a "(monthly)" figure;
# if the data contains aggregate entities or repeats the monthly value per day, this
# overcounts -- confirm against the source data.
people_impacted = (
    human.groupby('yyyy_mm')['Human cases with highly pathogenic avian influenza A/H5N1 (monthly)']
    .sum()
    .rename('People_Count')
    .reset_index()
)
people_impacted.head(n=5)

Unnamed: 0,yyyy_mm,People_Count
0,1997-01,54
1,1998-01,0
2,1999-01,0
3,2000-01,0
4,2001-01,0


In [19]:
# Save the avian flu outputs into the data folder given by data_file_path
# (instead of a hard-coded "../data"), like the other *_for_merge.csv files.
# NOTE(review): `bird` still has one row per outbreak record with the monthly
# Flock_Count / Total_Flock_Size repeated on each row; a join on month will
# duplicate rows unless the merge step de-duplicates -- confirm.
bird.to_csv(f'{data_file_path}/df_avian_flu_bird_for_merge.csv')
people_impacted.to_csv(f'{data_file_path}/df_avian_flu_human_for_merge.csv')

In [11]:
# Prepare disease outbreaks for normalization by US population:
# keep the date and the two count columns, then attach the population rows for the same date.
# The inner join keeps only dates present in both tables.
df_us_diseases = df_us_diseases.loc[:, ['date', 'us_human_outbreaks_cnt', 'us_human_illnesses_cnt']].copy()
df_us_diseases = pd.merge(df_us_diseases, df_us_population, how='inner', on='date')

In [13]:
# Divide the outbreak and illness counts by the population_million column.
df_us_diseases['outbreaks_per_million'] = df_us_diseases['us_human_outbreaks_cnt'].div(df_us_diseases['population_million'])
df_us_diseases['illnesses_per_million'] = df_us_diseases['us_human_illnesses_cnt'].div(df_us_diseases['population_million'])

In [14]:
# Keep only the date and the normalized columns (as an independent copy) for the merge file.
df_us_diseases_for_merge = df_us_diseases.loc[:, ['date', 'outbreaks_per_million', 'illnesses_per_million']].copy()

In [15]:
# Save the normalized disease features; the index is written as the first CSV column.
df_us_diseases_for_merge.to_csv(f'{data_file_path}/df_us_diseases_for_merge.csv', index=True)

# Preprocessing for COVID Data

In [17]:
# Attach the population rows for the same date; the inner join keeps only dates present in both tables.
df_us_covid = pd.merge(df_us_covid, df_us_population, how='inner', on='date')

In [18]:
# Divide average daily hospitalizations by the population_million column.
df_us_covid['hospitalized_per_million'] = df_us_covid['avg_daily_hospitalized'].div(df_us_covid['population_million'])

In [20]:
# Keep only the date and the normalized column (as an independent copy) for the merge file.
df_us_covid_for_merge = df_us_covid.loc[:, ['date', 'hospitalized_per_million']].copy()

In [21]:
# Save the normalized covid features; the index is written as the first CSV column.
df_us_covid_for_merge.to_csv(f'{data_file_path}/df_us_covid_for_merge.csv', index=True)

# Preprocessing for Weather Data

In [10]:
# Show the aggregated weather table (a date column plus one temp_* column per state)
# before deriving the overall temperature from it.
print(df_us_weather)

        date    temp_WA    temp_IN    temp_MO    temp_WI
0    1980-01  31.851613  26.558065  31.693548  17.232258
1    1980-02  39.679310  21.993103  28.658621  16.448276
2    1980-03  41.074194  33.848387  39.796774  26.816129
3    1980-04  49.163333  48.016667  53.443333  44.306667
4    1980-05  52.761290  61.393548  64.667742  58.841935
..       ...        ...        ...        ...        ...
536  2024-09  60.206667  68.690000  70.476667  66.003333
537  2024-10  51.370968  58.016129  62.941935  54.748387
538  2024-11  43.463333  46.523333  48.440000  41.140000
539  2024-12  41.432258  33.738710  38.561290  26.116129
540  2025-01  36.654839  21.358065  25.474194  17.325806

[541 rows x 5 columns]


In [12]:
# Overall temperature = row-wise mean of the temperatures in the big egg-producing states.
# NOTE(review): temp_WA is present in the weather table but is not part of this
# average (only IN, MO and WI are used) -- confirm that excluding it is intended.
df_us_weather['temp_overall'] = df_us_weather[['temp_IN', 'temp_MO', 'temp_WI']].mean(axis='columns')

# Keep only the date and the overall temperature (as an independent copy) for the merge file.
df_us_weather_for_merge = df_us_weather.loc[:, ['date', 'temp_overall']].copy()

In [14]:
# Save the weather feature; the index is written as the first CSV column.
df_us_weather_for_merge.to_csv(f'{data_file_path}/df_us_weather_for_merge.csv', index=True)