# Preprocessing

Use this preprocessing notebook for normalization, feature extraction, and similar transformations.
The files it produces are the ones that should be used for merging (joining with the egg price).

### Naming
- Name the file "...._for_merge.csv"
- Use the `Path` to the data folder (`data_file_path`) when storing the file

# Import Libraries

In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

from datetime import datetime

# Settings

In [None]:
# Folder (relative to this notebook) that holds the preprocessed data files;
# every load and save below is expected to go through this path.
data_file_path = Path("..") / "data"

# Load Files

In [None]:
# Read the date-organized inputs from the data folder. In every file the
# first CSV column is used as the DataFrame index.
df_us_disasters = pd.read_csv(
    f"{data_file_path}/date_organized_us_disasters.csv",
    index_col=0,
)
df_us_diseases = pd.read_csv(
    f"{data_file_path}/date_organized_us_diseases.csv",
    index_col=0,
)
df_us_population = pd.read_csv(
    f"{data_file_path}/date_organized_us_population.csv",
    index_col=0,
)
df_us_covid = pd.read_csv(
    f"{data_file_path}/date_organized_us_covid.csv",
    index_col=0,
)
df_us_weather = pd.read_csv(
    f"{data_file_path}/date_organized_weather_agg.csv",
    index_col=0,
)

# Preprocessing for Natural Disaster

In [None]:
# 'event_months' is the number of non-null 'date' rows sharing the same
# (name, disaster_type) pair -- presumably one row per month of the event.
# Each event's totals are then divided by that count so the cost and deaths
# are spread evenly across its rows.
months_per_event = df_us_disasters.groupby(['name', 'disaster_type'])['date'].transform('count')
df_us_disasters['event_months'] = months_per_event

for total_col, monthly_col in (
    ('cpi_adjusted_cost', 'adjusted_cpi_cost'),
    ('unadjusted_cost', 'adjusted_unadjusted_cost'),
    ('deaths', 'adjusted_deaths'),
):
    df_us_disasters[monthly_col] = df_us_disasters[total_col].div(df_us_disasters['event_months'])

In [None]:
# Collapse to one row per date by summing the per-month shares of all events
# on that date; 'date' becomes a regular column again.
monthly_share_cols = ['adjusted_cpi_cost', 'adjusted_unadjusted_cost', 'adjusted_deaths']
totals_by_date = df_us_disasters.groupby('date')[monthly_share_cols].sum()
df_us_disasters = totals_by_date.reset_index()

In [None]:
# Preview the first ten rows of the per-date disaster totals.
df_us_disasters.head(n=10)

In [None]:
# Compress the cost columns with log(1 + x); the +1 keeps zero-cost rows
# defined (log1p(0) == 0) instead of producing log(0).
for raw_cost_col, log_cost_col in (
    ('adjusted_cpi_cost', 'log_cpi_adjusted_cost'),
    ('adjusted_unadjusted_cost', 'log_unadjusted_cost'),
):
    df_us_disasters[log_cost_col] = np.log1p(df_us_disasters[raw_cost_col])

# The raw cost columns are dropped once their log versions exist.
df_us_disasters = df_us_disasters.drop(columns=['adjusted_cpi_cost', 'adjusted_unadjusted_cost'])
df_us_disasters.head(n=10)

In [None]:
# Save the disaster features; this is the file to use for the join.
disasters_for_merge_csv = f'{data_file_path}/df_us_disasters_for_merge.csv'
df_us_disasters.to_csv(disasters_for_merge_csv)

# Preprocessing for Avian Flu in Birds and Humans

In [None]:
# Load the date-organized avian flu data for birds and for humans.
# The paths go through data_file_path (like every other load in this
# notebook) instead of a hard-coded "../data/" prefix, so relocating the
# data folder only requires changing the Settings cell.
bird = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_bird.csv", index_col=0)
human = pd.read_csv(f"{data_file_path}/date_organized_avian_flu_human.csv", index_col=0)

In [None]:
# Preview the first five rows of the bird data.
bird.head(n=5)

In [None]:
# Preview the first five rows of the human data.
human.head(n=5)

In [None]:
# Number of rows in `bird` per 'yyyy_mm' value, stored as 'Flock_Count'
# (presumably one row per affected flock -- verify against the input file).
rows_per_month = bird.groupby('yyyy_mm').size()
flocks_impacted = rows_per_month.reset_index(name='Flock_Count')
flocks_impacted.head(n=5)

In [None]:
# Attach the monthly 'Flock_Count' to every bird row; the left join keeps
# all rows of `bird`.
bird = pd.merge(bird, flocks_impacted, how='left', on='yyyy_mm')
bird.head(n=5)

In [None]:
# Sum 'Flock Size' within each 'yyyy_mm' and attach the monthly total to
# every bird row as 'Total_Flock_Size'.
flock_size_by_month = bird.groupby('yyyy_mm')['Flock Size'].sum()
flocks_impacted_size = flock_size_by_month.reset_index(name='Total_Flock_Size')
bird = pd.merge(bird, flocks_impacted_size, how='left', on='yyyy_mm')
bird.head(n=5)

In [None]:
# Log-normalize Total_Flock_Size because the values are heavily skewed
# (e.g. 10mil, 25mil, 35mil, 300k, 400k and <100k are the common values).
def _log_or_zero(total_size):
    """Return the natural log of a positive size, or 0 for anything else."""
    # Zero, negative and NaN values all fail the `> 0` test and map to 0.
    if total_size > 0:
        return np.log(total_size)
    return 0


bird['Total_Flock_Size'] = bird['Total_Flock_Size'].apply(_log_or_zero)
bird.head(n=5)

In [None]:
# Sum the monthly human H5N1 case counts within each 'yyyy_mm' and expose
# the result as 'People_Count'.
human_cases_col = 'Human cases with highly pathogenic avian influenza A/H5N1 (monthly)'
cases_by_month = human.groupby('yyyy_mm')[human_cases_col].sum()
people_impacted = cases_by_month.reset_index(name='People_Count')
people_impacted.head(n=5)

In [None]:
# Save the avian flu features for merging. The paths go through
# data_file_path (like the other "_for_merge" outputs in this notebook)
# instead of a hard-coded '../data/' prefix.
# NOTE(review): `bird` still has one row per original bird record, with the
# monthly aggregates repeated on each row -- confirm that is the intended
# shape for the merge.
bird.to_csv(f'{data_file_path}/df_avian_flu_bird_for_merge.csv')
people_impacted.to_csv(f'{data_file_path}/df_avian_flu_human_for_merge.csv')

In [None]:
# Disease outbreaks normalized by US population, step 1: keep only the date
# and the outbreak/illness counts, then attach the population columns.
# The inner join drops dates that are missing from either frame.
disease_count_cols = ['date', 'us_human_outbreaks_cnt', 'us_human_illnesses_cnt']
disease_counts = df_us_diseases[disease_count_cols].copy()
df_us_diseases = pd.merge(disease_counts, df_us_population, how='inner', on='date')

In [None]:
# Step 2: express the counts per million people by dividing each by
# 'population_million'.
for count_col, rate_col in (
    ('us_human_outbreaks_cnt', 'outbreaks_per_million'),
    ('us_human_illnesses_cnt', 'illnesses_per_million'),
):
    df_us_diseases[rate_col] = df_us_diseases[count_col].div(df_us_diseases['population_million'])

In [None]:
# Keep only the date and the two per-million rates for the merge file.
disease_merge_cols = ['date', 'outbreaks_per_million', 'illnesses_per_million']
df_us_diseases_for_merge = df_us_diseases.loc[:, disease_merge_cols].copy()

In [None]:
# Save the disease features for merging.
diseases_for_merge_csv = f'{data_file_path}/df_us_diseases_for_merge.csv'
df_us_diseases_for_merge.to_csv(diseases_for_merge_csv)

# Preprocessing for covid data

In [None]:
# Attach the population columns to the covid data; the inner join keeps
# only dates present in both frames.
df_us_covid = pd.merge(df_us_covid, df_us_population, how='inner', on='date')

In [None]:
# Average daily hospitalizations per million people.
# NOTE(review): 'population_million' presumably comes from the population
# merge -- confirm.
avg_hospitalized = df_us_covid['avg_daily_hospitalized']
df_us_covid['hospitalized_per_million'] = avg_hospitalized.div(df_us_covid['population_million'])

In [None]:
# Keep only the date and the per-million hospitalization rate for the merge file.
covid_merge_cols = ['date', 'hospitalized_per_million']
df_us_covid_for_merge = df_us_covid.loc[:, covid_merge_cols].copy()

In [None]:
# Save the covid features for merging.
covid_for_merge_csv = f'{data_file_path}/df_us_covid_for_merge.csv'
df_us_covid_for_merge.to_csv(covid_for_merge_csv)

# Preprocessing for Weather Data

In [None]:
# Show the loaded weather data before any features are derived from it.
print(df_us_weather)

In [None]:
# Average the temperatures of the big egg-producing states (IN, MO, WI)
# into one overall temperature value per row.
df_us_weather['temp_overall'] = df_us_weather[['temp_IN', 'temp_MO', 'temp_WI']].mean(axis=1)


def _rows_for_months(weather, months):
    """Return the rows of *weather* whose month code is in *months*, sorted by index.

    The month code is everything after the fifth character of the 'date'
    string, so this assumes dates look like 'YYYY-MM' -- TODO confirm
    against the input file (a 'YYYY-MM-DD' date would never match).
    """
    month_code = weather['date'].str[5:]  # computed once and reused for every month
    selected = pd.concat([weather[month_code == month] for month in months])
    return selected.sort_index()


# One frame per meteorological season, each covering three month codes.
weather_summer = _rows_for_months(df_us_weather, ['06', '07', '08'])
weather_fall = _rows_for_months(df_us_weather, ['09', '10', '11'])
weather_winter = _rows_for_months(df_us_weather, ['12', '01', '02'])
weather_spring = _rows_for_months(df_us_weather, ['03', '04', '05'])

In [None]:
# 'temp_variance' is the absolute difference between a row's overall
# temperature and the mean overall temperature of its season frame.
# Despite the column name, this is an absolute deviation, not a statistical
# variance.
for season_frame in (weather_spring, weather_summer, weather_fall, weather_winter):
    season_mean = season_frame['temp_overall'].mean()
    season_frame['temp_variance'] = (season_frame['temp_overall'] - season_mean).abs()

print(weather_winter)

In [None]:
# Stack the four season frames back together and order the rows by date.
season_frames = [weather_winter, weather_spring, weather_summer, weather_fall]
weather_over = pd.concat(season_frames).sort_values(by='date')
print(weather_over)

In [None]:
# Keep only the date, the overall temperature and its seasonal deviation
# for the merge file.
weather_merge_cols = ['date', 'temp_overall', 'temp_variance']
df_us_weather_for_merge = weather_over.loc[:, weather_merge_cols].copy()

In [None]:
# Save the weather features for merging.
weather_for_merge_csv = f'{data_file_path}/df_us_weather_for_merge.csv'
df_us_weather_for_merge.to_csv(weather_for_merge_csv)