# Data Preprocessing

This preprocessing file is used to join our datasets by date.
- The date column should be named "date".
- The date column should be in "yyyy-mm" format.

# Import Libraries

In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

from datetime import datetime


# Settings

In [None]:
# Directory holding the preprocessed CSV files, one level above the
# current working directory.
data_file_path = Path("..", "data")


# Load Files

In [None]:
# Every CSV is read with its first column as the index (index_col=0).

# Target: egg prices.
df_egg_price = pd.read_csv(
    Path(data_file_path, "date_organized_egg_price._for_merge.csv"),
    index_col=0,
)

# Features: one table per data source.
df_us_disasters = pd.read_csv(
    Path(data_file_path, "df_us_disasters_for_merge.csv"),
    index_col=0,
)
df_us_diseases = pd.read_csv(
    Path(data_file_path, "df_us_diseases_for_merge.csv"),
    index_col=0,
)
df_us_covid = pd.read_csv(
    Path(data_file_path, "df_us_covid_for_merge.csv"),
    index_col=0,
)
df_us_weather = pd.read_csv(
    Path(data_file_path, "df_us_weather_for_merge.csv"),
    index_col=0,
)
df_us_avian_flu_bird = pd.read_csv(
    Path(data_file_path, "df_avian_flu_bird_for_merge.csv"),
    index_col=0,
)
df_us_avian_flu_human = pd.read_csv(
    Path(data_file_path, "df_avian_flu_human_for_merge.csv"),
    index_col=0,
)


In [None]:
# Keep only the month key plus the flock count and total flock size columns,
# rename them to the names used in the merged dataset ('yyyy_mm' becomes the
# shared 'date' key), and drop rows that are exact duplicates.
df_us_avian_flu_bird = (
    df_us_avian_flu_bird.loc[:, ['yyyy_mm', 'Flock_Count', 'Total_Flock_Size']]
    .rename(
        columns={
            'yyyy_mm': 'date',
            'Flock_Count': 'infected_flock_cnt',
            'Total_Flock_Size': 'infected_bird_cnt',
        }
    )
    .drop_duplicates()
)

In [None]:
# Rename the month key to the shared 'date' join key and give the people
# count the column name used in the merged dataset.
df_us_avian_flu_human = df_us_avian_flu_human.rename(
    {
        'yyyy_mm': 'date',
        'People_Count': 'infected_h5n1_people_cnt',
    },
    axis='columns',
)

In [None]:
# drop_duplicates() returns a new frame unless inplace=True, so the result
# must be assigned back; otherwise the duplicates stay in df_us_disasters
# and each duplicated 'date' row multiplies rows in the left merge.
df_us_disasters = df_us_disasters.drop_duplicates()

In [None]:
# Start from a copy of the egg price table (the target) and left-join each
# feature table on 'date', so every egg price row is kept even when a feature
# table has no matching date. Generic feature column names are renamed to
# source-specific ones right after the merge that introduces them.
df_full_data = (
    df_egg_price.copy()
    # Disasters
    .merge(df_us_disasters, on='date', how='left')
    .rename(
        columns={
            'adjusted_deaths': 'disaster_deaths_adjusted',
            'log_cpi_adjusted_cost': 'disaster_cost_adjusted',
            'log_unadjusted_cost': 'disaster_cost_unadjusted',
        }
    )
    # Human diseases
    .merge(df_us_diseases, on='date', how='left')
    .rename(
        columns={
            'outbreaks_per_million': 'human_outbreaks_per_million',
            'illnesses_per_million': 'human_illnesses_per_million',
        }
    )
    # COVID
    .merge(df_us_covid, on='date', how='left')
    .rename(
        columns={
            'hospitalized_per_million': 'covid_hospitalization_per_million',
        }
    )
    # Weather and avian flu (bird / human) keep their column names
    .merge(df_us_weather, on='date', how='left')
    .merge(df_us_avian_flu_bird, on='date', how='left')
    .merge(df_us_avian_flu_human, on='date', how='left')
)

In [None]:
# Second target variable: the change in price relative to the previous row.
# The first row has no predecessor, so its value is NaN at this point.
# NOTE(review): assumes rows are already in chronological order - confirm.
df_full_data['change_in_price_per_dozen'] = (
    df_full_data['price_per_dozen'].diff(periods=1)
)

In [None]:
# Replace every missing value with 0, in place. This covers the NaN that
# diff() leaves in the first row of 'change_in_price_per_dozen' and the NaNs
# produced by the left merges for dates a feature table does not contain.
# NOTE(review): 0 is also written into columns such as 'temp_overall', where
# it may read as a real measurement rather than "missing" - confirm intended.
df_full_data.fillna(0, inplace=True)

In [None]:
# Display the column names of the merged frame.
df_full_data.columns

In [None]:
# Independent copy holding the two price columns and the feature columns
# that are read through df_float_data later on.
df_float_data = df_full_data.loc[
    :,
    [
        'price_per_dozen',
        'change_in_price_per_dozen',
        'disaster_deaths_adjusted',
        'disaster_cost_adjusted',
        'human_outbreaks_per_million',
        'human_illnesses_per_million',
        'covid_hospitalization_per_million',
        'infected_flock_cnt',
        'infected_bird_cnt',
        'infected_h5n1_people_cnt',
        'temp_overall',
    ],
].copy()

In [None]:
# Scatter every feature column against the egg price, one subplot per feature
# (target on the x-axis, feature on the y-axis).
target_column = 'price_per_dozen'
# Features are all columns except the two price-derived target columns, so
# no per-column check against the target is needed inside the loop.
other_columns = [
    column
    for column in df_float_data.columns
    if column not in ('price_per_dozen', 'change_in_price_per_dozen')
]
# Size the grid from the feature list instead of a hard-coded count, so that
# adding or removing a feature column cannot index past the last subplot.
# squeeze=False always returns a 2-D array of axes; taking row 0 gives a 1-D
# array even when there is only a single feature.
fig, ax = plt.subplots(1, len(other_columns), figsize=(40, 5), squeeze=False)
ax = ax[0]
for subplot, column in zip(ax, other_columns):
    subplot.scatter(df_float_data[target_column], df_float_data[column])
    subplot.set_title(f'{target_column} vs \n{column}')
    subplot.set_xlabel(target_column)
    subplot.set_ylabel(column)
plt.show()

In [None]:
# Scatter every feature column against the change in egg price, one subplot
# per feature (target on the x-axis, feature on the y-axis).
target_column = 'change_in_price_per_dozen'
# Features are all columns except the two price-derived target columns, so
# no per-column check against the target is needed inside the loop.
other_columns = [
    column
    for column in df_float_data.columns
    if column not in ('price_per_dozen', 'change_in_price_per_dozen')
]
# Size the grid from the feature list instead of a hard-coded count, so that
# adding or removing a feature column cannot index past the last subplot.
# squeeze=False always returns a 2-D array of axes; taking row 0 gives a 1-D
# array even when there is only a single feature.
fig, ax = plt.subplots(1, len(other_columns), figsize=(40, 5), squeeze=False)
ax = ax[0]
for subplot, column in zip(ax, other_columns):
    subplot.scatter(df_float_data[target_column], df_float_data[column])
    subplot.set_title(f'{target_column} vs \n{column}')
    subplot.set_xlabel(target_column)
    subplot.set_ylabel(column)
plt.show()

In [None]:
# Write the merged dataset (index included) next to the input files.
df_full_data.to_csv(Path(data_file_path, "merged_data.csv"))

In [None]:
# Display (rows, columns) of the merged frame.
# NOTE(review): presumably compared with df_egg_price.shape to check that the
# left merges did not add rows - confirm.
df_full_data.shape

In [None]:
# Display (rows, columns) of the egg price table that the merge started from.
df_egg_price.shape