# Date Preprocessing

This notebook reformats each raw dataset so that the datasets can later be joined on a common date column.
- The date column should be named "date"
- The date column should use the format "yyyy-mm"

### Convention
- We keep this separated from the final pre-processed files. 
- We save the date preprocessed files as a csv in the data folder so we don't have to call Redshift (too expensive)

# Import Files

In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path
import functools as ft

from datetime import datetime

import redshift_connector

# Settings

In [None]:
# Folder (relative to this notebook) that holds the raw CSV inputs and
# receives the date-preprocessed CSV outputs.
data_file_path = Path("../data")

# Redshift Connection

In [None]:
from getpass import getpass

# The password is read from the REDSHIFT_PASSWORD environment variable, or
# prompted for interactively, so that no credential is stored in the notebook.
redshift_password = os.environ.get("REDSHIFT_PASSWORD") or getpass("Redshift password: ")

# Open the shared connection and cursor that the query cells below reuse.
conn = redshift_connector.connect(
    host='cspc-workgroup.783764604578.us-west-2.redshift-serverless.amazonaws.com',
    database='cspc5071-dsa',
    port=5439,
    user='python_user',
    password=redshift_password,
)
cursor = conn.cursor()

# Query to get the list of tables in the specified schema
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
AND table_type = 'BASE TABLE';
"""

cursor.execute(query)
tables = cursor.fetchall()

print("Tables in schema public:")
for table in tables:
    print(table[0])  # These are all the data we have in AWS redshift.

In [None]:
# Helper for Redshift reads: run a query and hand the result back as a dataframe
def querying_to_redshift(query, cursor):
    """Execute ``query`` on ``cursor`` and return the fetched dataframe.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        cursor: An open cursor exposing ``execute`` and ``fetch_dataframe``.

    Returns:
        Whatever ``cursor.fetch_dataframe()`` yields for the executed query.
    """
    cursor.execute(query)
    return cursor.fetch_dataframe()

# Date Preprocessing for Natural Disasters

In [None]:
def expand_date_range(row):
    """Spread one disaster record evenly over every calendar month it touches.

    Args:
        row: Mapping or Series providing ``name``, ``disaster_type``,
            ``start_date``, ``end_date``, ``cpi_adjsuted_cost``,
            ``unadjusted_cost`` and ``deaths``. The two dates may be
            ``yyyy-mm`` strings or full dates; only their year and month
            are used.

    Returns:
        A DataFrame with one row per month from the start month to the end
        month (both inclusive). ``date`` holds the ``yyyy-mm`` label and the
        two cost columns and ``deaths`` hold the record's totals divided by
        the number of months.

    Raises:
        ValueError: If ``end_date`` falls in a month before ``start_date``.
    """
    # Month periods (rather than month-start timestamps) keep the starting
    # month even when start_date is not the first day of that month.
    months = pd.period_range(row["start_date"], row["end_date"], freq="M").strftime("%Y-%m")
    num_months = len(months)

    if num_months == 0:
        # Fail with a readable message instead of dividing by zero below.
        raise ValueError(
            f"end_date {row['end_date']!r} is before start_date "
            f"{row['start_date']!r} for disaster {row['name']!r}"
        )

    return pd.DataFrame({
        "name": row["name"],
        "disaster_type": row["disaster_type"],
        "date": months,
        # NOTE(review): the input key is spelled "cpi_adjsuted_cost";
        # presumably it mirrors the upstream column name - confirm before renaming.
        "cpi_adjusted_cost": row["cpi_adjsuted_cost"] / num_months,
        "unadjusted_cost": row["unadjusted_cost"] / num_months,
        "deaths": row["deaths"] / num_months,
    })

In [None]:
# Pull the raw disaster table from Redshift.
query = """SELECT * FROM "cspc5071-dsa"."public"."us_national_disaster";"""
df_us_disasters = querying_to_redshift(query, cursor)

# Keep an untouched deep copy so the table does not have to be fetched again.
copied_df = df_us_disasters.copy()

# Convert both boundary columns from 'YYYYMMDD' to 'YYYY-MM'.
for boundary in ('start_date', 'end_date'):
    parsed = pd.to_datetime(df_us_disasters[boundary].astype(str), format='%Y%m%d')
    df_us_disasters[boundary] = parsed.dt.strftime('%Y-%m')

# Expand every disaster into one row per month, then stack the pieces.
monthly_frames = df_us_disasters.apply(expand_date_range, axis=1).to_list()
df_us_disasters = pd.concat(monthly_frames, ignore_index=True)

# Save the date-organized result.
df_us_disasters.to_csv(f"{data_file_path}/date_organized_us_disasters.csv")

# Date Preprocessing for Avian Flu in Birds and Humans

In [None]:
# Load the raw bird outbreak table from the shared data folder and preview it.
bird = pd.read_csv(f"{data_file_path}/data-table.csv")
bird.head()

In [None]:
# Parse the 'MM-DD-YYYY' outbreak dates and derive the year-month key.
bird['Outbreak Date'] = pd.to_datetime(bird['Outbreak Date'], format='%m-%d-%Y')
bird['yyyy_mm'] = bird['Outbreak Date'].dt.strftime('%Y-%m')
# Also expose the key as "date", the join column name required by this
# notebook's convention; 'yyyy_mm' is kept so existing readers keep working.
bird['date'] = bird['yyyy_mm']

In [None]:
# Preview the first rows after adding the year-month column.
bird.head()

In [None]:
# Load the raw human H5N1 case table from the shared data folder and preview it.
human = pd.read_csv(f"{data_file_path}/h5n1-flu-reported-cases.csv")
human.head()

In [None]:
# Parse the 'Day' column and derive the year-month key.
# NOTE(review): '%Y-%d-%m' reads the middle field as the day and the last
# field as the month; confirm the CSV really is year-day-month and not ISO
# year-month-day, otherwise months are silently mis-assigned.
human['Day'] = pd.to_datetime(human['Day'], format = '%Y-%d-%m')
human['yyyy_mm'] = human['Day'].dt.strftime('%Y-%m')
# Also expose the key as "date", the join column name required by this
# notebook's convention; 'yyyy_mm' is kept so existing readers keep working.
human['date'] = human['yyyy_mm']

In [None]:
# Preview the first rows after adding the year-month column.
human.head()

In [None]:
# Save both date-organized avian flu tables to the data folder.
bird.to_csv(f"{data_file_path}/date_organized_avian_flu_bird.csv")
human.to_csv(f"{data_file_path}/date_organized_avian_flu_human.csv")

In [None]:
# NOTE(review): `df_us_population` is not assigned anywhere in the visible part
# of this notebook, so this cell raises NameError unless it is defined
# elsewhere - confirm where the population table is supposed to be built.
df_us_population.to_csv(f"{data_file_path}/date_organized_us_population.csv")

# Date preprocessing for Price Data

In [None]:
# Fetch the egg price table from Redshift.
# NOTE(review): unlike the other queries in this notebook, the table name is
# not qualified with database and schema - confirm it resolves as intended.
query = """SELECT * FROM "average_egg_price";"""
df_egg_price = querying_to_redshift(query, cursor)

In [None]:
# Derive the 'yyyy-mm' join key from the observation date.
df_egg_price['date'] = pd.to_datetime(df_egg_price['observation_date']).dt.strftime('%Y-%m')

In [None]:
# Keep only the join key and the price column (as an independent copy).
df_egg_price = df_egg_price[['date', 'price_per_dozen']].copy()

In [None]:
# Save the date-organized egg prices to the data folder.
df_egg_price.to_csv(f"{data_file_path}/date_organized_egg_price_for_merge.csv")

# Date preprocessing for covid-19 data

In [None]:
# Fetch the covid hospitalization table from Redshift.
query = """SELECT * FROM "cspc5071-dsa"."public"."covid_hospitalization";"""
df_covid = querying_to_redshift(query, cursor)

In [None]:
# Keep only the United States rows, as an independent copy.
is_united_states = df_covid['entity'] == 'United States'
df_covid_us = df_covid[is_united_states].copy()

# Collapse each day to its month and store it as the 'yyyy-mm' join key.
month_of_day = pd.to_datetime(df_covid_us['day']).dt.to_period('M')
df_covid_us['date'] = month_of_day.dt.strftime('%Y-%m')

In [None]:
# Average the daily hospital occupancy within each month, then give the
# resulting column a name that reflects the aggregation.
monthly_occupancy = df_covid_us.groupby('date', as_index=False)['daily_hospital_occupancy'].mean()
df_covid_us = monthly_occupancy.rename(
    columns={'daily_hospital_occupancy': 'avg_daily_hospitalized'}
)

In [None]:
# Save the monthly US covid hospitalization averages to the data folder.
df_covid_us.to_csv(f"{data_file_path}/date_organized_us_covid.csv")

# Date preprocessing for Weather Data

In [None]:
def _read_open_meteo(csv_path, temp_column):
    """Read one open-meteo export, keeping rows from position 2 onward and the first two columns.

    The two kept columns, labelled 'latitude' and 'longitude' by the file
    header, are renamed to 'day' and ``temp_column``.
    """
    raw = pd.read_csv(csv_path)
    trimmed = raw.iloc[2:, 0:2]
    return trimmed.rename(columns={'latitude': 'day', 'longitude': temp_column})


# One temperature table per location, all sharing the 'day' column.
data_WA = _read_open_meteo(f"{data_file_path}/open-meteo-seattle.csv", 'temp_WA')
data_IN = _read_open_meteo(f"{data_file_path}/open-meteo-indiana.csv", 'temp_IN')
data_MO = _read_open_meteo(f"{data_file_path}/open-meteo-missouri.csv", 'temp_MO')
data_WI = _read_open_meteo(f"{data_file_path}/open-meteo-wisconsin.csv", 'temp_WI')

In [None]:
# Outer-join the four location tables on 'day', one after another, so a day
# present in any of them is kept.
df_weather = data_WA
for location_frame in (data_IN, data_MO, data_WI):
    df_weather = pd.merge(df_weather, location_frame, on=['day'], how='outer')

In [None]:
temperature_columns = ['temp_WA', 'temp_IN', 'temp_MO', 'temp_WI']

# Collapse each day to its month and store it as the 'yyyy-mm' join key.
month_of_day = pd.to_datetime(df_weather['day']).dt.to_period('M')
df_weather['date'] = month_of_day.dt.strftime('%Y-%m')

# Make the temperature columns numeric before averaging them per month.
df_weather[temperature_columns] = df_weather[temperature_columns].apply(pd.to_numeric)
weather_agg = df_weather.groupby('date', as_index=False)[temperature_columns].mean()

# Save the monthly averages to the data folder.
weather_agg.to_csv(f"{data_file_path}/date_organized_weather_agg.csv")

# Date Preprocessing for Gas Price Data

In [None]:
# Load the raw gas price CSV from the data folder.
df_gas = pd.read_csv(f"{data_file_path}/raw_gas_price.csv")