# Preprocessing

Use this preprocessing notebook for normalization, feature extraction, etc.
The files it produces are the ones that should be used for merging (joining with the egg price data).

### Naming
- Name the output file "<dataset>_for_merge.csv" (e.g. "df_us_disasters_for_merge.csv")
- Use the `Path` to the data folder when storing the file

# Import Files

In [22]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

from datetime import datetime

# Settings

In [23]:
# Folder (relative to the working directory) that holds the preprocessed data files.
data_file_path = Path("..") / "data"

# Load Files

In [45]:
# Load the date-preprocessed disaster data; the CSV's first column becomes the index.
disasters_csv = data_file_path / "date_organized_us_disasters.csv"
df_us_disasters = pd.read_csv(disasters_csv, index_col=0)

# Preprocessing for Natural Disaster

In [46]:
# Spread each event's totals (costs, deaths) evenly over the rows recorded for it.
# 'count' tallies the non-null 'date' entries per (name, disaster_type) group —
# presumably one row per month of the event; TODO confirm against the date-organizing step.
# NOTE(review): distinct events sharing the same name and type would be pooled together — verify.
event_keys = ['name', 'disaster_type']
df_us_disasters['event_months'] = df_us_disasters.groupby(event_keys)['date'].transform('count')

# total column -> per-month column
per_month_columns = {
    'cpi_adjusted_cost': 'adjusted_cpi_cost',
    'unadjusted_cost': 'adjusted_unadjusted_cost',
    'deaths': 'adjusted_deaths',
}
for total_col, monthly_col in per_month_columns.items():
    df_us_disasters[monthly_col] = df_us_disasters[total_col].div(df_us_disasters['event_months'])

# Log-scale the per-month costs; log1p computes log(1 + x), so a cost of 0 maps to 0.
log_columns = {
    'adjusted_cpi_cost': 'log_cpi_adjusted_cost',
    'adjusted_unadjusted_cost': 'log_unadjusted_cost',
}
for monthly_col, log_col in log_columns.items():
    df_us_disasters[log_col] = np.log1p(df_us_disasters[monthly_col])

# Remove the raw totals, the intermediate helper columns and the event name;
# the per-month deaths and the two log-cost columns stay.
columns_to_discard = [
    'cpi_adjusted_cost',
    'unadjusted_cost',
    'event_months',
    'adjusted_cpi_cost',
    'adjusted_unadjusted_cost',
    'deaths',
    'name',
]
df_us_disasters = df_us_disasters.drop(columns=columns_to_discard)

In [47]:
# One-hot encode 'disaster_type' as integer 0/1 indicator columns named
# 'disaster_type_<value>'. drop_first=True omits the first category, so that
# category is represented by all indicator columns being 0.
# Requesting dtype=int here avoids re-selecting the new columns by a name
# substring afterwards, which could also catch unrelated columns.
df_us_disasters = pd.get_dummies(
    df_us_disasters,
    columns=['disaster_type'],
    drop_first=True,
    dtype=int,
)

In [51]:
# Write the merge-ready table first. to_csv returns None when given a path, so
# the trailing head() is the cell's last expression and its preview is shown
# (previously head() ran before to_csv and its result was discarded).
df_us_disasters.to_csv(data_file_path / 'df_us_disasters_for_merge.csv')
df_us_disasters.head()  # Use this data to join