# Predicting the Severity of Major Power Outages in the U.S.

**Name(s)**: Pratham Aggarwal

**Website Link**: https://pratham-aggr.github.io/power_outage

In [115]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.io as pio
from dsc80_utils import *
from project import *
import folium
import requests
from scipy import stats
import plotly.express as px
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Step 1: Introduction

In [117]:
# Data dictionary for this dataset:
# https://www.sciencedirect.com/science/article/pii/S2352340918307182
fp = Path('data', 'outage.csv')
# skiprows=5 makes pandas skip the first five lines of the file before parsing.
raw_df = pd.read_csv(fp, skiprows=5)

## Step 2: Data Cleaning and Exploratory Data Analysis

In [118]:
# Clean a deep copy so raw_df stays exactly as it was loaded.
df = raw_df.copy(deep=True)

# The first row holds a unit label per column. Fold each non-missing label into
# the column name as "name (unit)", drop that row, and lowercase every name.
units = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)
df.columns = [
    f"{name} ({unit})" if pd.notna(unit) else name
    for name, unit in zip(df.columns, units)
]
df.columns = df.columns.str.lower()

# Variable groupings come from the project helper; numeric ones are cast to float.
categorical_vars, numeric_vars, datetime_vars = get_variable_lists()
df = df.astype({name: float for name in numeric_vars})

# Shorter names for the four date/time columns before combining them.
df = df.rename(columns={
    "outage.start.date (day of the week, month day, year)": "start_date",
    "outage.start.time (hour:minute:second (am / pm))": "start_time",
    "outage.restoration.date (day of the week, month day, year)": "restore_date",
    "outage.restoration.time (hour:minute:second (am / pm))": "restore_time"
})

# Combine each date/time pair into a single timestamp, then derive the outage
# duration in hours from the two timestamps.
fmt = "%A, %B %d, %Y %I:%M:%S %p"
start_stamp = df["start_date"] + " " + df["start_time"]
restore_stamp = df["restore_date"] + " " + df["restore_time"]
df["outage_start"] = pd.to_datetime(start_stamp, format=fmt)
df["outage_restore"] = pd.to_datetime(restore_stamp, format=fmt)
outage_span = df["outage_restore"] - df["outage_start"]
df["dur_hours"] = outage_span.dt.total_seconds() / 3600

# Columns dropped as redundant or unused for this analysis:
#   - the separate date/time parts and year/month (covered by the timestamps)
#   - hurricane.names (not used in the analysis)
#   - outage.duration (mins) (redundant with outage_start / outage_restore)
#   - obs and the units-label column
#   - postal.code (state abbreviation, redundant state information)
redundant_cols = [
    "start_date",
    "start_time",
    "restore_date",
    "restore_time",
    "year",
    "month",
    "hurricane.names",
    "outage.duration (mins)",
    'obs',
    'variables (units)',
    'postal.code'
]
df = df.drop(columns=redundant_cols)

In [119]:
# Bar charts for the categorical variables. plot_multiple_bars arrives through
# the star-imports at the top of the notebook; its exact layout is not shown here.
plot_multiple_bars(df, categorical_vars, title='Distributions')

In [120]:
# One bar chart per column of interest; the second call passes 'red' as a
# third positional argument (presumably the bar colour — confirm in the helper).
fig1 = plot_single_bar(df, 'cause.category')
fig2 = plot_single_bar(df, 'climate.region', 'red')

# Render both figures, in creation order.
fig1.show()
fig2.show()

In [121]:
# Box plot of outage duration (hours) for each cause category.
# Reference: https://plotly.com/python/box-plots
fig = px.box(
    df,
    x='cause.category',
    y='dur_hours',
    # Spelling fixed: this string is shown to the reader as the figure title.
    title='Average Outage Duration by Cause Category'
)

fig.update_layout(
    height=500,
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)'
)

# Log-scale y axis: on a linear axis the plot is not legible.
fig.update_yaxes(type='log')
fig.show()

In [None]:
# State-level map of dur_hours aggregated with 'mean'. The helper is defined
# outside this notebook, so the aggregation details should be confirmed there.
plot_state_choropleth(df, 'dur_hours', aggfunc='mean')

## Step 3: Assessment of Missingness

In [None]:
# Columns whose distributions are plotted below. Despite its name, `figs`
# holds column names, not figure objects.
figs = ['pct_water_tot (%)', 'com.sales (megawatt-hour)']

# NOTE(review): group_col is the raw 'anomaly.level (numeric)' column while the
# two groups are True / False. If create_kde_plotly selects rows with
# `df[group_col] == group`, this compares rows whose anomaly level equals 1 vs 0
# rather than rows where anomaly level is missing vs. present — confirm against
# the helper's implementation before relying on these plots for missingness.
for col in figs:
    fig = create_kde_plotly(
        df = df, 
        group_col = 'anomaly.level (numeric)', 
        group1 = True, 
        group2 = False, 
        vals_col = col, 
        title=f'KDE: anomaly.level vs {col}'
    )
    # Fixed figure size so the two plots render consistently.
    fig.update_layout(
    width=600,
    height=500
    )
    fig.show()


def ks_perm_test(df, col1, col2, n_perms=10_000, rng=None):
    """Permutation test: does `col2` look different when `col1` is missing?

    Rows are split by whether `col1` is NaN. The test statistic is the
    two-sample Kolmogorov-Smirnov statistic between the `col2` values of the
    two groups (NaNs in `col2` are dropped). The null distribution is built by
    shuffling the pooled `col2` values and re-splitting them at the original
    group size.

    Parameters
    ----------
    df : pandas.DataFrame
    col1 : column label whose missingness defines the two groups.
    col2 : column label whose distribution is compared across the groups.
    n_perms : int, default 10_000
        Number of permutations to simulate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (the previous behaviour). Pass e.g. ``np.random.default_rng(42)``
        for a reproducible p-value.

    Returns
    -------
    numpy.float64
        Fraction of simulated statistics that are >= the observed statistic.

    Raises
    ------
    ValueError
        If `n_perms` is smaller than 1. scipy's ``ks_2samp`` also raises when
        either group is empty.
    """
    if n_perms < 1:
        raise ValueError("n_perms must be at least 1")

    is_missing = df[col1].isna()
    missing = df.loc[is_missing, col2].dropna().values
    not_missing = df.loc[~is_missing, col2].dropna().values
    obs = stats.ks_2samp(missing, not_missing).statistic

    # Fall back to the global NumPy state so existing calls behave as before.
    permute = np.random.permutation if rng is None else rng.permutation
    comb = np.concatenate([missing, not_missing])
    n_missing = len(missing)

    perm_stats = np.empty(n_perms)
    for i in range(n_perms):
        perm = permute(comb)
        perm_stats[i] = stats.ks_2samp(perm[:n_missing], perm[n_missing:]).statistic

    return np.mean(perm_stats >= obs)
    
# Missingness of anomaly.level is tested against two other columns, in this
# order: 'pct_water_tot (%)' first, then 'util.contri (%)'.
pval_no, pval_yes = (
    ks_perm_test(df, 'anomaly.level (numeric)', other_col)
    for other_col in ('pct_water_tot (%)', 'util.contri (%)')
)

print(pval_no, pval_yes)

0.8247 0.0001


## Step 4: Hypothesis Testing

In [124]:
# Impute missing values in climate.category and dur_hours.
pct_empty = df['climate.category'].isna().mean() * 100

# Only a small share of climate.category is missing (about 0.5% when this was
# written), so missing entries are filled by sampling from the observed labels
# with the probabilities seen in the data.
category_freqs = df['climate.category'].value_counts(normalize=True)
probs = category_freqs.values

dct = {
    'normal': 0,
    'cold': 1,
    'warm': 2
}
# Caution: integer codes imply an ordering that may bias a model, so one-hot
# encoding will be used in the modeling step instead.

# Draw one label PER missing row. The earlier version drew a single integer
# and used it for every missing row. That integer indexed value_counts() order
# but was read as a `dct` code, and the two orders are not guaranteed to match.
is_missing = df['climate.category'].isna()
imputed_category = df['climate.category'].copy()
imputed_category.loc[is_missing] = np.random.choice(
    category_freqs.index.to_numpy(),
    size=int(is_missing.sum()),
    p=probs
)
# Direct dict lookup so an unexpected label raises KeyError instead of
# silently turning into NaN.
df['climate.category.encoded'] = imputed_category.map(lambda label: dct[label])

# The hypothesis test below compares dur_hours across climate categories.
# Missing durations are filled with the median rather than the mean because
# the duration distribution is skewed.
median = df['dur_hours'].median()
df['dur_hours'] = df['dur_hours'].fillna(median)

In [131]:
# Permutation-test setup: does outage duration depend on climate.category?
# 'climate.category.encoded' uses 0 for 'normal', so the flag is True exactly
# for rows whose code is 0.
df['is_normal_climate'] = df['climate.category.encoded'] == 0

fig = create_kde_plotly(
    df = df,
    group_col = 'is_normal_climate',
    group1 = True,
    group2 = False,
    vals_col = 'dur_hours',
    # The old title was copied from the missingness plots: it named
    # anomaly.level and interpolated a loop variable left over from an
    # earlier cell. It now names what this figure actually shows.
    title='KDE: dur_hours by is_normal_climate'
)
fig.update_layout(
    width=600,
    height=500
)

fig.show()
# Since the two distributions have a similar shape, the KS statistic is used
# as the test statistic.

def ks_perm_test_gen(df, col1, group, col2, n_perms=10_000, rng=None):
    """Permutation test comparing `col2` between one group and everyone else.

    Rows where ``df[col1] == group`` form the first sample and rows where
    ``df[col1] != group`` form the second (so rows with NaN in `col1` fall in
    the second sample). NaNs in `col2` are dropped. The test statistic is the
    two-sample Kolmogorov-Smirnov statistic; the null distribution is built by
    shuffling the pooled values and re-splitting at the first sample's size.

    Parameters
    ----------
    df : pandas.DataFrame
    col1 : column label holding the group labels.
    group : value of `col1` that identifies the first sample.
    col2 : column label whose distribution is compared.
    n_perms : int, default 10_000
        Number of permutations to simulate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (the previous behaviour). Pass e.g. ``np.random.default_rng(42)``
        for a reproducible p-value.

    Returns
    -------
    numpy.float64
        Fraction of simulated statistics that are >= the observed statistic.

    Raises
    ------
    ValueError
        If `n_perms` is smaller than 1. scipy's ``ks_2samp`` also raises when
        either sample is empty.
    """
    if n_perms < 1:
        raise ValueError("n_perms must be at least 1")

    in_group = df[col1] == group
    g1 = df.loc[in_group, col2].dropna().values
    g2 = df.loc[df[col1] != group, col2].dropna().values
    obs = stats.ks_2samp(g1, g2).statistic

    # Fall back to the global NumPy state so existing calls behave as before.
    permute = np.random.permutation if rng is None else rng.permutation
    comb = np.concatenate([g1, g2])
    n_g1 = len(g1)

    perm_stats = np.empty(n_perms)
    for i in range(n_perms):
        perm = permute(comb)
        perm_stats[i] = stats.ks_2samp(perm[:n_g1], perm[n_g1:]).statistic

    return np.mean(perm_stats >= obs)

# p-value for dur_hours: rows flagged is_normal_climate == True vs. all other rows.
ks_perm_test_gen(df, col1='is_normal_climate', group=True, col2='dur_hours')

np.float64(0.0557)

## Step 5: Framing a Prediction Problem

In [126]:
# TODO

## Step 6: Baseline Model

In [158]:
# TODO

## Step 7: Final Model

In [128]:
# TODO

## Step 8: Fairness Analysis

In [129]:
# TODO