<a href="https://colab.research.google.com/github/siyasathaye/power-outage-analysis/blob/main/notebook/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Severe Weather and the Duration of Major U.S. Power Outages

**Name(s)**: Siya Sathaye & Risa Schloyer

**Website Link**: (your website link)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
# Make pandas' own `.plot` calls render with plotly rather than the default backend.
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Star import of the project's helper module; the names it adds are not visible from this notebook.

In [None]:
!pip install openpyxl



In [None]:
# Read the outage workbook; the path is resolved relative to the working directory.
outage_workbook = "outage.xlsx"
df = pd.read_excel(outage_workbook)

## Step 1: Introduction

In [None]:
# TODO
# Question: How do severe weather events influence the duration and overall impact of major power outages across
# different regions in the United States?

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
# TODO
# Clean the outage table and rebind the result to `df`.
#
# Steps that consume their own inputs (merging the date/time columns, renaming
# the duration column) are guarded, so running this cell again on the
# already-cleaned `df` no longer raises KeyError. (Caveat: a derived column
# that ended up entirely empty would be removed by the dropna below on a
# re-run.)

# Drop columns that hold no values at all, and the "OBS" column when it exists.
df = df.dropna(axis=1, how="all").drop(columns=["OBS"], errors="ignore")

# Combined timestamp column -> the (date, time) columns it is built from.
timestamp_parts = {
    "OUTAGE.START": ("OUTAGE.START.DATE", "OUTAGE.START.TIME"),
    "OUTAGE.RESTORATION": ("OUTAGE.RESTORATION.DATE", "OUTAGE.RESTORATION.TIME"),
}

for stamp_col, (date_col, time_col) in timestamp_parts.items():
    parts_consumed = date_col not in df.columns and time_col not in df.columns
    if parts_consumed and stamp_col in df.columns:
        # An earlier run of this cell already merged and dropped the parts.
        continue
    df[stamp_col] = pd.to_datetime(
        df[date_col].astype(str) + " " + df[time_col].astype(str),
        errors="coerce",  # missing or unparseable pieces become NaT
    )

# The separate date/time columns are redundant once merged.
df = df.drop(
    columns=[part for parts in timestamp_parts.values() for part in parts],
    errors="ignore",
)

# Put the unit (minutes) into the duration column's name. Renaming a column
# that is no longer present is a no-op, which keeps the cell re-runnable.
df = df.rename(columns={"OUTAGE.DURATION": "OUTAGE.DURATION.MIN"})

# Columns that must be numeric; listed under the names they have after the
# rename above, so `num_cols` matches the cleaned frame.
num_cols = [
    "YEAR",
    "MONTH",
    "ANOMALY.LEVEL",
    "OUTAGE.DURATION.MIN",
    "DEMAND.LOSS.MW",
    "CUSTOMERS.AFFECTED",
]

for col in num_cols:
    # Non-numeric entries are turned into missing values rather than raising.
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Nullable integer dtype keeps missing years/months without falling back to float.
df["YEAR"] = df["YEAR"].astype("Int64")
df["MONTH"] = df["MONTH"].astype("Int64")

In [None]:
# Histogram of outage durations with explicit axis titles.
fig = px.histogram(
    df,
    x="OUTAGE.DURATION.MIN",
    nbins=50,
    title="Distribution of Outage Duration (minutes)",
).update_layout(
    xaxis_title="Minutes",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Number of outages recorded for each cause category.
counts = df["CAUSE.CATEGORY"].value_counts()

fig = px.bar(
    x=counts.index,
    y=counts.values,
    title="Counts of Major Outages by Cause Category",
)
fig.update_xaxes(title_text="Cause category")
fig.update_yaxes(title_text="Number of outages")
fig.show()


In [None]:
# Boolean flag: True where the outage's cause category is exactly "severe weather".
df["IS_SEVERE"] = df["CAUSE.CATEGORY"].eq("severe weather")

In [None]:
# Side-by-side box plots of duration: severe-weather outages vs. all other causes.
severe_box_labels = {
    "IS_SEVERE": "Is severe weather?",
    "OUTAGE.DURATION.MIN": "Duration (minutes)",
}
fig = px.box(
    df,
    x="IS_SEVERE",
    y="OUTAGE.DURATION.MIN",
    title="Outage Duration for Severe Weather vs Other Causes",
    labels=severe_box_labels,
)
fig.show()

In [None]:
# Keep only the rows flagged as severe weather, then compare durations per NERC region.
severe_df = df.loc[df["IS_SEVERE"]]

fig = px.box(
    severe_df,
    x="NERC.REGION",
    y="OUTAGE.DURATION.MIN",
    title="Outage Duration for Severe Weather Events by NERC Region",
    labels={
        "NERC.REGION": "NERC Region",
        "OUTAGE.DURATION.MIN": "Duration (minutes)",
    },
)
fig.show()

In [None]:
# Mean, median and number of non-missing durations for each cause category.
duration_by_cause = df.groupby("CAUSE.CATEGORY")["OUTAGE.DURATION.MIN"]
cause_agg = duration_by_cause.agg(["mean", "median", "count"])
cause_agg

Unnamed: 0_level_0,mean,median,count
CAUSE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
equipment failure,1816.91,221.0,55
fuel supply emergency,13484.03,3960.0,38
intentional attack,429.98,56.0,403
islanding,200.55,77.5,44
public appeal,1468.45,455.0,69
severe weather,3883.99,2460.0,744
system operability disruption,728.87,215.0,123


In [None]:
# Same duration summary (mean, median, count), grouped by climate category.
(
    df.groupby("CLIMATE.CATEGORY")["OUTAGE.DURATION.MIN"]
      .agg(["mean", "median", "count"])
)

Unnamed: 0_level_0,mean,median,count
CLIMATE.CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cold,2656.96,816.0,463
normal,2530.98,563.0,730
warm,2817.32,881.0,283


## Step 3: Assessment of Missingness

In [None]:
# TODO

## Step 4: Hypothesis Testing

In [None]:
# TODO

## Step 5: Framing a Prediction Problem

In [None]:
# TODO

## Step 6: Baseline Model

In [None]:
# TODO

## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO