# Is climate an important characteristic of major power outages in the United States?

**Name(s)**: Ripudh Mylapur, Chia Lee

**Website Link**: (your website link)

## Code

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from sklearn.linear_model import LinearRegression

### Framing the Problem

In [2]:
# Not referenced directly below; presumably imported so that a missing
# Excel engine fails early -- confirm before removing.
import openpyxl

# header=5 takes the column names from the sixth row of the sheet and
# index_col=1 uses the second column as the row index.
df = (
    pd.read_excel(r"outage.xlsx", index_col=1, header=5)
    # Remove the row(s) whose index label is NaN (presumably the units row
    # that sits under the header -- confirm against the spreadsheet).
    .drop(index=np.nan)
    # Remove the 'variables' column.
    .drop(columns='variables')
)

def join_times(col_date, col_time, frame=None):
    """Combine a date column and a time column into one datetime Series.

    Parameters
    ----------
    col_date : str
        Name of the column holding the calendar date.
    col_time : str
        Name of the column holding the time of day.
    frame : pandas.DataFrame, optional
        Table to read both columns from. Defaults to the notebook-level
        ``df`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.Series
        Series named ``'fin'``, indexed like ``frame``, holding the parsed
        timestamps. A row that is missing its date, its time, or both is NaT.
    """
    source = df if frame is None else frame

    date_part = pd.to_datetime(source[col_date])
    time_part = source[col_time]

    # Build "<date> <time>" text from the string form of each piece.
    stamp_text = date_part.astype(str) + ' ' + time_part.astype(str)

    # Blank out every row with a missing piece before parsing. Matching only
    # the literal 'NaT nan' would let half-missing rows such as
    # '2011-07-01 nan' reach the parser.
    incomplete = date_part.isna() | time_part.isna()
    stamp_text = stamp_text.mask(incomplete)

    return pd.to_datetime(stamp_text).rename('fin')

# Merge each date/time column pair into a single timestamp Series.
out_start = join_times('OUTAGE.START.DATE', 'OUTAGE.START.TIME')
rest = join_times('OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME')

# Attach both Series under temporary keyword-safe names, then rename them to
# the dotted column names used by the rest of the table.
df = (
    df.assign(out_start=out_start, rest_start=rest)
    .rename(columns={'out_start': 'OUTAGE.START', 'rest_start': 'OUTAGE.RESTORATION'})
)


### Baseline Model

In [3]:
# One 0/1 indicator column per distinct climate region, named after the region.
unique_regions = df['CLIMATE.REGION'].value_counts()
for reg in unique_regions.index:
    # Rows whose region is missing compare unequal and therefore get 0.
    df[reg] = df['CLIMATE.REGION'].eq(reg).astype('int64')

In [8]:
# Inspect the outage cause recorded for each observation.
df.loc[:, 'CAUSE.CATEGORY']

OBS
1.0              severe weather
2.0          intentional attack
3.0              severe weather
4.0              severe weather
5.0              severe weather
                  ...          
1530.0            public appeal
1531.0    fuel supply emergency
1532.0                islanding
1533.0                islanding
1534.0        equipment failure
Name: CAUSE.CATEGORY, Length: 1534, dtype: object

In [5]:
# Modelling table: the target, the numeric feature, and the nine
# climate-region indicator columns.
df1 = df.loc[:, [
    'OUTAGE.DURATION', 'CUSTOMERS.AFFECTED',
    'Northeast', 'South', 'West', 'Central', 'Southeast',
    'East North Central', 'Northwest', 'Southwest', 'West North Central',
]]

In [6]:
# Keep only the rows in which every selected column has a value.
df1 = df1.dropna(axis=0, how='any')
df1

Unnamed: 0_level_0,OUTAGE.DURATION,CUSTOMERS.AFFECTED,Northeast,South,West,Central,Southeast,East North Central,Northwest,Southwest,West North Central
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,3060,70000.0,0,0,0,0,0,1,0,0,0
3.0,3000,70000.0,0,0,0,0,0,1,0,0,0
4.0,2550,68200.0,0,0,0,0,0,1,0,0,0
5.0,1740,250000.0,0,0,0,0,0,1,0,0,0
6.0,1860,60000.0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1523.0,95,35000.0,0,0,0,0,0,0,1,0,0
1524.0,360,0.0,0,0,0,0,0,0,1,0,0
1525.0,1548,0.0,0,0,0,0,0,0,1,0,0
1527.0,0,0.0,0,0,0,0,0,0,1,0,0


In [7]:
# Features: customers affected plus the nine climate-region indicators.
X = df1[[
    'CUSTOMERS.AFFECTED',
    'Northeast', 'South', 'West', 'Central', 'Southeast',
    'East North Central', 'Northwest', 'Southwest', 'West North Central',
]]
# Target: outage duration, selected as a one-column frame rather than a Series.
y = df1[['OUTAGE.DURATION']]

model = LinearRegression()
model.fit(X, y)

LinearRegression()

In [56]:
def rmse(actual, pred):
    """Return the root-mean-squared error between ``actual`` and ``pred``.

    Both arguments must support element-wise subtraction with each other
    (for example NumPy arrays or a pandas Series paired with an array).
    """
    residuals = actual - pred
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

In [57]:
# Predict on the same rows the model was fitted on, then score those
# predictions (flattened to 1-D) against the observed durations.
all_preds = model.predict(X)
rmse(df1['OUTAGE.DURATION'], all_preds.ravel())

4375.282222460732

In [53]:
# Show the predictions as a flat 1-D array.
all_preds.ravel()

array([3783.263788  , 3783.263788  , 3776.01621183, ..., 2089.84348567,
       2089.84348567, 1446.96868626])

### Final Model

In [None]:
# TODO

### Fairness Analysis

In [None]:
# TODO