# Is climate an important characteristic of major power outages in the United States?

**Name(s)**: Ripudh Mylapur, Chia Lee

**Website Link**: (your website link)

## Code

In [33]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

### Framing the Problem

In [3]:
import openpyxl
df = pd.read_excel(r"outage.xlsx", index_col = 1, header = 5)
df = df.drop(np.nan)
df = df.drop('variables', axis = 1)

def join_times(col_date, col_time):
    
    time = df[[col_date, col_time]]
    time = time.assign(date=pd.to_datetime(time[col_date]))
    time['date'] = time['date'].astype(str)
    time = time.assign(datetime=time['date'] + ' ' + time[col_time].astype(str))
    time = time.replace('NaT nan', np.nan)
    time = time.assign(fin=pd.to_datetime(time['datetime']))
    return time['fin']

out_start = join_times('OUTAGE.START.DATE', 'OUTAGE.START.TIME')
rest = join_times('OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME')
df = df.assign(out_start = out_start)
df = df.assign(rest_start = rest)
df = df.rename(columns = {'out_start': 'OUTAGE.START', 'rest_start': 'OUTAGE.RESTORATION'})


In [4]:
df

Unnamed: 0_level_0,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,OUTAGE.START.DATE,OUTAGE.START.TIME,...,POPDEN_URBAN,POPDEN_UC,POPDEN_RURAL,AREAPCT_URBAN,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND,OUTAGE.START,OUTAGE.RESTORATION
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,2011.0,7.0,Minnesota,MN,MRO,East North Central,-0.3,normal,2011-07-01 00:00:00,17:00:00,...,2279,1700.5,18.2,2.14,0.6,91.592666,8.407334,5.478743,2011-07-01 17:00:00,2011-07-03 20:00:00
2.0,2014.0,5.0,Minnesota,MN,MRO,East North Central,-0.1,normal,2014-05-11 00:00:00,18:38:00,...,2279,1700.5,18.2,2.14,0.6,91.592666,8.407334,5.478743,2014-05-11 18:38:00,2014-05-11 18:39:00
3.0,2010.0,10.0,Minnesota,MN,MRO,East North Central,-1.5,cold,2010-10-26 00:00:00,20:00:00,...,2279,1700.5,18.2,2.14,0.6,91.592666,8.407334,5.478743,2010-10-26 20:00:00,2010-10-28 22:00:00
4.0,2012.0,6.0,Minnesota,MN,MRO,East North Central,-0.1,normal,2012-06-19 00:00:00,04:30:00,...,2279,1700.5,18.2,2.14,0.6,91.592666,8.407334,5.478743,2012-06-19 04:30:00,2012-06-20 23:00:00
5.0,2015.0,7.0,Minnesota,MN,MRO,East North Central,1.2,warm,2015-07-18 00:00:00,02:00:00,...,2279,1700.5,18.2,2.14,0.6,91.592666,8.407334,5.478743,2015-07-18 02:00:00,2015-07-19 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1530.0,2011.0,12.0,North Dakota,ND,MRO,West North Central,-0.9,cold,2011-12-06 00:00:00,08:00:00,...,2192.2,1868.2,3.9,0.27,0.1,97.599649,2.401765,2.401765,2011-12-06 08:00:00,2011-12-06 20:00:00
1531.0,2006.0,,North Dakota,ND,MRO,West North Central,,,,,...,2192.2,1868.2,3.9,0.27,0.1,97.599649,2.401765,2.401765,NaT,NaT
1532.0,2009.0,8.0,South Dakota,SD,RFC,West North Central,0.5,warm,2009-08-29 00:00:00,22:54:00,...,2038.3,1905.4,4.7,0.3,0.15,98.307744,1.692256,1.692256,2009-08-29 22:54:00,2009-08-29 23:53:00
1533.0,2009.0,8.0,South Dakota,SD,MRO,West North Central,0.5,warm,2009-08-29 00:00:00,11:00:00,...,2038.3,1905.4,4.7,0.3,0.15,98.307744,1.692256,1.692256,2009-08-29 11:00:00,2009-08-29 14:01:00


### Baseline Model

In [5]:
unique_regions = df['CLIMATE.REGION'].value_counts()
for reg in unique_regions.index:
    df[reg] = df['CLIMATE.REGION'].apply(lambda x: 1 if x == reg else 0)

In [6]:
df['CAUSE.CATEGORY']

OBS
1.0              severe weather
2.0          intentional attack
3.0              severe weather
4.0              severe weather
5.0              severe weather
                  ...          
1530.0            public appeal
1531.0    fuel supply emergency
1532.0                islanding
1533.0                islanding
1534.0        equipment failure
Name: CAUSE.CATEGORY, Length: 1534, dtype: object

In [7]:
df1 = df[['OUTAGE.DURATION', 'CUSTOMERS.AFFECTED','ANOMALY.LEVEL', 'MONTH','NERC.REGION','CAUSE.CATEGORY', 'CLIMATE.CATEGORY']]

In [8]:
df1 = df1.dropna()
df1

Unnamed: 0_level_0,OUTAGE.DURATION,CUSTOMERS.AFFECTED,ANOMALY.LEVEL,MONTH,NERC.REGION,CAUSE.CATEGORY,CLIMATE.CATEGORY
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,3060,70000.0,-0.3,7.0,MRO,severe weather,normal
3.0,3000,70000.0,-1.5,10.0,MRO,severe weather,cold
4.0,2550,68200.0,-0.1,6.0,MRO,severe weather,normal
5.0,1740,250000.0,1.2,7.0,MRO,severe weather,warm
6.0,1860,60000.0,-1.4,11.0,MRO,severe weather,cold
...,...,...,...,...,...,...,...
1523.0,95,35000.0,0.3,6.0,WECC,system operability disruption,normal
1524.0,360,0.0,-1.3,1.0,WECC,intentional attack,cold
1525.0,1548,0.0,-0.1,6.0,WECC,public appeal,normal
1527.0,0,0.0,1.6,3.0,WECC,intentional attack,warm


In [9]:
model = LinearRegression()
y = df1[['OUTAGE.DURATION']]
X = df1[['CUSTOMERS.AFFECTED']]
model.fit(X = X, y = y)

LinearRegression()

In [10]:
preproc = ColumnTransformer(
    transformers=[
    ('ohe', OneHotEncoder(), ['MONTH','NERC.REGION','CAUSE.CATEGORY', 'CLIMATE.CATEGORY']),    
#     ('standardise', StandardScaler(), ['CUSTOMERS.AFFECTED', 'ANOMALY.LEVEL'])
    ],
    remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)


In [11]:
pl1 = Pipeline([
    ('preproc', preproc),
    ('lin-reg', LinearRegression())
    ])

In [12]:
pl1.fit(df1.drop(['OUTAGE.DURATION'], axis=1), df1['OUTAGE.DURATION'])

Pipeline(steps=[('preproc',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe', OneHotEncoder(),
                                                  ['MONTH', 'NERC.REGION',
                                                   'CAUSE.CATEGORY',
                                                   'CLIMATE.CATEGORY'])])),
                ('lin-reg', LinearRegression())])

In [13]:
pl1.score(df1.drop(['OUTAGE.DURATION'], axis=1), df1['OUTAGE.DURATION'])

0.06859977915851179

In [14]:
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))

In [15]:
all_preds = model.predict(X)
rmse(df1['OUTAGE.DURATION'], all_preds.reshape(-1))

4282.432892427123

In [16]:
all_preds.reshape(-1)

array([2546.39007736, 2546.39007736, 2539.12560128, ..., 2263.88267403,
       2263.88267403, 2403.11846567])

### Final Model

In [27]:
X_train, X_test, y_train, y_test = train_test_split(df1.drop(['OUTAGE.DURATION'], axis=1), df1['OUTAGE.DURATION'])

In [28]:
preproc = ColumnTransformer(
    transformers=[
    ('ohe', OneHotEncoder(), ['MONTH','NERC.REGION','CAUSE.CATEGORY', 'CLIMATE.CATEGORY']),    
    ('standardise', StandardScaler(), ['CUSTOMERS.AFFECTED', 'ANOMALY.LEVEL'])
    ],
    remainder='drop' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)

In [29]:
pl1 = Pipeline([
    ('preproc', preproc),
    ('lin-reg', LinearRegression())
    ])

In [30]:
pl1.fit(X_train, y_train)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                                  ['MONTH', 'NERC.REGION',
                                                   'CAUSE.CATEGORY',
                                                   'CLIMATE.CATEGORY']),
                                                 ('standardise',
                                                  StandardScaler(),
                                                  ['CUSTOMERS.AFFECTED',
                                                   'ANOMALY.LEVEL'])])),
                ('lin-reg', LinearRegression())])

In [45]:
pl1.score(X_train, y_train)

0.2996835454009322

In [46]:
pl1.score(X_test, y_test)

0.20638375790515606

In [86]:
polyreg = Pipeline([
    ('preproc', preproc),
    ('poly', PolynomialFeatures(1)),
    ('lin-reg', LinearRegression())
    ]
)

In [87]:
polyreg.fit(X_train, y_train)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                                  ['MONTH', 'NERC.REGION',
                                                   'CAUSE.CATEGORY',
                                                   'CLIMATE.CATEGORY']),
                                                 ('standardise',
                                                  StandardScaler(),
                                                  ['CUSTOMERS.AFFECTED',
                                                   'ANOMALY.LEVEL'])])),
                ('poly', PolynomialFeatures(degree=1)),
                ('lin-reg', LinearRegression())])

In [88]:
polyreg.score(X_train, y_train)

0.299683545400932

In [89]:
polyreg.score(X_test, y_test)

0.20638375790081842

### Fairness Analysis

In [None]:
# TODO