# Originally Minh's work

In [18]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

### Cleaning

In [32]:
# Read the raw export and discard its first four rows, which sit above the
# row holding the variable names.
data = pd.read_csv('outage.csv').iloc[4:]

# Row 0 now carries the variable names. Its first entry is skipped --
# presumably it belongs to the title column dropped below (TODO confirm).
cols = data.iloc[0].tolist()[1:]  # getting column names

data = (
    data
    .drop(columns="Major power outage events in the continental U.S.")
    .set_axis(cols, axis=1)
    # skip the variable-name row and the row directly beneath it
    .iloc[2:]
)
data

Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,OUTAGE.START.DATE,...,POPPCT_URBAN,POPPCT_UC,POPDEN_URBAN,POPDEN_UC,POPDEN_RURAL,AREAPCT_URBAN,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND
6,1,2011,7,Minnesota,MN,MRO,East North Central,-0.3,normal,"Friday, July 1, 2011",...,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.5926658691451,8.40733413085488,5.47874298334407
7,2,2014,5,Minnesota,MN,MRO,East North Central,-0.1,normal,"Sunday, May 11, 2014",...,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.5926658691451,8.40733413085488,5.47874298334407
8,3,2010,10,Minnesota,MN,MRO,East North Central,-1.5,cold,"Tuesday, October 26, 2010",...,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.5926658691451,8.40733413085488,5.47874298334407
9,4,2012,6,Minnesota,MN,MRO,East North Central,-0.1,normal,"Tuesday, June 19, 2012",...,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.5926658691451,8.40733413085488,5.47874298334407
10,5,2015,7,Minnesota,MN,MRO,East North Central,1.2,warm,"Saturday, July 18, 2015",...,73.27,15.28,2279,1700.5,18.2,2.14,0.6,91.5926658691451,8.40733413085488,5.47874298334407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1535,1530,2011,12,North Dakota,ND,MRO,West North Central,-0.9,cold,"Tuesday, December 6, 2011",...,59.9,19.9,2192.2,1868.2,3.9,0.27,0.1,97.5996492121418,2.40176525502843,2.40176525502843
1536,1531,2006,,North Dakota,ND,MRO,West North Central,,,,...,59.9,19.9,2192.2,1868.2,3.9,0.27,0.1,97.5996492121418,2.40176525502843,2.40176525502843
1537,1532,2009,8,South Dakota,SD,RFC,West North Central,0.5,warm,"Saturday, August 29, 2009",...,56.65,26.73,2038.3,1905.4,4.7,0.3,0.15,98.3077441776026,1.69225582239743,1.69225582239743
1538,1533,2009,8,South Dakota,SD,MRO,West North Central,0.5,warm,"Saturday, August 29, 2009",...,56.65,26.73,2038.3,1905.4,4.7,0.3,0.15,98.3077441776026,1.69225582239743,1.69225582239743


In [20]:
# Dimensions of the frame as (n_rows, n_columns)
data.shape

(1534, 56)

Data has 1534 rows and 56 columns

Let us consider which features interest us. 

Since we are focused on the duration of outages and its temporal and spatial differences, we are most interested in columns with a direct, intuitive relation to these factors.

Hence, we will remove the columns: 'OBS', 'HURRICANE.NAMES', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE',
       'TOTAL.PRICE', 'RES.SALES', 'COM.SALES', 'IND.SALES', 'TOTAL.SALES',
       'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN', 'RES.CUSTOMERS',
       'COM.CUSTOMERS', 'IND.CUSTOMERS', 'TOTAL.CUSTOMERS', 'RES.CUST.PCT',
       'COM.CUST.PCT', 'IND.CUST.PCT', 'PC.REALGSP.STATE', 'PC.REALGSP.USA',
       'PC.REALGSP.REL', 'PC.REALGSP.CHANGE', 'UTIL.REALGSP', 'TOTAL.REALGSP',
       'UTIL.CONTRI', 'PI.UTIL.OFUSA', 'POPULATION', 'POPPCT_URBAN',
       'POPPCT_UC', 'POPDEN_URBAN', 'POPDEN_UC', 'POPDEN_RURAL',
       'AREAPCT_URBAN', 'AREAPCT_UC', 'PCT_LAND', 'PCT_WATER_TOT',
       'PCT_WATER_INLAND'


<b>
Additionally
</b>
the detailed cause category (`CAUSE.CATEGORY.DETAIL`) will be monitored by us but not used in any EDA or analysis since it varies too much; the cell below drops it together with `DEMAND.LOSS.MW` and `U.S._STATE`.

In [21]:
# Columns excluded from the analysis, grouped by name pattern for readability.
dropped_columns = [
    'OBS', 'HURRICANE.NAMES', 'CUSTOMERS.AFFECTED',
    # *.PRICE
    'RES.PRICE', 'COM.PRICE', 'IND.PRICE', 'TOTAL.PRICE',
    # *.SALES
    'RES.SALES', 'COM.SALES', 'IND.SALES', 'TOTAL.SALES',
    # *.PERCEN
    'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN',
    # *.CUSTOMERS
    'RES.CUSTOMERS', 'COM.CUSTOMERS', 'IND.CUSTOMERS', 'TOTAL.CUSTOMERS',
    # *.CUST.PCT
    'RES.CUST.PCT', 'COM.CUST.PCT', 'IND.CUST.PCT',
    # PC.REALGSP.* / *.REALGSP / utility share
    'PC.REALGSP.STATE', 'PC.REALGSP.USA', 'PC.REALGSP.REL', 'PC.REALGSP.CHANGE',
    'UTIL.REALGSP', 'TOTAL.REALGSP', 'UTIL.CONTRI', 'PI.UTIL.OFUSA',
    # POPULATION / POPPCT_* / POPDEN_*
    'POPULATION', 'POPPCT_URBAN', 'POPPCT_UC',
    'POPDEN_URBAN', 'POPDEN_UC', 'POPDEN_RURAL',
    # AREAPCT_* / PCT_*
    'AREAPCT_URBAN', 'AREAPCT_UC', 'PCT_LAND', 'PCT_WATER_TOT', 'PCT_WATER_INLAND',
    # remaining columns that are not carried forward
    'DEMAND.LOSS.MW', 'CAUSE.CATEGORY.DETAIL', 'U.S._STATE',
]

# drop() raises KeyError if any listed column is absent (e.g. when this cell
# is run a second time on the already-trimmed frame).
data = data.drop(columns=dropped_columns)

Renaming the remaining columns to more readable, intuitive names

In [22]:
# Original column name -> display-friendly name.
# Note that the postal code column is the one exposed as 'State'.
readable_names = {
    'YEAR': 'Year',
    'MONTH': 'Month',
    'POSTAL.CODE': 'State',
    'NERC.REGION': 'NERC Region',
    'CLIMATE.REGION': 'Climate Region',
    'ANOMALY.LEVEL': 'Anomaly Level',
    'CLIMATE.CATEGORY': 'Climate',
    'OUTAGE.START.DATE': 'Outage Start Date',
    'OUTAGE.START.TIME': 'Outage Start Time',
    'OUTAGE.RESTORATION.DATE': 'Outage Restoration Date',
    'OUTAGE.RESTORATION.TIME': 'Outage Restoration Time',
    'CAUSE.CATEGORY': 'Cause',
    'OUTAGE.DURATION': 'Outage Duration',
}

data = data.rename(columns=readable_names)

#### Addressing Column data types
Categorical(Non-Ordinal): Year, Month, State, NERC Region, Climate Region, Climate, Cause

Timestamp: Outage Start Date, Outage Start Time, Outage Restoration Date, Outage Restoration Time

Numerical(Discrete): Outage Duration

Numerical(Continuous): Anomaly level

In [23]:
data = data.sort_values(by = "Year")

# Label-like columns are stored as (unordered) pandas categoricals.
categorical_cols = ['Year', 'Month', 'State', 'NERC Region', 'Climate Region', 'Climate', 'Cause']
for col in categorical_cols:
    data[col] = pd.Categorical(data[col])

timestamp_cols = ['Outage Start Date', 'Outage Start Time',
                  'Outage Restoration Date', 'Outage Restoration Time']

# Explicit formats so pandas does not have to guess how each column is written.
DATE_FORMAT = '%A, %B %d, %Y'  # e.g. "Friday, July 1, 2011"
# NOTE(review): assumed to match the raw time strings (12-hour clock with
# AM/PM) -- confirm against outage.csv.
TIME_FORMAT = '%I:%M:%S %p'


def _parse_datetimes(values: pd.Series, fmt: str) -> pd.Series:
    """Parse ``values`` with the explicit format ``fmt``.

    Entries that are present but do not match ``fmt`` are re-parsed with
    pandas' format inference, so a wrong or partially wrong format never
    yields fewer parsed values than a format-less ``pd.to_datetime`` call.
    Anything still unparseable becomes ``NaT``.
    """
    parsed = pd.to_datetime(values, format=fmt, errors='coerce')
    unparsed = values.notna() & parsed.isna()
    if unparsed.any():
        parsed.loc[unparsed] = pd.to_datetime(values[unparsed], errors='coerce')
    return parsed


# Each column is parsed exactly once. Time-of-day columns are parsed with the
# time format and then reduced to `datetime.time` objects; date columns stay
# as datetime64.
for col in timestamp_cols:
    is_time_col = col.endswith('Time')
    parsed = _parse_datetimes(data[col], TIME_FORMAT if is_time_col else DATE_FORMAT)
    data[col] = parsed.dt.time if is_time_col else parsed

  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')


In [24]:
# Cast the two measurement columns to numeric dtypes.
# pd.to_numeric is called with its default error handling, so a value that
# cannot be parsed raises instead of being silently coerced.
for numeric_col in ('Outage Duration', 'Anomaly Level'):
    data[numeric_col] = pd.to_numeric(data[numeric_col])

In [25]:
# Display the current state of the frame
data

Unnamed: 0,Year,Month,State,NERC Region,Climate Region,Anomaly Level,Climate,Outage Start Date,Outage Start Time,Outage Restoration Date,Outage Restoration Time,Cause,Outage Duration
772,2000,,NC,SERC,Southeast,,,NaT,NaT,NaT,NaT,severe weather,
241,2000,3,TX,TRE,South,-1.1,cold,2000-03-18,16:00:00,2000-03-18,17:10:00,system operability disruption,70.0
245,2000,,TX,FRCC,South,,,NaT,NaT,NaT,NaT,equipment failure,
307,2000,8,IN,ECAR,Central,-0.5,cold,2000-08-28,23:00:00,NaT,NaT,equipment failure,
345,2000,,AL,SERC,Southeast,,,NaT,NaT,NaT,NaT,severe weather,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2016,7,PA,RFC,Northeast,-0.3,normal,2016-07-23,15:15:00,2016-07-23,19:53:00,system operability disruption,278.0
749,2016,7,NC,SERC,Southeast,-0.3,normal,2016-07-08,20:50:00,2016-07-09,19:25:00,severe weather,1355.0
663,2016,5,UT,WECC,Southwest,0.6,warm,2016-05-19,21:36:00,2016-05-20,01:00:00,system operability disruption,204.0
661,2016,1,UT,WECC,Southwest,2.2,warm,2016-01-17,12:00:00,2016-01-17,13:00:00,intentional attack,60.0


#### Missing values

In [26]:
# Number of null entries in every column
missing = data.isna().sum(axis=0)

# Keep only the columns with at least one null, as {column name: count}
missing_cols = missing[missing > 0].astype(int).to_dict()
missing_cols

{'Month': 9,
 'Climate Region': 6,
 'Anomaly Level': 9,
 'Climate': 9,
 'Outage Start Date': 9,
 'Outage Start Time': 9,
 'Outage Restoration Date': 58,
 'Outage Restoration Time': 58,
 'Outage Duration': 58}

In [27]:
# Rows whose Year label is the string '2000'
data.loc[data['Year'] == '2000']

Unnamed: 0,Year,Month,State,NERC Region,Climate Region,Anomaly Level,Climate,Outage Start Date,Outage Start Time,Outage Restoration Date,Outage Restoration Time,Cause,Outage Duration
772,2000,,NC,SERC,Southeast,,,NaT,NaT,NaT,NaT,severe weather,
241,2000,3.0,TX,TRE,South,-1.1,cold,2000-03-18,16:00:00,2000-03-18,17:10:00,system operability disruption,70.0
245,2000,,TX,FRCC,South,,,NaT,NaT,NaT,NaT,equipment failure,
307,2000,8.0,IN,ECAR,Central,-0.5,cold,2000-08-28,23:00:00,NaT,NaT,equipment failure,
345,2000,,AL,SERC,Southeast,,,NaT,NaT,NaT,NaT,severe weather,
346,2000,12.0,AL,SERC,Southeast,-0.8,cold,2000-12-16,11:36:00,2000-12-18,18:00:00,severe weather,3264.0
347,2000,8.0,AL,SERC,Southeast,-0.5,cold,2000-08-10,21:30:00,2000-08-11,18:00:00,severe weather,1230.0
356,2000,5.0,IL,SERC,Central,-0.7,cold,2000-05-18,18:00:00,NaT,NaT,severe weather,
371,2000,,IL,SERC,Central,,,NaT,NaT,NaT,NaT,severe weather,
387,2000,8.0,IL,SERC,Central,-0.5,cold,2000-08-06,16:00:00,2000-08-07,12:00:00,severe weather,1200.0


### EDA

In [28]:
def _scatter_for_year(year):
    """Build the marker trace holding every outage recorded for ``year``."""
    rows = data[data['Year'] == year]
    return go.Scatter(
        x=rows['State'],
        y=rows['Outage Duration'],
        mode='markers',
        name=year,
        marker=dict(size=6),
        opacity=0.7,
    )


# One trace per year, in the order the years first appear in `data`
year_duration = go.Figure()
year_duration.add_traces([_scatter_for_year(label) for label in data['Year'].unique()])

year_duration.update_layout(
    title='Outage Duration by State (Colored by Year)',
    xaxis_title='State',
    yaxis_title='Outage Duration',
    showlegend=True,
)

year_duration.show()

In [29]:
month_duration = go.Figure()

# Rows with a missing month are skipped: `unique()` would otherwise yield NaN,
# and since `== NaN` never matches, that would add an empty "Month nan" legend
# entry. Months are sorted by numeric value (works for both string and numeric
# labels) so the legend runs in calendar order.
months = sorted(data['Month'].dropna().unique(), key=float)

# One marker trace per month: state on the x axis, outage duration on the y axis
for month in months:
    month_data = data[data['Month'] == month]
    month_duration.add_trace(go.Scatter(
        x=month_data['State'],
        y=month_data['Outage Duration'],
        mode='markers',
        name=f'Month {month}',
        marker=dict(size=6),
        opacity=0.7
    ))

month_duration.update_layout(
    title='Outage Duration by State (Colored by Month)',
    xaxis_title='State',
    yaxis_title='Outage Duration',
    showlegend=True
)

month_duration.show()


In [30]:
climate_fig = go.Figure()

# Rows with a missing climate region are skipped: `unique()` would otherwise
# yield NaN, and since `== NaN` never matches, that would add an empty trace
# with a NaN name to the legend.
regions = data['Climate Region'].dropna().unique()

# One marker trace per climate region: year on the x axis, outage duration on the y axis
for region in regions:
    region_data = data[data['Climate Region'] == region]
    climate_fig.add_trace(go.Scatter(
        x=region_data['Year'],
        y=region_data['Outage Duration'],
        mode='markers',
        name=region,
        marker=dict(size=6),
        opacity=0.7
    ))

climate_fig.update_layout(
    title='Outage Duration vs Year (Colored by Climate Region)',
    xaxis_title='Year',
    yaxis_title='Outage Duration',
    showlegend=True
)

climate_fig.show()