# Cleaning Data

In [1]:
import pandas as pd



## Load the Internet Exchange Points Data

Source: Internet Society

In [2]:
IXP_by_City = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/IXPs_by_City.csv",
                          sep=',',           
                          quotechar='"',     
                          skipinitialspace=True)
IXP_by_City

Unnamed: 0,IXP Name,Location
0,48 IX,"Phoenix, AZ"
1,ABQIX,"Albuquerque, NM"
2,AlaskaIX,"Anchorage, AK"
3,Amateur Radio Internet Exchange - ARIX,"Fremont, CA"
4,Amateur Radio Internet Exchange - ARIX,"Portland, OR"
...,...,...
194,Twin Ports Internet Exchange - TP-IX,"Duluth, MN"
195,West Virginia Internet Exchange,"South Charleston, WV"
196,Willamette Internet Exchange,"Eugene, OR"
197,Yellowstone Regional Internet eXchange - YRIX,"Billings, MT"


In [3]:
IXP_by_City.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   IXP Name  199 non-null    object
 1   Location  199 non-null    object
dtypes: object(2)
memory usage: 3.2+ KB


In [4]:
IXP_by_City['Location'] = IXP_by_City['Location'].str.strip()

Split the Location column into City and State columns

In [5]:
IXP_by_City[['City', 'State']] = IXP_by_City['Location'].str.rsplit(',', n=1, expand=True)
IXP_by_City


Unnamed: 0,IXP Name,Location,City,State
0,48 IX,"Phoenix, AZ",Phoenix,AZ
1,ABQIX,"Albuquerque, NM",Albuquerque,NM
2,AlaskaIX,"Anchorage, AK",Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,"Fremont, CA",Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,"Portland, OR",Portland,OR
...,...,...,...,...
194,Twin Ports Internet Exchange - TP-IX,"Duluth, MN",Duluth,MN
195,West Virginia Internet Exchange,"South Charleston, WV",South Charleston,WV
196,Willamette Internet Exchange,"Eugene, OR",Eugene,OR
197,Yellowstone Regional Internet eXchange - YRIX,"Billings, MT",Billings,MT


In [6]:
# Clean up any whitespace spaces in the new columns
IXP_by_City['City'] = IXP_by_City['City'].str.strip()
IXP_by_City['State'] = IXP_by_City['State'].str.strip()

IXP_by_City.head()

Unnamed: 0,IXP Name,Location,City,State
0,48 IX,"Phoenix, AZ",Phoenix,AZ
1,ABQIX,"Albuquerque, NM",Albuquerque,NM
2,AlaskaIX,"Anchorage, AK",Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,"Fremont, CA",Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,"Portland, OR",Portland,OR


In [7]:
#Drop the Location column
IXP_by_City = IXP_by_City.drop(columns=['Location'])
IXP_by_City.head()


Unnamed: 0,IXP Name,City,State
0,48 IX,Phoenix,AZ
1,ABQIX,Albuquerque,NM
2,AlaskaIX,Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,Portland,OR


In [8]:
pd.set_option('display.max_rows', None)

In [9]:
pd.reset_option('display.max_rows', 10)
IXP_by_City

Unnamed: 0,IXP Name,City,State
0,48 IX,Phoenix,AZ
1,ABQIX,Albuquerque,NM
2,AlaskaIX,Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,Portland,OR
...,...,...,...
194,Twin Ports Internet Exchange - TP-IX,Duluth,MN
195,West Virginia Internet Exchange,South Charleston,WV
196,Willamette Internet Exchange,Eugene,OR
197,Yellowstone Regional Internet eXchange - YRIX,Billings,MT


In [10]:
IXP_by_City.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Cleaned_IXP.csv')

# Load in State Emissions Data

In [11]:
emissions = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/EPA_State_Emissions_2023.csv")

emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REPORTING YEAR                   7511 non-null   int64  
 1   FACILITY NAME                    7511 non-null   object 
 2   GHGRP ID                         7511 non-null   int64  
 3   REPORTED ADDRESS                 7030 non-null   object 
 4   LATITUDE                         7511 non-null   float64
 5   LONGITUDE                        7511 non-null   float64
 6   CITY                             7511 non-null   object 
 7   COUNTY                           6710 non-null   object 
 8   STATE                            7511 non-null   object 
 9   ZIP CODE                         7511 non-null   int64  
 10  PARENT COMPANIES                 7510 non-null   object 
 11  GHG QUANTITY (METRIC TONS CO2e)  7511 non-null   int64  
 12  SUBPARTS            

This dataset shows the Greenhouse Gas Missions from Large facilities broken down by city, county, and state.

In [12]:
emissions.head()

Unnamed: 0,REPORTING YEAR,FACILITY NAME,GHGRP ID,REPORTED ADDRESS,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e),SUBPARTS
0,2023,(430) Civitas Resources - Permian Basin,1014785,555 17th St. Suite 3700,39.74522,-104.989197,Denver,,CO,80202,CIVITAS RESOURCES INC (100%),844548,W
1,2023,(540) Civitas Resources - Denver Basin,1000355,555 17th St. Suite 3700,39.74431,-104.98858,Denver,,CO,80202,CIVITAS RESOURCES INC (100%),887487,W
2,2023,(540) Civitas Resources - GB - Denver Basin,1014558,555 17th St. Suite 3700,39.745822,-104.989243,Denver,,CO,80202,CIVITAS RESOURCES INC (100%),110747,W
3,2023,121 REGIONAL DISPOSAL FACILITY,1004377,3820 SAM RAYBURN HIGHWAY,33.29857,-96.53586,MELISSA,COLLIN COUNTY,TX,75454,NORTH TEXAS MUNICIPAL WATER DISTRICT (100%),288302,HH
4,2023,15-18565/15-18662,1010040,1021 Tori Drive,37.274127,-83.239034,Hazard,PERRY COUNTY,KY,40701,CAMBRIAN COAL LLC (100%),122327,FF


In [13]:
emissions['COUNTY'].fillna('Unknown', inplace=True)
emissions.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  emissions['COUNTY'].fillna('Unknown', inplace=True)


Unnamed: 0,REPORTING YEAR,FACILITY NAME,GHGRP ID,REPORTED ADDRESS,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e),SUBPARTS
0,2023,(430) Civitas Resources - Permian Basin,1014785,555 17th St. Suite 3700,39.74522,-104.989197,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC (100%),844548,W
1,2023,(540) Civitas Resources - Denver Basin,1000355,555 17th St. Suite 3700,39.74431,-104.98858,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC (100%),887487,W
2,2023,(540) Civitas Resources - GB - Denver Basin,1014558,555 17th St. Suite 3700,39.745822,-104.989243,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC (100%),110747,W
3,2023,121 REGIONAL DISPOSAL FACILITY,1004377,3820 SAM RAYBURN HIGHWAY,33.29857,-96.53586,MELISSA,COLLIN COUNTY,TX,75454,NORTH TEXAS MUNICIPAL WATER DISTRICT (100%),288302,HH
4,2023,15-18565/15-18662,1010040,1021 Tori Drive,37.274127,-83.239034,Hazard,PERRY COUNTY,KY,40701,CAMBRIAN COAL LLC (100%),122327,FF


In [14]:
emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REPORTING YEAR                   7511 non-null   int64  
 1   FACILITY NAME                    7511 non-null   object 
 2   GHGRP ID                         7511 non-null   int64  
 3   REPORTED ADDRESS                 7030 non-null   object 
 4   LATITUDE                         7511 non-null   float64
 5   LONGITUDE                        7511 non-null   float64
 6   CITY                             7511 non-null   object 
 7   COUNTY                           7511 non-null   object 
 8   STATE                            7511 non-null   object 
 9   ZIP CODE                         7511 non-null   int64  
 10  PARENT COMPANIES                 7510 non-null   object 
 11  GHG QUANTITY (METRIC TONS CO2e)  7511 non-null   int64  
 12  SUBPARTS            

In [15]:
emissions['PARENT COMPANIES'] = emissions['PARENT COMPANIES'].str[:-6]
emissions['PARENT COMPANIES'] = emissions['PARENT COMPANIES'].str.strip()
emissions.head()



Unnamed: 0,REPORTING YEAR,FACILITY NAME,GHGRP ID,REPORTED ADDRESS,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e),SUBPARTS
0,2023,(430) Civitas Resources - Permian Basin,1014785,555 17th St. Suite 3700,39.74522,-104.989197,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,844548,W
1,2023,(540) Civitas Resources - Denver Basin,1000355,555 17th St. Suite 3700,39.74431,-104.98858,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,887487,W
2,2023,(540) Civitas Resources - GB - Denver Basin,1014558,555 17th St. Suite 3700,39.745822,-104.989243,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,110747,W
3,2023,121 REGIONAL DISPOSAL FACILITY,1004377,3820 SAM RAYBURN HIGHWAY,33.29857,-96.53586,MELISSA,COLLIN COUNTY,TX,75454,NORTH TEXAS MUNICIPAL WATER DISTRICT,288302,HH
4,2023,15-18565/15-18662,1010040,1021 Tori Drive,37.274127,-83.239034,Hazard,PERRY COUNTY,KY,40701,CAMBRIAN COAL LLC,122327,FF


In [16]:
#There are 52 unique states, this must include territories
len(emissions['STATE'].value_counts().unique())

52

In [17]:
emissions['STATE'].value_counts()

STATE
TX    1307
LA     410
CA     407
PA     328
OK     275
OH     269
MI     226
CO     222
IL     222
NY     217
IN     198
FL     182
AL     172
GA     170
IA     159
KY     140
VA     135
MN     134
WI     133
NC     130
TN     129
WV     128
MS     117
MO     111
KS     110
WA     105
NM      99
AR      98
SC      97
AZ      94
NJ      85
UT      80
NE      77
ND      77
AK      71
WY      69
MA      68
OR      58
MD      54
NV      45
CT      41
SD      39
ID      37
MT      32
ME      29
HI      27
PR      23
DE      20
NH      17
RI      11
GU       9
VT       7
DC       6
VI       5
Name: count, dtype: int64

Texas has the most reported facilities, over 1000 

In [18]:
emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REPORTING YEAR                   7511 non-null   int64  
 1   FACILITY NAME                    7511 non-null   object 
 2   GHGRP ID                         7511 non-null   int64  
 3   REPORTED ADDRESS                 7030 non-null   object 
 4   LATITUDE                         7511 non-null   float64
 5   LONGITUDE                        7511 non-null   float64
 6   CITY                             7511 non-null   object 
 7   COUNTY                           7511 non-null   object 
 8   STATE                            7511 non-null   object 
 9   ZIP CODE                         7511 non-null   int64  
 10  PARENT COMPANIES                 7510 non-null   object 
 11  GHG QUANTITY (METRIC TONS CO2e)  7511 non-null   int64  
 12  SUBPARTS            

In [19]:
# Make sure they are object types
emissions[['FACILITY NAME','CITY','COUNTY','PARENT COMPANIES']] = emissions[['FACILITY NAME','CITY','COUNTY','PARENT COMPANIES']].astype(object)


In [20]:
emissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REPORTING YEAR                   7511 non-null   int64  
 1   FACILITY NAME                    7511 non-null   object 
 2   GHGRP ID                         7511 non-null   int64  
 3   REPORTED ADDRESS                 7030 non-null   object 
 4   LATITUDE                         7511 non-null   float64
 5   LONGITUDE                        7511 non-null   float64
 6   CITY                             7511 non-null   object 
 7   COUNTY                           7511 non-null   object 
 8   STATE                            7511 non-null   object 
 9   ZIP CODE                         7511 non-null   int64  
 10  PARENT COMPANIES                 7510 non-null   object 
 11  GHG QUANTITY (METRIC TONS CO2e)  7511 non-null   int64  
 12  SUBPARTS            

In [21]:
emissions['REPORTED ADDRESS'].isna().value_counts()

REPORTED ADDRESS
False    7030
True      481
Name: count, dtype: int64

In [22]:
emissions[emissions['REPORTED ADDRESS'].isna()]

Unnamed: 0,REPORTING YEAR,FACILITY NAME,GHGRP ID,REPORTED ADDRESS,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e),SUBPARTS
24,2023,3M CO,1005043,,44.951000,-92.995900,MAPLEWOOD,RAMSEY COUNTY,MN,55144,3M CO,77909,C
63,2023,AC 857 A,1002685,,26.295556,-94.897778,Offshore,Unknown,TX,0,SHELL PETROLEUM INC,180279,"C,W"
123,2023,"AIR PRODUCTS & CHEMICALS, INC, Tesoro Martinez",1002122,,38.026667,-122.067222,MARTINEZ,CONTRA COSTA COUNTY,CA,94553,AIR PRODUCTS & CHEMICALS INC,34812,P
129,2023,AIR PRODUCTS TEXAS CITY FACILITY,1014807,,29.381111,-94.899672,Texas City,GALVESTON COUNTY,TX,77590,AIR PRODUCTS & CHEMICALS INC,41289,P
141,2023,ALCOA INC - WARRICK OPERATIONS,1001877,,37.915000,-87.332800,NEWBURGH,WARRICK COUNTY,IN,47629,ALCOA CORP,4867519,"C,D,F"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7380,2023,White Oak Resources Mine No. 1/Hamilton County...,1011237,,38.171500,-88.602500,Dahlgren,HAMILTON COUNTY,IL,62828,ALLIANCE HOLDINGS GP LP,742289,FF
7390,2023,Who Dat Production Facility (Mississippi Canyo...,1010584,,28.245500,-89.005800,Offshore,Unknown,LA,0,LLOG EXPLORATION OFFSHORE LLC,64602,"C,W"
7393,2023,Wiggins Facility,1010300,,32.230000,-93.345300,Loggy Bayou,RED RIVER PARISH,LA,71051,ENERGY TRANSFER LP,56813,"C,W"
7493,2023,Yoakum Cryogenic Plant,1009093,,29.342100,-97.110000,Yoakum,LAVACA COUNTY,TX,77995,ENTERPRISE PRODUCTS PARTNERS LP,677366,"C,W"


In [23]:
# Drop unnecessary columns
emissions = emissions.drop(columns=['GHGRP ID','REPORTED ADDRESS','SUBPARTS'])


In [24]:
emissions['PARENT COMPANIES'].isna().value_counts()

PARENT COMPANIES
False    7510
True        1
Name: count, dtype: int64

In [25]:
emissions[emissions['PARENT COMPANIES'].isna()]



Unnamed: 0,REPORTING YEAR,FACILITY NAME,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e)
3717,2023,Kaiser-Francis Oil Company 430 Permian Basin G...,36.065483,-95.92207,Tulsa,TULSA COUNTY,OK,74136,,33494


In [26]:
emissions['PARENT COMPANIES'].fillna('Unknown', inplace=True)
emissions['PARENT COMPANIES'].isna().value_counts()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  emissions['PARENT COMPANIES'].fillna('Unknown', inplace=True)


PARENT COMPANIES
False    7511
Name: count, dtype: int64

In [27]:
emissions.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Emissions.csv')

## Loading the Electric Power Reliability Metrics Using IEEE of Distribution Systems by State for 2023 and 2022

Source: U.S. Energy Information Administration, Form EIA-861, Annual Electric Power Industry Report.

In [28]:
reliability_distribution = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Power_Reliability_Metrics_Distribution_System.csv")
reliability_distribution.head()


Unnamed: 0,Census Division,State,Percent of Customers Reported w/ Major Events (2023),Percent of Customers Reported w/ Major Events (2022),SAIDI w/ Major Events (2023),SAIDI w/ Major Events (2022),SAIFI w/ Major Events (2023),SAIFI w/ Major Events (2022),CAIDI w/ Major Events (2023),CAIDI w/ Major Events (2022),SAIDI w/o Major Events (2023),SAIDI w/o Major Events (2022),SAIFI w/o Major Events (2023),SAIFI w/o Major Events (2022),CAIDI w/o Major Events (2023),CAIDI w/o Major Events (2022)
0,New England,Connecticut,100.6%,100.2%,164.6,157.4,0.9,0.8,188.8,191.8,70.3,66.1,0.7,0.6,107.3,106.4
1,New England,Maine,100.8%,102.7%,1863.0,963.6,3.3,3.0,562.2,321.1,247.4,216.7,2.0,2.0,125.0,110.8
2,New England,Massachusetts,88.2%,93.1%,259.4,163.2,1.1,1.1,245.4,151.0,82.2,83.0,0.8,0.9,105.4,93.8
3,New England,New Hampshire,99.3%,99.3%,645.8,616.7,1.8,1.8,368.5,345.9,124.6,107.5,1.0,0.9,126.1,115.6
4,New England,Rhode Island,98.3%,98.2%,104.6,81.9,0.8,1.0,133.7,85.4,52.2,63.2,0.7,0.8,78.3,78.2


- SAIDI = System Average Interruption Duration Index. It is the minutes of non-momentary electric interruptions, per year, the average customer experienced.

- SAIFI = System Average Interruption Frequency Index. It is the number of non-momentary electric interruptions, per year, the average customer experienced.

- CAIDI = Customer Average Interruption Duration Index. It is average number of minutes it takes to restore non-momentary electric interruptions.
- IEEE refers to the IEEE 1366-2003 or the IEEE 1366-2012 standard.

- A Major Event Day is any day that exceeds a daily SAIDI threshold called Tmed. Tmed is a duration statistic calculated from daily SAIDI values from the past five years.

- Loss of Supply Removed excludes outages due to loss of supply from the high-voltage/bulk power system.

- Percent of Customers Reported is an estimate of the percentage of total customers covered by these metrics. The numerator is reported number of meters used on the reliability schedule, while the denominator is the number of customers reported on the sales to ultimate customers schedule. It is possible, in some instances, for this metric to exceed 100%.

In [29]:
reliability_distribution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 16 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Census Division                                       51 non-null     object 
 1   State                                                 51 non-null     object 
 2   Percent of Customers Reported w/ Major Events (2023)  51 non-null     object 
 3   Percent of Customers Reported w/ Major Events (2022)  51 non-null     object 
 4   SAIDI w/ Major Events (2023)                          51 non-null     object 
 5   SAIDI w/ Major Events (2022)                          51 non-null     object 
 6   SAIFI w/ Major Events (2023)                          51 non-null     float64
 7   SAIFI w/ Major Events (2022)                          51 non-null     float64
 8   CAIDI w/ Major Events (2023)                          51 non-n

In [30]:
# replace state names with abbreviations with the dictionary below:

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "Virgin Islands, U.S.": "VI",
}

In [31]:
reliability_distribution['State'] = reliability_distribution['State'].replace(us_state_to_abbrev)
reliability_distribution['State'].value_counts()


State
CT    1
AZ    1
VA    1
WV    1
AL    1
KY    1
MS    1
TN    1
AR    1
LA    1
OK    1
TX    1
CO    1
NC    1
ID    1
MT    1
NV    1
NM    1
UT    1
WY    1
CA    1
OR    1
WA    1
AK    1
SC    1
MD    1
ME    1
OH    1
MA    1
NH    1
RI    1
VT    1
NJ    1
NY    1
PA    1
IL    1
IN    1
MI    1
WI    1
GA    1
IA    1
KS    1
MN    1
MO    1
NE    1
ND    1
SD    1
DE    1
DC    1
FL    1
HI    1
Name: count, dtype: int64

In [32]:
reliability_distribution.columns = reliability_distribution.columns.str.strip()

In [33]:
reliability_distribution = reliability_distribution.drop(columns=['Percent of Customers Reported w/ Major Events (2022)',
                                                                  'SAIDI w/ Major Events (2022)',
                                                                  'SAIFI w/ Major Events (2022)',
                                                                  'CAIDI w/ Major Events (2022)',
                                                                  'SAIDI w/o Major Events (2022)',
                                                                  'SAIFI w/o Major Events (2022)',
                                                                  'CAIDI w/o Major Events (2022)'])
reliability_distribution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 9 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Census Division                                       51 non-null     object 
 1   State                                                 51 non-null     object 
 2   Percent of Customers Reported w/ Major Events (2023)  51 non-null     object 
 3   SAIDI w/ Major Events (2023)                          51 non-null     object 
 4   SAIFI w/ Major Events (2023)                          51 non-null     float64
 5   CAIDI w/ Major Events (2023)                          51 non-null     float64
 6   SAIDI w/o Major Events (2023)                         51 non-null     float64
 7   SAIFI w/o Major Events (2023)                         51 non-null     float64
 8   CAIDI w/o Major Events (2023)                         51 non-nu

In [34]:
cols_to_strip = reliability_distribution.columns[[3, 4, 5, 6, 7, 8]]
cols_to_strip

Index(['SAIDI w/ Major Events (2023)', 'SAIFI w/ Major Events (2023)',
       'CAIDI w/ Major Events (2023)', 'SAIDI w/o Major Events (2023)',
       'SAIFI w/o Major Events (2023)', 'CAIDI w/o Major Events (2023)'],
      dtype='object')

In [35]:
reliability_distribution = reliability_distribution.rename(columns={'SAIDI w/ Major Events (2023)':'SAIDI w/ Major Events',
                                                                    'SAIFI w/ Major Events (2023)': 'SAIFI w/ Major Events',
                                                                    'CAIDI w/ Major Events (2023)':'CAIDI w/ Major Events',
                                                                    'SAIDI w/o Major Events (2023)':'SAIDI w/o Major Events',
                                                                    'SAIFI w/o Major Events (2023)':'SAIFI w/o Major Events',
                                                                    'CAIDI w/o Major Events (2023)':'CAIDI w/o Major Events',
                                                                    'Percent of Customers Reported w/ Major Events (2023)':'Percent of Customers Reported w/ Major Events'})
reliability_distribution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Census Division                                51 non-null     object 
 1   State                                          51 non-null     object 
 2   Percent of Customers Reported w/ Major Events  51 non-null     object 
 3   SAIDI w/ Major Events                          51 non-null     object 
 4   SAIFI w/ Major Events                          51 non-null     float64
 5   CAIDI w/ Major Events                          51 non-null     float64
 6   SAIDI w/o Major Events                         51 non-null     float64
 7   SAIFI w/o Major Events                         51 non-null     float64
 8   CAIDI w/o Major Events                         51 non-null     float64
dtypes: float64(5), object(4)
memory usage: 3.7+ KB


In [36]:
reliability_distribution['Percent of Customers Reported w/ Major Events']

0     100.6%
1     100.8%
2      88.2%
3      99.3%
4      98.3%
5      87.0%
6     101.0%
7      97.7%
8      94.4%
9      97.8%
10     91.1%
11     97.4%
12     96.5%
13     92.5%
14     85.2%
15     82.5%
16     88.7%
17     87.6%
18     73.6%
19     89.0%
20     74.3%
21     89.0%
22     98.9%
23    100.2%
24     89.5%
25     98.7%
26     96.1%
27     95.6%
28     96.8%
29     97.8%
30     80.0%
31     96.6%
32     83.6%
33     87.0%
34     89.3%
35     93.6%
36     90.0%
37     96.9%
38     96.7%
39     91.8%
40     94.9%
41     75.0%
42    102.7%
43     90.1%
44     88.4%
45     75.9%
46    100.2%
47     92.9%
48     95.1%
49     84.5%
50     99.9%
Name: Percent of Customers Reported w/ Major Events, dtype: object

In [37]:
# Remove the % sign
reliability_distribution['Percent of Customers Reported w/ Major Events'] = reliability_distribution['Percent of Customers Reported w/ Major Events'].str[:-1]

In [38]:
# Convert to float
reliability_distribution['Percent of Customers Reported w/ Major Events'] = reliability_distribution['Percent of Customers Reported w/ Major Events'].astype(float)
reliability_distribution.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Census Division                                51 non-null     object 
 1   State                                          51 non-null     object 
 2   Percent of Customers Reported w/ Major Events  51 non-null     float64
 3   SAIDI w/ Major Events                          51 non-null     object 
 4   SAIFI w/ Major Events                          51 non-null     float64
 5   CAIDI w/ Major Events                          51 non-null     float64
 6   SAIDI w/o Major Events                         51 non-null     float64
 7   SAIFI w/o Major Events                         51 non-null     float64
 8   CAIDI w/o Major Events                         51 non-null     float64
dtypes: float64(6), object(3)
memory usage: 3.7+ KB


In [39]:
# Remove commas and convert to float

reliability_distribution['SAIDI w/ Major Events'] = reliability_distribution['SAIDI w/ Major Events'].str.replace(',', '').astype(float)
reliability_distribution['SAIDI w/ Major Events']

0      164.6
1     1863.0
2      259.4
3      645.8
4      104.6
5      744.4
6      108.3
7      120.0
8      252.7
9      208.2
10     455.4
11    1093.6
12     366.2
13     183.0
14     104.9
15     283.1
16     126.4
17     371.8
18      72.6
19     208.6
20      77.0
21     108.2
22      71.9
23     160.1
24     349.2
25     166.2
26     252.0
27     167.2
28     221.1
29     751.5
30     294.2
31     868.2
32     802.1
33     857.9
34     911.2
35     584.2
36     896.6
37     496.2
38     106.8
39      99.2
40     137.5
41     118.7
42     138.7
43     168.9
44     127.1
45     127.0
46     346.6
47     126.9
48     151.7
49     374.8
50     491.8
Name: SAIDI w/ Major Events, dtype: float64

In [40]:
reliability_distribution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Census Division                                51 non-null     object 
 1   State                                          51 non-null     object 
 2   Percent of Customers Reported w/ Major Events  51 non-null     float64
 3   SAIDI w/ Major Events                          51 non-null     float64
 4   SAIFI w/ Major Events                          51 non-null     float64
 5   CAIDI w/ Major Events                          51 non-null     float64
 6   SAIDI w/o Major Events                         51 non-null     float64
 7   SAIFI w/o Major Events                         51 non-null     float64
 8   CAIDI w/o Major Events                         51 non-null     float64
dtypes: float64(7), object(2)
memory usage: 3.7+ KB


The key column it will be joined on is State column

In [41]:
reliability_distribution.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Reliability_Distribution.csv')

# Load the Energy Efficiency Data


In [42]:
efficiency_df = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Energy_Efficiency_2023_clean.csv")
efficiency_df.head()


Unnamed: 0,Data Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Industrial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Industrial Peak Demand Annual Savings (MW),...,Industrial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Industrial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Industrial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Industrial Weighted Avg Life (Yrs)
0,2023,162.0,Aiken Electric Coop Inc,SC,SC,.,.,206,.,.,...,.,37,.,.,.,.,.,348,.,.
1,2023,189.0,PowerSouth Energy Cooperative,AL,SOCO,.,.,2550,.,.,...,.,526,.,.,848,.,.,526,.,.
2,2023,195.0,Alabama Power Co,AL,SOCO,759,.,5767,0.2,.,...,.,1095,36,.,1225,151,.,1095,5.000,.
3,2023,207.0,Alameda Municipal Power,CA,CISO,111,0,146,0.0,0.0,...,0,321,23,0,70,146,0,321,13.269,0.000
4,2023,295.0,City of Alexandria - (MN),MN,MISO,673,2386,3306,0.1,0.5,...,134,194,36,129,187,38,134,194,12.727,12.727


## Important features:
- Peak Demand Savings: reduction in power demand, grid capacity, reliability
- Annual Savings: overall energy efficiency
- Weighted Average Life: sustainability/longevity of efficiency measures
- BA Code: Balancing Authority Code 

In [43]:
efficiency_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 31 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Data Year                                           458 non-null    object 
 1   Utility Number                                      457 non-null    float64
 2   Utility Name                                        457 non-null    object 
 3   State                                               457 non-null    object 
 4   BA Code                                             451 non-null    object 
 5   Commercial Annual Savings (MWh)                     457 non-null    object 
 6   Industrial Annual Savings (MWh)                     457 non-null    object 
 7   Total Annual Savings (MWh)                          457 non-null    object 
 8   Commercial Peak Demand Annual Savings (MW)          457 non-null    object 
 9  

In [44]:
import numpy as np
# Renaming Data Year column
efficiency_df = efficiency_df.rename(columns={'Data Year':'Year'})
# Replace '.' with NA
efficiency_df = efficiency_df.replace('.', np.nan)

In [45]:
efficiency_df.tail(5)


Unnamed: 0,Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Industrial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Industrial Peak Demand Annual Savings (MW),...,Industrial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Industrial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Industrial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Industrial Weighted Avg Life (Yrs)
453,2023,60631.0,Upper Michigan Energy Resources Corp.,MI,MISO,,,,,,...,,,,,,,,,,
454,2023,60868.0,Redwood Coast Energy Authority,CA,CISO,120.0,22.0,170.0,0.0,0.0,...,32.0,243.0,120.0,22.0,188.0,167.0,32.0,243.0,12.028,12.0
455,2023,61858.0,San Jose Clean Energy,CA,CISO,2582.0,,2611.0,0.5,,...,,781.0,856.0,,908.0,657.0,,781.0,0.1,
456,2023,65352.0,Umpqua Indian Utility Cooperative,OR,BPAT,67.0,,67.0,,,...,,28.0,40.0,,40.0,,,,10.0,
457,NM - Data is Not Meaningful.,,,,,,,,,,...,,,,,,,,,,


In [46]:
# Drop the last row with no data
efficiency_df = efficiency_df.drop(index=457)
efficiency_df.tail(5)



Unnamed: 0,Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Industrial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Industrial Peak Demand Annual Savings (MW),...,Industrial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Industrial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Industrial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Industrial Weighted Avg Life (Yrs)
452,2023,59118.0,NJ Clean Energy Program,NJ,PJM,46406.0,,60544.0,13.8,,...,,18605.0,24006.0,,63569.0,7457.0,,18605.0,15.45,
453,2023,60631.0,Upper Michigan Energy Resources Corp.,MI,MISO,,,,,,...,,,,,,,,,,
454,2023,60868.0,Redwood Coast Energy Authority,CA,CISO,120.0,22.0,170.0,0.0,0.0,...,32.0,243.0,120.0,22.0,188.0,167.0,32.0,243.0,12.028,12.0
455,2023,61858.0,San Jose Clean Energy,CA,CISO,2582.0,,2611.0,0.5,,...,,781.0,856.0,,908.0,657.0,,781.0,0.1,
456,2023,65352.0,Umpqua Indian Utility Cooperative,OR,BPAT,67.0,,67.0,,,...,,28.0,40.0,,40.0,,,,10.0,


In [47]:
# Convert Year column to int
efficiency_df['Year'] = efficiency_df['Year'].astype(int)
efficiency_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 31 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Year                                                457 non-null    int64  
 1   Utility Number                                      457 non-null    float64
 2   Utility Name                                        457 non-null    object 
 3   State                                               457 non-null    object 
 4   BA Code                                             451 non-null    object 
 5   Commercial Annual Savings (MWh)                     385 non-null    object 
 6   Industrial Annual Savings (MWh)                     249 non-null    object 
 7   Total Annual Savings (MWh)                          448 non-null    object 
 8   Commercial Peak Demand Annual Savings (MW)          373 non-null    object 
 9  

In [48]:
# Convert Utility Number column to integer type
efficiency_df['Utility Number'] = efficiency_df['Utility Number'].astype(int)
efficiency_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 31 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   Year                                                457 non-null    int64 
 1   Utility Number                                      457 non-null    int64 
 2   Utility Name                                        457 non-null    object
 3   State                                               457 non-null    object
 4   BA Code                                             451 non-null    object
 5   Commercial Annual Savings (MWh)                     385 non-null    object
 6   Industrial Annual Savings (MWh)                     249 non-null    object
 7   Total Annual Savings (MWh)                          448 non-null    object
 8   Commercial Peak Demand Annual Savings (MW)          373 non-null    object
 9   Industrial

In [49]:
efficiency_df['Utility Name'].value_counts()


Utility Name
Tennessee Valley Authority           7
PacifiCorp                           5
Municipal Energy Agency of NE        4
Wabash Valley Power Assn, Inc        4
Southwestern Electric Power Co       3
                                    ..
Jackson County Rural E M C - (IN)    1
Interstate Power and Light Co        1
Illinois Municipal Elec Agency       1
AES Indiana                          1
Umpqua Indian Utility Cooperative    1
Name: count, Length: 421, dtype: int64

Power Reliability dataset and Efficiency datasets both share utility names and states

In [50]:
efficiency_df['BA Code'].isna().value_counts()

BA Code
False    451
True       6
Name: count, dtype: int64

In [51]:
efficiency_df[efficiency_df['BA Code'].isna()]

Unnamed: 0,Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Industrial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Industrial Peak Demand Annual Savings (MW),...,Industrial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Industrial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Industrial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Industrial Weighted Avg Life (Yrs)
107,2023,6616,Fort Pierce Utilities Authority,FL,,,,120,,,...,,2,,,25,,,2,,
117,2023,7353,Golden Valley Elec Assn Inc,AK,,,,15,,,...,,2,,,0,,,13,,
161,2023,10071,Kauai Island Utility Cooperative,HI,,679.0,6.0,2450,0.6,0.0,...,0.0,97,114.0,1.0,413,27.0,0.0,97,14.27,0.13
163,2023,10376,Kissimmee Utility Authority,FL,,141.0,0.0,357,79.9,0.0,...,0.0,0,9.0,0.0,94,7.0,0.0,246,12.044,0.0
290,2023,15783,City of Redding - (CA),CA,,0.0,0.0,6,0.0,0.0,...,,117,,,309,,,117,0.0,0.0
450,2023,58855,Hawaii Energy Efficiency Program,HI,,43594.0,,70519,5.8,,...,,10003,8285.0,,16721,4919.0,,10003,12.7,


In [52]:
# Rename Nan values in BA Code column
efficiency_df['BA Code'] = efficiency_df['BA Code'].fillna('Unknown')
efficiency_df['BA Code'].isna().value_counts()


BA Code
False    457
Name: count, dtype: int64

Exploring missing values

In [53]:
efficiency_df.isna().value_counts()

Year   Utility Number  Utility Name  State  BA Code  Commercial Annual Savings (MWh)  Industrial Annual Savings (MWh)  Total Annual Savings (MWh)  Commercial Peak Demand Annual Savings (MW)  Industrial Peak Demand Annual Savings (MW)  Total Peak Demand Annual Savings (MW)  Commercial Cum Savings (MWh)  Industrial Cum Savings (MWh)  Total Cum Savings (MWh)  Commercial Cum Peak Demand Savings (MWh)  Industrial Cum Peak Demand Savings (MWh)  Total Cum Peak Demand Savings (MWh)  Commercial Annual Incent Cost (Thousand $)  Industrial Annual Incent Cost (Thousand $)  Total Annual Incent Cost (Thousand $)  Commercial Non-Incentive Annual Costs (Thousand $)  Industrial Non-Incentive Annual Costs (Thousand $)  Total Non-Incentive Annual Costs (Thousand $)  Commercial Incentive Cum Costs (Thousand $)  Industrial Incentive Cum Costs (Thousand $)  Total Incentive Cum Costs (Thousand $)  Commercial Non-Incentive Cum Costs (Thousand $)  Industrial Non-Incentive Cum Costs (Thousand $)  Total Non-Ince

In [54]:
efficiency_df['Industrial Annual Savings (MWh)'].isna().value_counts()

Industrial Annual Savings (MWh)
False    249
True     208
Name: count, dtype: int64

In [55]:
efficiency_df['Industrial Cum Savings (MWh)'].isna().value_counts()

Industrial Cum Savings (MWh)
False    247
True     210
Name: count, dtype: int64

In [56]:
efficiency_df['Industrial Annual Incent Cost (Thousand $)'].isna().value_counts()

Industrial Annual Incent Cost (Thousand $)
False    242
True     215
Name: count, dtype: int64

In [57]:
efficiency_df.columns

Index(['Year', 'Utility Number', 'Utility Name', 'State', 'BA Code',
       'Commercial Annual Savings (MWh)', 'Industrial Annual Savings (MWh)',
       'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Industrial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Industrial Cum Savings (MWh)', 'Total Cum Savings (MWh)',
       'Commercial Cum Peak Demand Savings (MWh)',
       'Industrial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Industrial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Industrial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Industrial Incentive Cu

In [58]:
# Drop the Industrial Annual Savings column, half of the values are missing and we will just prioritize Commercial buildings
efficiency_df = efficiency_df.drop(columns=['Industrial Annual Savings (MWh)', 
                                            'Industrial Peak Demand Annual Savings (MW)', 
                                            'Industrial Cum Savings (MWh)',
                                            'Industrial Cum Peak Demand Savings (MWh)',
                                            'Industrial Annual Incent Cost (Thousand $)',
                                            'Industrial Non-Incentive Annual Costs (Thousand $)',
                                            'Industrial Incentive Cum Costs (Thousand $)',
                                            'Industrial Non-Incentive Cum Costs (Thousand $)',
                                            'Industrial Weighted Avg Life (Yrs)'
                                            ])

In [59]:
efficiency_df.columns

Index(['Year', 'Utility Number', 'Utility Name', 'State', 'BA Code',
       'Commercial Annual Savings (MWh)', 'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Total Cum Savings (MWh)', 'Commercial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Total Incentive Cum Costs (Thousand $)',
       'Commercial Non-Incentive Cum Costs (Thousand $)',
       'Total Non-Incentive Cum Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)'],
      dtype='object')

In [60]:
efficiency_df['Total Annual Savings (MWh)'].isna().value_counts()

Total Annual Savings (MWh)
False    448
True       9
Name: count, dtype: int64

In [61]:
efficiency_df['Commercial Cum Savings (MWh)'].isna().value_counts()

Commercial Cum Savings (MWh)
False    382
True      75
Name: count, dtype: int64

In [62]:
efficiency_df['Total Cum Peak Demand Savings (MWh)'].isna().value_counts()

Total Cum Peak Demand Savings (MWh)
False    433
True      24
Name: count, dtype: int64

In [63]:
efficiency_df['Commercial Cum Peak Demand Savings (MWh)'].isna().value_counts()


Commercial Cum Peak Demand Savings (MWh)
False    371
True      86
Name: count, dtype: int64

In [64]:
efficiency_df

Unnamed: 0,Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),...,Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
0,2023,162,Aiken Electric Coop Inc,SC,SC,,206,,0.0,,...,0.0,,,,37,,,,348,
1,2023,189,PowerSouth Energy Cooperative,AL,SOCO,,2550,,2.1,,...,2.1,,848,,526,,848,,526,
2,2023,195,Alabama Power Co,AL,SOCO,759,5767,0.2,11.9,3797,...,11.9,36,1225,151,1095,36,1225,151,1095,5.000
3,2023,207,Alameda Municipal Power,CA,CISO,111,146,0.0,0.1,1460,...,0.1,23,70,146,321,23,70,146,321,13.269
4,2023,295,City of Alexandria - (MN),MN,MISO,673,3306,0.1,0.7,8564,...,0.7,36,187,38,194,36,187,38,194,12.727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,2023,59118,NJ Clean Energy Program,NJ,PJM,46406,60544,13.8,15.6,716989,...,15.6,24006,63569,7457,18605,24006,63569,7457,18605,15.450
453,2023,60631,Upper Michigan Energy Resources Corp.,MI,MISO,,,,,,...,,,,,,,,,,
454,2023,60868,Redwood Coast Energy Authority,CA,CISO,120,170,0.0,0.0,205,...,0.0,120,188,167,243,120,188,167,243,12.028
455,2023,61858,San Jose Clean Energy,CA,CISO,2582,2611,0.5,0.5,17838,...,0.5,856,908,657,781,856,908,657,781,0.100


In [65]:
efficiency_df = efficiency_df.dropna()

efficiency_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 328 entries, 2 to 455
Data columns (total 22 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   Year                                                328 non-null    int64 
 1   Utility Number                                      328 non-null    int64 
 2   Utility Name                                        328 non-null    object
 3   State                                               328 non-null    object
 4   BA Code                                             328 non-null    object
 5   Commercial Annual Savings (MWh)                     328 non-null    object
 6   Total Annual Savings (MWh)                          328 non-null    object
 7   Commercial Peak Demand Annual Savings (MW)          328 non-null    object
 8   Total Peak Demand Annual Savings (MW)               328 non-null    object
 9   Commercial Cum 

In [66]:
efficiency_df.columns

Index(['Year', 'Utility Number', 'Utility Name', 'State', 'BA Code',
       'Commercial Annual Savings (MWh)', 'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Total Cum Savings (MWh)', 'Commercial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Total Incentive Cum Costs (Thousand $)',
       'Commercial Non-Incentive Cum Costs (Thousand $)',
       'Total Non-Incentive Cum Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)'],
      dtype='object')

In [67]:
#Drop the year column
efficiency_df = efficiency_df.drop(columns=['Year'])
efficiency_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 328 entries, 2 to 455
Data columns (total 21 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   Utility Number                                      328 non-null    int64 
 1   Utility Name                                        328 non-null    object
 2   State                                               328 non-null    object
 3   BA Code                                             328 non-null    object
 4   Commercial Annual Savings (MWh)                     328 non-null    object
 5   Total Annual Savings (MWh)                          328 non-null    object
 6   Commercial Peak Demand Annual Savings (MW)          328 non-null    object
 7   Total Peak Demand Annual Savings (MW)               328 non-null    object
 8   Commercial Cum Savings (MWh)                        328 non-null    object
 9   Total Cum Savin

In [68]:
# Convert columns into numeric and float

numerics = ['Commercial Annual Savings (MWh)', 'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Total Cum Savings (MWh)', 'Commercial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Total Incentive Cum Costs (Thousand $)',
       'Commercial Non-Incentive Cum Costs (Thousand $)',
       'Total Non-Incentive Cum Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)']

for col in numerics:
    efficiency_df[col] = efficiency_df[col].str.replace(',', '').astype(float)
efficiency_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 328 entries, 2 to 455
Data columns (total 21 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Utility Number                                      328 non-null    int64  
 1   Utility Name                                        328 non-null    object 
 2   State                                               328 non-null    object 
 3   BA Code                                             328 non-null    object 
 4   Commercial Annual Savings (MWh)                     328 non-null    float64
 5   Total Annual Savings (MWh)                          328 non-null    float64
 6   Commercial Peak Demand Annual Savings (MW)          328 non-null    float64
 7   Total Peak Demand Annual Savings (MW)               328 non-null    float64
 8   Commercial Cum Savings (MWh)                        328 non-null    float64
 9   Tota

In [74]:
efficiency_df.corr(numeric_only=True)

Unnamed: 0,Year,Utility Number,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),Commercial Cum Peak Demand Savings (MWh),Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
Year,,,,,,,,,,,,,,,,,,,
Utility Number,,1.0,0.046751,0.021878,0.030373,0.025283,0.062115,0.053593,0.026258,0.017899,0.075628,0.148453,0.071138,0.083285,0.02458,0.089094,0.029634,0.046776,0.001019
Commercial Annual Savings (MWh),,0.046751,1.0,0.929777,0.32762,0.473145,0.973207,0.945135,0.325489,0.466796,0.722167,0.70188,0.814134,0.875345,0.578504,0.596846,0.643015,0.735935,0.049613
Total Annual Savings (MWh),,0.021878,0.929777,1.0,0.307792,0.473292,0.919809,0.952686,0.307855,0.470183,0.666943,0.639695,0.737366,0.812012,0.526458,0.537144,0.575551,0.675905,0.112121
Commercial Peak Demand Annual Savings (MW),,0.030373,0.32762,0.307792,1.0,0.970818,0.301421,0.293792,0.996078,0.968245,0.24,0.210127,0.251368,0.271805,0.185273,0.174277,0.192864,0.223305,0.016302
Total Peak Demand Annual Savings (MW),,0.025283,0.473145,0.473292,0.970818,1.0,0.452288,0.462315,0.966931,0.995101,0.303784,0.278101,0.359152,0.402326,0.233867,0.228877,0.273053,0.328253,0.037236
Commercial Cum Savings (MWh),,0.062115,0.973207,0.919809,0.301421,0.452288,1.0,0.965529,0.308166,0.452334,0.688883,0.669382,0.772162,0.833859,0.617799,0.631509,0.679438,0.762132,0.109055
Total Cum Savings (MWh),,0.053593,0.945135,0.952686,0.293792,0.462315,0.965529,1.0,0.297816,0.461589,0.686505,0.659218,0.721028,0.800292,0.569717,0.580639,0.596099,0.695021,0.119204
Commercial Cum Peak Demand Savings (MWh),,0.026258,0.325489,0.307855,0.996078,0.966931,0.308166,0.297816,1.0,0.971729,0.239803,0.210242,0.24752,0.270253,0.234876,0.220588,0.240041,0.266128,0.017142
Total Cum Peak Demand Savings (MWh),,0.017899,0.466796,0.470183,0.968245,0.995101,0.452334,0.461589,0.971729,1.0,0.301563,0.27581,0.351913,0.399291,0.269997,0.262354,0.306124,0.359657,0.036411


Drop highly correlated columns

In [69]:
efficiency_df = efficiency_df[['Utility Number', 'Utility Name', 'State', 'BA Code', 'Total Annual Savings (MWh)', 'Total Peak Demand Annual Savings (MW)', 'Total Annual Incent Cost (Thousand $)', 'Total Non-Incentive Annual Costs (Thousand $)', 'Commercial Weighted Avg Life (Yrs)']]
efficiency_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 328 entries, 2 to 455
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Utility Number                                 328 non-null    int64  
 1   Utility Name                                   328 non-null    object 
 2   State                                          328 non-null    object 
 3   BA Code                                        328 non-null    object 
 4   Total Annual Savings (MWh)                     328 non-null    float64
 5   Total Peak Demand Annual Savings (MW)          328 non-null    float64
 6   Total Annual Incent Cost (Thousand $)          328 non-null    float64
 7   Total Non-Incentive Annual Costs (Thousand $)  328 non-null    float64
 8   Commercial Weighted Avg Life (Yrs)             328 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 25.6+ KB


In [70]:
efficiency_df.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Efficiency.csv')

## Loading Electricity Demand by Region 2024 Data

In [69]:
edemand_df = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Electricity_Demand_by_Region_2024.csv")
edemand_df.head()



Unnamed: 0,Region Code,Timestamp (Hour Ending),CAL Demand (MWh),CAR Demand (MWh),CENT Demand (MWh),FLA Demand (MWh),MIDA Demand (MWh),MIDW Demand (MWh),NE Demand (MWh),NW Demand (MWh),NY Demand (MWh),SE Demand (MWh),SW Demand (MWh),TEN Demand (MWh),TEX Demand (MWh)
0,US48,"1/1/2024, Eastern Time",625536,621622,759462,526261,2148261,1833823,315183,994573,387760,625201,266989,467122,1083376
1,US48,"1/2/2024, Eastern Time",685735,706554,801892,593216,2354042,2016233,337332,1056483,423513,731575,270920,496786,1217182
2,US48,"1/3/2024, Eastern Time",701915,735424,798097,627980,2409787,2045289,340823,1067854,427709,786139,281432,548142,1169662
3,US48,"1/4/2024, Eastern Time",706456,692597,816406,593522,2428138,2064395,340090,1061712,430238,744147,288273,538419,1168857
4,US48,"1/5/2024, Eastern Time",700442,754904,814876,605782,2490515,2083237,354057,1069388,443452,727947,290707,542626,1123580


This data is organized by regions:
- CAL: California
- NW: Northwest > WA, OR, ID, MA, WY, AK
- CAR: Carolinas > NC, SC
- NY: New York
- SE: Southeast > GA, AL, MS, LA, AR
- FLA: Florida
- SW: Southwest > AZ, NM, NV, UT
- TN: Tennessee
- MIDW: Midwest > IL, IN,IA, MI, MN, OH,WI, MO
- TEX: Texas
- NE: Northeast > ME, VT, NH, MA, CT, RI, NJ, PA




In [70]:
edemand_df = pd.read_csv("/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Electricity_Demand_by_Region_2023.csv")
edemand_df.head()

Unnamed: 0,Region Code,Timestamp (Hour Ending),CAL Demand (MWh),CAR Demand (MWh),CENT Demand (MWh),FLA Demand (MWh),MIDA Demand (MWh),MIDW Demand (MWh),NE Demand (MWh),NW Demand (MWh),NY Demand (MWh),SE Demand (MWh),SW Demand (MWh),TEN Demand (MWh),TEX Demand (MWh)
0,US48,"1/1/2023, Eastern Time",621178,478148,644349,574546,1844295,1637233,273148,983651,357120,477850,255475,342521,899540
1,US48,"1/2/2023, Eastern Time",692440,519117,669956,595192,1939503,1710255,303550,1069779,377848,512066,269185,354751,970461
2,US48,"1/3/2023, Eastern Time",736465,530603,719373,621410,2040208,1819416,335429,1094423,404670,540743,275886,367694,965338
3,US48,"1/4/2023, Eastern Time",732759,524179,760561,642887,2025808,1866522,323809,1098302,392449,537773,266982,374897,981661
4,US48,"1/5/2023, Eastern Time",718498,538507,771291,608675,2104236,1952957,324424,1059004,395100,575151,271869,429229,1011497


In [71]:
edemand_df.tail()

Unnamed: 0,Region Code,Timestamp (Hour Ending),CAL Demand (MWh),CAR Demand (MWh),CENT Demand (MWh),FLA Demand (MWh),MIDA Demand (MWh),MIDW Demand (MWh),NE Demand (MWh),NW Demand (MWh),NY Demand (MWh),SE Demand (MWh),SW Demand (MWh),TEN Demand (MWh),TEX Demand (MWh)
360,US48,"12/27/2023, Eastern Time",698940,506141,787746,560573,2079566,1857454,314432,1047035,394055,569277,276845,422241,1131652
361,US48,"12/28/2023, Eastern Time",690881,545918,797687,560398,2084096,1926044,319065,1038616,389848,616299,271874,454805,1161632
362,US48,"12/29/2023, Eastern Time",680002,634698,785415,586583,2126024,1936995,313799,1014444,384132,675125,268698,477732,1180330
363,US48,"12/30/2023, Eastern Time",659629,651583,745495,611345,2128133,1862303,300851,967325,382382,671895,259653,468637,1120936
364,US48,"12/31/2023, Eastern Time",657024,650208,750189,609399,2107102,1827378,310484,973562,390049,675617,255519,442586,1060853


In [72]:
# Create a mapping of regions to their constituent states
region_to_state = {
    'CAL':['CA'],
    'NW': ['WA', 'OR', 'ID', 'MA', 'WY', 'AK'],
    'CAR': ['NC', 'SC'],
    'NY': ['NY'],
    'MIDA': ['PA', 'NJ', 'DE', 'MD', 'VA', 'DC'],
    'SE':['GA', 'AL', 'MS', 'LA', 'AR'],
    'FLA': ['FL'],
    'SW': ['AZ', 'NM', 'NV', 'UT'],
    'TN': ['TN'],
    'MIDW': ['IL', 'IN','IA', 'MI', 'MN', 'OH','WI'],
    'TEX': ['TX'],
    'NE': ['ME', 'VT', 'NH', 'MA', 'CT', 'RI'],
    'CENT': ['KS', 'NE', 'SD', 'ND','OK', 'MO']
}

In [73]:
# Set the date as index
edemand_df['Date'] = pd.to_datetime(edemand_df['Timestamp (Hour Ending)'].str.split(',').str[0])

edemand_df['Date']


0     2023-01-01
1     2023-01-02
2     2023-01-03
3     2023-01-04
4     2023-01-05
         ...    
360   2023-12-27
361   2023-12-28
362   2023-12-29
363   2023-12-30
364   2023-12-31
Name: Date, Length: 365, dtype: datetime64[ns]

In [74]:
# Convert from wide to long format by melting
demand_melted = edemand_df.melt(
    id_vars=['Date', 'Region Code'],
    value_vars=[col for col in edemand_df.columns if 'Demand' in col],
    var_name='Region',
    value_name='Demand_MWh'
)
demand_melted.head()

Unnamed: 0,Date,Region Code,Region,Demand_MWh
0,2023-01-01,US48,CAL Demand (MWh),621178
1,2023-01-02,US48,CAL Demand (MWh),692440
2,2023-01-03,US48,CAL Demand (MWh),736465
3,2023-01-04,US48,CAL Demand (MWh),732759
4,2023-01-05,US48,CAL Demand (MWh),718498


In [75]:
demand_melted['Region'].value_counts()

Region
CAL Demand (MWh)     365
CAR Demand (MWh)     365
CENT Demand (MWh)    365
FLA Demand (MWh)     365
MIDA Demand (MWh)    365
MIDW Demand (MWh)    365
NE Demand (MWh)      365
NW Demand (MWh)      365
NY Demand (MWh)      365
SE Demand (MWh)      365
SW Demand (MWh)      365
TEN Demand (MWh)     365
TEX Demand (MWh)     365
Name: count, dtype: int64

In [76]:
# Strip the extra characters to just leave Region
demand_melted['Region'] = demand_melted['Region'].str[:-12]
demand_melted['Region'].value_counts()


Region
CAL      365
CAR      365
CENT     365
FLA      365
MIDA     365
MIDW     365
NE       365
NW       365
NY       365
SE       365
SW       365
TEN      365
TEX      365
Name: count, dtype: int64

In [77]:
# Strip any whitespace
demand_melted['Region'] = demand_melted['Region'].str.strip()
demand_melted['Region'].value_counts()


Region
CAL     365
CAR     365
CENT    365
FLA     365
MIDA    365
MIDW    365
NE      365
NW      365
NY      365
SE      365
SW      365
TEN     365
TEX     365
Name: count, dtype: int64

In [78]:
# Convert to object type
demand_melted['Region'] = demand_melted['Region'].astype(object)
demand_melted.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4745 entries, 0 to 4744
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         4745 non-null   datetime64[ns]
 1   Region Code  4745 non-null   object        
 2   Region       4745 non-null   object        
 3   Demand_MWh   4745 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 148.4+ KB


In [85]:
#Expand regions to states
demand_by_state = []
for _, row in demand_melted.iterrows():
    region = row['Region']
    if region in region_to_state:
        for state in region_to_state[region]:
            demand_by_state.append({
                'Date': row['Date'],
                'Region': region,
                'State': state,
                'Region_Demand_MWh': row['Demand_MWh']
            })


In [87]:
regional_demand = pd.DataFrame(demand_by_state)
regional_demand.head()

Unnamed: 0,Date,Region,State,Region_Demand_MWh
0,2023-01-01,CAL,CA,621178
1,2023-01-02,CAL,CA,692440
2,2023-01-03,CAL,CA,736465
3,2023-01-04,CAL,CA,732759
4,2023-01-05,CAL,CA,718498


In [88]:
regional_demand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16790 entries, 0 to 16789
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               16790 non-null  datetime64[ns]
 1   Region             16790 non-null  object        
 2   State              16790 non-null  object        
 3   Region_Demand_MWh  16790 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 524.8+ KB


In [92]:
regional_demand.columns

Index(['Date', 'Region', 'State', 'Region_Demand_MWh'], dtype='object')

In [93]:
regional_demand['Region_Demand_MWh'].value_counts()

Region_Demand_MWh
1810431    14
724370     12
260062     12
727436     12
714287     12
           ..
560398      1
560573      1
575811      1
548169      1
1060853     1
Name: count, Length: 4368, dtype: int64

In [98]:
regional_demand['Annual_Region_Demand_MWh'] = regional_demand.groupby('State')['Region_Demand_MWh'].transform('mean')

In [101]:
regional_demand = regional_demand.rename(columns = {'Region_Demand_MWh':'Daily_Rgnl_Demand_MWh'})
regional_demand.head()

Unnamed: 0,Date,Region,State,Daily_Rgnl_Demand_MWh,Annual_Region_Demand_MWh
0,2023-01-01,CAL,CA,621178,726482.230137
1,2023-01-02,CAL,CA,692440,726482.230137
2,2023-01-03,CAL,CA,736465,726482.230137
3,2023-01-04,CAL,CA,732759,726482.230137
4,2023-01-05,CAL,CA,718498,726482.230137


In [102]:
regional_demand = regional_demand.drop(columns=['Date', 'Daily_Rgnl_Demand_MWh'])

regional_demand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16790 entries, 0 to 16789
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Region                    16790 non-null  object 
 1   State                     16790 non-null  object 
 2   Annual_Region_Demand_MWh  16790 non-null  float64
dtypes: float64(1), object(2)
memory usage: 393.6+ KB


In [103]:
regional_demand.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Regional_Demand_2023.csv')