# Joining the Data

It is now time for the scary part. But we never back away from a challenge! Let's say a prayer and get ready to join the data.

In [1]:
import pandas as pd
import numpy as np

### Loading in Utility Scale Net Generation Data and Reliability of Distribution Systems Date 

In [2]:
net_gen = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Utility_Scale_Net_Generation.csv', index_col=0)

net_gen.head()



Unnamed: 0,State,% Chg All Facilities Generation,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status
0,CT,9.4,Moderate,Low,High,Moderate,Low,Growing
1,ME,2.8,Low,Low,Moderate,Low,Moderate,Growing
2,MA,6.4,Low,Low,Moderate,High,Low,Growing
3,NH,22.9,Low,Low,Moderate,Low,Low,Growing
4,RI,-0.5,Low,Low,Moderate,Low,Low,Shrinking


In [3]:
net_gen = net_gen.drop(columns=['% Chg All Facilities Generation'])

In [4]:
net_gen.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status
0,CT,Moderate,Low,High,Moderate,Low,Growing
1,ME,Low,Low,Moderate,Low,Moderate,Growing
2,MA,Low,Low,Moderate,High,Low,Growing
3,NH,Low,Low,Moderate,Low,Low,Growing
4,RI,Low,Low,Moderate,Low,Low,Shrinking


In [5]:
net_gen['State'].value_counts().sum()

51

In [6]:
reliability = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Reliability_Distribution.csv', index_col=0)

reliability.head()

Unnamed: 0,Census Division,State,Percent of Customers Reported w/ Major Events,SAIDI w/ Major Events,SAIFI w/ Major Events,CAIDI w/ Major Events,SAIDI w/o Major Events,SAIFI w/o Major Events,CAIDI w/o Major Events
0,New England,CT,100.6,164.6,0.9,188.8,70.3,0.7,107.3
1,New England,ME,100.8,1863.0,3.3,562.2,247.4,2.0,125.0
2,New England,MA,88.2,259.4,1.1,245.4,82.2,0.8,105.4
3,New England,NH,99.3,645.8,1.8,368.5,124.6,1.0,126.1
4,New England,RI,98.3,104.6,0.8,133.7,52.2,0.7,78.3


In [7]:
reliability.describe()

Unnamed: 0,Percent of Customers Reported w/ Major Events,SAIDI w/ Major Events,SAIFI w/ Major Events,CAIDI w/ Major Events,SAIDI w/o Major Events,SAIFI w/o Major Events,CAIDI w/o Major Events
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,91.984314,358.57451,1.386275,226.466667,125.866667,1.054902,116.519608
std,7.457758,346.660787,0.636873,127.799489,68.936497,0.434656,23.595296
min,73.6,71.9,0.4,76.2,33.3,0.3,70.8
25%,88.3,126.95,0.9,134.85,72.9,0.8,101.35
50%,93.6,208.6,1.1,186.9,117.1,0.9,118.0
75%,97.75,473.6,1.8,282.2,146.0,1.25,124.95
max,102.7,1863.0,3.3,723.5,385.7,2.4,206.2


The following columns in reliability dataset will be binned based on their wide distributions:
- 'SAIDI w/ Major Events'     # Wide range: 71.9 to 1863.0 minutes
- 'SAIFI w/ Major Events'     # Meaningful range: 0.4 to 3.3 interruptions
- 'SAIDI w/o Major Events'    # Significant spread: 33.3 to 385.7 minutes
- 'SAIFI w/o Major Events'    # Clear variation: 0.3 to 2.4 interruptions


In [8]:
# Binning SAIDI (Average Interruption Duration Index) with Major Events
conditions_saidi = [
    reliability['SAIDI w/ Major Events'] >= 473.6,  # 75th percentile
    (reliability['SAIDI w/ Major Events'] >= 208.6) & (reliability['SAIDI w/ Major Events'] < 473.6),  # 50th-75th
    reliability['SAIDI w/ Major Events'] < 208.6  # Below median
]
choices_saidi = ['Long Interruption Duration', 'Moderate Interruption Duration', 'Short Interruption Duration']
reliability['SAIDI Major Events'] = np.select(conditions_saidi, choices_saidi, default='Unknown')

# Bin SAIFI (Average Interruption Frequency Index) w/ Major Events
conditions_saifi = [
    reliability['SAIFI w/ Major Events'] > 1.8,  # 75th percentile
    (reliability['SAIFI w/ Major Events'] >= 1.1) & (reliability['SAIFI w/ Major Events'] <= 1.8),  # 50th-75th
    reliability['SAIFI w/ Major Events'] < 1.1  # Below median
]
choices_saifi = ['High Interruption Frequency', 'Moderate Interruption Frequency', 'Low Interruption Frequency']
reliability['SAIFI Major Events'] = np.select(conditions_saifi, choices_saifi, default='Unknown')

In [9]:
# Binning SAIDI (Average Interruption Duration Index) without  Major Events
conditions_saidi = [
    reliability['SAIDI w/o Major Events'] >= 146,  
    (reliability['SAIDI w/o Major Events'] >= 117) & (reliability['SAIDI w/o Major Events'] < 146), 
    reliability['SAIDI w/o Major Events'] < 117  
]
choices_saidi = ['Long Interruption Duration', 'Moderate Interruption Duration', 'Short Interruption Duration']
reliability['SAIDI No Major Events'] = np.select(conditions_saidi, choices_saidi, default='Unknown')

# Bin SAIFI (Average Interruption Frequency Index) w/o Major Events
conditions_saifi = [
    reliability['SAIFI w/o Major Events'] > 1.25,  
    (reliability['SAIFI w/o Major Events'] >= .9) & (reliability['SAIFI w/o Major Events'] <= 1.25), 
    reliability['SAIFI w/o Major Events'] < .9
]
choices_saifi = ['High Interruption Frequency', 'Moderate Interruption Frequency', 'Low Interruption Frequency']
reliability['SAIFI No Major Events'] = np.select(conditions_saifi, choices_saifi, default='Unknown') 

In [10]:
reliability.head()

Unnamed: 0,Census Division,State,Percent of Customers Reported w/ Major Events,SAIDI w/ Major Events,SAIFI w/ Major Events,CAIDI w/ Major Events,SAIDI w/o Major Events,SAIFI w/o Major Events,CAIDI w/o Major Events,SAIDI Major Events,SAIFI Major Events,SAIDI No Major Events,SAIFI No Major Events
0,New England,CT,100.6,164.6,0.9,188.8,70.3,0.7,107.3,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency
1,New England,ME,100.8,1863.0,3.3,562.2,247.4,2.0,125.0,Long Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency
2,New England,MA,88.2,259.4,1.1,245.4,82.2,0.8,105.4,Moderate Interruption Duration,Moderate Interruption Frequency,Short Interruption Duration,Low Interruption Frequency
3,New England,NH,99.3,645.8,1.8,368.5,124.6,1.0,126.1,Long Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency
4,New England,RI,98.3,104.6,0.8,133.7,52.2,0.7,78.3,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency


In [11]:
reliability = reliability.drop(columns=['SAIDI w/ Major Events', 'SAIFI w/ Major Events', 'SAIDI w/o Major Events', 'SAIFI w/o Major Events'])
reliability.head()

Unnamed: 0,Census Division,State,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIDI Major Events,SAIFI Major Events,SAIDI No Major Events,SAIFI No Major Events
0,New England,CT,100.6,188.8,107.3,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency
1,New England,ME,100.8,562.2,125.0,Long Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency
2,New England,MA,88.2,245.4,105.4,Moderate Interruption Duration,Moderate Interruption Frequency,Short Interruption Duration,Low Interruption Frequency
3,New England,NH,99.3,368.5,126.1,Long Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency
4,New England,RI,98.3,133.7,78.3,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency


In [12]:
reliability['State'].value_counts().sum()

51

In [13]:
# Conduct an outer join on the State column
net_gen_reliability = pd.merge(net_gen, reliability, on='State', how='outer')

net_gen_reliability.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIDI Major Events,SAIFI Major Events,SAIDI No Major Events,SAIFI No Major Events
0,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,137.6,Moderate Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency
1,AL,High,High,High,Low,High,Growing,East South Central,80.0,186.9,122.8,Moderate Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency
2,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,144.0,Long Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency
3,AZ,High,High,High,Moderate,Low,Growing,Mountain,96.7,108.6,84.8,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency
4,CA,High,High,High,High,High,Shrinking,Pacific Contiguous,100.2,248.1,140.0,Moderate Interruption Duration,Moderate Interruption Frequency,Long Interruption Duration,Moderate Interruption Frequency


In [14]:
net_gen_reliability.isna().value_counts()

State  Total_Generation_Rank  Electric_Generation_Rank  Independent_Generation_Rank  Commercial_Generation_Rank  Industrial_Generation_Rank  Capacity_Growth_Status  Census Division  Percent of Customers Reported w/ Major Events  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIDI Major Events  SAIFI Major Events  SAIDI No Major Events  SAIFI No Major Events
False  False                  False                     False                        False                       False                       False                   False            False                                          False                  False                   False               False               False                  False                    51
Name: count, dtype: int64

## Load in the Energy Efficiency data

In [15]:
energy_efficiency = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Efficiency.csv', index_col=0)

energy_efficiency.head()

Unnamed: 0,Year,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),...,Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
0,2023,162,Aiken Electric Coop Inc,SC,SC,1686.0,206.0,0.4,0.0,18796.5,...,0.0,162.0,443.0,118.0,37.0,190.0,487.0,145.0,348.0,12.574
1,2023,189,PowerSouth Energy Cooperative,AL,SOCO,1686.0,2550.0,0.4,2.1,18796.5,...,2.1,162.0,848.0,118.0,526.0,190.0,848.0,145.0,526.0,12.574
2,2023,195,Alabama Power Co,AL,SOCO,759.0,5767.0,0.2,11.9,3797.0,...,11.9,36.0,1225.0,151.0,1095.0,36.0,1225.0,151.0,1095.0,5.0
3,2023,207,Alameda Municipal Power,CA,CISO,111.0,146.0,0.0,0.1,1460.0,...,0.1,23.0,70.0,146.0,321.0,23.0,70.0,146.0,321.0,13.269
4,2023,295,City of Alexandria - (MN),MN,MISO,673.0,3306.0,0.1,0.7,8564.0,...,0.7,36.0,187.0,38.0,194.0,36.0,187.0,38.0,194.0,12.727


In [16]:
energy_efficiency.columns

Index(['Year', 'Utility Number', 'Utility Name', 'State', 'BA Code',
       'Commercial Annual Savings (MWh)', 'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Total Cum Savings (MWh)', 'Commercial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Total Incentive Cum Costs (Thousand $)',
       'Commercial Non-Incentive Cum Costs (Thousand $)',
       'Total Non-Incentive Cum Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)'],
      dtype='object')

In [17]:
# Drop the year and Utility number columns
energy_efficiency = energy_efficiency.drop(columns=['Year'])

energy_efficiency.head()

Unnamed: 0,Utility Number,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),...,Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
0,162,Aiken Electric Coop Inc,SC,SC,1686.0,206.0,0.4,0.0,18796.5,27204.0,...,0.0,162.0,443.0,118.0,37.0,190.0,487.0,145.0,348.0,12.574
1,189,PowerSouth Energy Cooperative,AL,SOCO,1686.0,2550.0,0.4,2.1,18796.5,28552.0,...,2.1,162.0,848.0,118.0,526.0,190.0,848.0,145.0,526.0,12.574
2,195,Alabama Power Co,AL,SOCO,759.0,5767.0,0.2,11.9,3797.0,45489.0,...,11.9,36.0,1225.0,151.0,1095.0,36.0,1225.0,151.0,1095.0,5.0
3,207,Alameda Municipal Power,CA,CISO,111.0,146.0,0.0,0.1,1460.0,1958.0,...,0.1,23.0,70.0,146.0,321.0,23.0,70.0,146.0,321.0,13.269
4,295,City of Alexandria - (MN),MN,MISO,673.0,3306.0,0.1,0.7,8564.0,43191.0,...,0.7,36.0,187.0,38.0,194.0,36.0,187.0,38.0,194.0,12.727


In [18]:
energy_efficiency.drop(columns=['Utility Number'], inplace=True)

energy_efficiency.head()

Unnamed: 0,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),Commercial Cum Peak Demand Savings (MWh),Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
0,Aiken Electric Coop Inc,SC,SC,1686.0,206.0,0.4,0.0,18796.5,27204.0,0.4,0.0,162.0,443.0,118.0,37.0,190.0,487.0,145.0,348.0,12.574
1,PowerSouth Energy Cooperative,AL,SOCO,1686.0,2550.0,0.4,2.1,18796.5,28552.0,0.4,2.1,162.0,848.0,118.0,526.0,190.0,848.0,145.0,526.0,12.574
2,Alabama Power Co,AL,SOCO,759.0,5767.0,0.2,11.9,3797.0,45489.0,0.2,11.9,36.0,1225.0,151.0,1095.0,36.0,1225.0,151.0,1095.0,5.0
3,Alameda Municipal Power,CA,CISO,111.0,146.0,0.0,0.1,1460.0,1958.0,0.0,0.1,23.0,70.0,146.0,321.0,23.0,70.0,146.0,321.0,13.269
4,City of Alexandria - (MN),MN,MISO,673.0,3306.0,0.1,0.7,8564.0,43191.0,0.1,0.7,36.0,187.0,38.0,194.0,36.0,187.0,38.0,194.0,12.727


In [19]:
energy_efficiency.describe()

Unnamed: 0,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),Commercial Cum Peak Demand Savings (MWh),Total Cum Peak Demand Savings (MWh),Commercial Annual Incent Cost (Thousand $),Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
count,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0,457.0
mean,24441.006565,53052.45,7.055142,12.647155,322991.7,592329.1,7.107221,12.415536,3904.929978,8442.730853,2489.074398,5583.444201,4684.387309,9676.682713,2958.343545,6423.2407,12.400501
std,75413.36269,162306.9,41.803702,53.838307,1010679.0,1833131.0,41.896196,53.788817,14333.723818,28248.946275,9763.534472,20155.953151,20126.649167,36695.365003,13411.016916,25486.23149,4.104284
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,337.0,528.0,0.1,0.2,4491.0,6296.0,0.1,0.2,36.0,66.0,31.0,58.0,40.0,83.0,37.0,62.0,11.35
50%,1686.0,2631.5,0.4,0.75,18796.5,30374.0,0.4,0.8,162.0,443.0,118.0,240.0,190.0,487.0,145.0,282.0,12.574
75%,10890.0,25662.0,2.2,5.9,141941.0,334790.0,2.1,5.9,1372.0,3338.0,605.0,1845.0,1430.0,3338.0,605.0,1962.0,14.0
max,756720.0,1718664.0,804.9,945.0,9244884.0,19875820.0,804.9,945.0,180076.0,372401.0,106954.0,247704.0,289844.0,495994.0,198028.0,334853.0,27.63


** Note that MW measures power capacity while MWh measures the amount of electricity delivered overtime

In [20]:
# Bin Total Annual Savings (MWh)
conditions_savings = [
    energy_efficiency['Total Annual Savings (MWh)'] >= energy_efficiency['Total Annual Savings (MWh)'].quantile(0.75),
    (energy_efficiency['Total Annual Savings (MWh)'] >= energy_efficiency['Total Annual Savings (MWh)'].quantile(0.25)) & 
    (energy_efficiency['Total Annual Savings (MWh)'] < energy_efficiency['Total Annual Savings (MWh)'].quantile(0.75)),
    energy_efficiency['Total Annual Savings (MWh)'] < energy_efficiency['Total Annual Savings (MWh)'].quantile(0.25)
]
choices_savings = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Annual Energy Savings (MWh)'] = np.select(conditions_savings, choices_savings, default='Unknown')

# Bin Peak Demand Savings
conditions_peak = [
    energy_efficiency['Total Peak Demand Annual Savings (MW)'] >= energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.75),
    (energy_efficiency['Total Peak Demand Annual Savings (MW)'] >= energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)) & 
    (energy_efficiency['Total Peak Demand Annual Savings (MW)'] < energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.75)),
    energy_efficiency['Total Peak Demand Annual Savings (MW)'] < energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)
]
choices_peak = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Peak Demand Savings (MW)'] = np.select(conditions_peak, choices_peak, default='Unknown')

In [21]:
energy_efficiency.head()

Unnamed: 0,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),Commercial Cum Peak Demand Savings (MWh),...,Total Annual Incent Cost (Thousand $),Commercial Non-Incentive Annual Costs (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Overall Annual Energy Savings (MWh),Overall Peak Demand Savings (MW)
0,Aiken Electric Coop Inc,SC,SC,1686.0,206.0,0.4,0.0,18796.5,27204.0,0.4,...,443.0,118.0,37.0,190.0,487.0,145.0,348.0,12.574,Low,Low
1,PowerSouth Energy Cooperative,AL,SOCO,1686.0,2550.0,0.4,2.1,18796.5,28552.0,0.4,...,848.0,118.0,526.0,190.0,848.0,145.0,526.0,12.574,Moderate,Moderate
2,Alabama Power Co,AL,SOCO,759.0,5767.0,0.2,11.9,3797.0,45489.0,0.2,...,1225.0,151.0,1095.0,36.0,1225.0,151.0,1095.0,5.0,Moderate,High
3,Alameda Municipal Power,CA,CISO,111.0,146.0,0.0,0.1,1460.0,1958.0,0.0,...,70.0,146.0,321.0,23.0,70.0,146.0,321.0,13.269,Low,Low
4,City of Alexandria - (MN),MN,MISO,673.0,3306.0,0.1,0.7,8564.0,43191.0,0.1,...,187.0,38.0,194.0,36.0,187.0,38.0,194.0,12.727,Moderate,Moderate


In [22]:
energy_efficiency.columns


Index(['Utility Name', 'State', 'BA Code', 'Commercial Annual Savings (MWh)',
       'Total Annual Savings (MWh)',
       'Commercial Peak Demand Annual Savings (MW)',
       'Total Peak Demand Annual Savings (MW)', 'Commercial Cum Savings (MWh)',
       'Total Cum Savings (MWh)', 'Commercial Cum Peak Demand Savings (MWh)',
       'Total Cum Peak Demand Savings (MWh)',
       'Commercial Annual Incent Cost (Thousand $)',
       'Total Annual Incent Cost (Thousand $)',
       'Commercial Non-Incentive Annual Costs (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Incentive Cum Costs (Thousand $)',
       'Total Incentive Cum Costs (Thousand $)',
       'Commercial Non-Incentive Cum Costs (Thousand $)',
       'Total Non-Incentive Cum Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)',
       'Overall Annual Energy Savings (MWh)',
       'Overall Peak Demand Savings (MW)'],
      dtype='object')

Let's aggregate some features to simplify the dataframe

In [23]:
# Simplified aggregation with total values only
state_efficiency = energy_efficiency.groupby('State').agg({
    'Total Annual Savings (MWh)': 'sum',
    'Total Peak Demand Annual Savings (MW)': 'sum'
}).reset_index()

# Create categories for both metrics
conditions_energy = [
    state_efficiency['Total Annual Savings (MWh)'] >= state_efficiency['Total Annual Savings (MWh)'].quantile(0.75),
    (state_efficiency['Total Annual Savings (MWh)'] >= state_efficiency['Total Annual Savings (MWh)'].quantile(0.25)),
    state_efficiency['Total Annual Savings (MWh)'] < state_efficiency['Total Annual Savings (MWh)'].quantile(0.25)
]
choices = ['High Savings', 'Moderate Savings', 'Low Savings']
state_efficiency['State_Total_Energy_Savings'] = np.select(conditions_energy, choices, default='Unknown')

conditions_peak = [
    state_efficiency['Total Peak Demand Annual Savings (MW)'] >= state_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.75),
    (state_efficiency['Total Peak Demand Annual Savings (MW)'] >= state_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)),
    state_efficiency['Total Peak Demand Annual Savings (MW)'] < state_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)
]
state_efficiency['State_Total_Peak_Savings'] = np.select(conditions_peak, choices, default='Unknown')

In [24]:
# Merge back to original dataframe
energy_efficiency = pd.merge(energy_efficiency, state_efficiency[['State', 'State_Total_Energy_Savings', 'State_Total_Peak_Savings']], 
                           on='State', how='left')
energy_efficiency.head()

Unnamed: 0,Utility Name,State,BA Code,Commercial Annual Savings (MWh),Total Annual Savings (MWh),Commercial Peak Demand Annual Savings (MW),Total Peak Demand Annual Savings (MW),Commercial Cum Savings (MWh),Total Cum Savings (MWh),Commercial Cum Peak Demand Savings (MWh),...,Total Non-Incentive Annual Costs (Thousand $),Commercial Incentive Cum Costs (Thousand $),Total Incentive Cum Costs (Thousand $),Commercial Non-Incentive Cum Costs (Thousand $),Total Non-Incentive Cum Costs (Thousand $),Commercial Weighted Avg Life (Yrs),Overall Annual Energy Savings (MWh),Overall Peak Demand Savings (MW),State_Total_Energy_Savings,State_Total_Peak_Savings
0,Aiken Electric Coop Inc,SC,SC,1686.0,206.0,0.4,0.0,18796.5,27204.0,0.4,...,37.0,190.0,487.0,145.0,348.0,12.574,Low,Low,Moderate Savings,Moderate Savings
1,PowerSouth Energy Cooperative,AL,SOCO,1686.0,2550.0,0.4,2.1,18796.5,28552.0,0.4,...,526.0,190.0,848.0,145.0,526.0,12.574,Moderate,Moderate,Low Savings,Moderate Savings
2,Alabama Power Co,AL,SOCO,759.0,5767.0,0.2,11.9,3797.0,45489.0,0.2,...,1095.0,36.0,1225.0,151.0,1095.0,5.0,Moderate,High,Low Savings,Moderate Savings
3,Alameda Municipal Power,CA,CISO,111.0,146.0,0.0,0.1,1460.0,1958.0,0.0,...,321.0,23.0,70.0,146.0,321.0,13.269,Low,Low,High Savings,High Savings
4,City of Alexandria - (MN),MN,MISO,673.0,3306.0,0.1,0.7,8564.0,43191.0,0.1,...,194.0,36.0,187.0,38.0,194.0,12.727,Moderate,Moderate,High Savings,High Savings


In [25]:
# Keep only relevant columns
energy_efficiency_clean = energy_efficiency[['State', 'Utility Name', 'BA Code', 'Overall Annual Energy Savings (MWh)',
                                            'Overall Peak Demand Savings (MW)',
                                           'State_Total_Energy_Savings', 'State_Total_Peak_Savings']]

energy_efficiency_clean.head()

Unnamed: 0,State,Utility Name,BA Code,Overall Annual Energy Savings (MWh),Overall Peak Demand Savings (MW),State_Total_Energy_Savings,State_Total_Peak_Savings
0,SC,Aiken Electric Coop Inc,SC,Low,Low,Moderate Savings,Moderate Savings
1,AL,PowerSouth Energy Cooperative,SOCO,Moderate,Moderate,Low Savings,Moderate Savings
2,AL,Alabama Power Co,SOCO,Moderate,High,Low Savings,Moderate Savings
3,CA,Alameda Municipal Power,CISO,Low,Low,High Savings,High Savings
4,MN,City of Alexandria - (MN),MISO,Moderate,Moderate,High Savings,High Savings


In [26]:
# Rename columns for more clarity
energy_efficiency_clean = energy_efficiency_clean.rename(columns={'Overall Annual Energy Savings (MWh)': 'Utility Annual Energy Savings',
                                                                  'Overall Peak Demand Savings (MW)':'Utility Peak Demand Savings'})
energy_efficiency_clean.head()

Unnamed: 0,State,Utility Name,BA Code,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings
0,SC,Aiken Electric Coop Inc,SC,Low,Low,Moderate Savings,Moderate Savings
1,AL,PowerSouth Energy Cooperative,SOCO,Moderate,Moderate,Low Savings,Moderate Savings
2,AL,Alabama Power Co,SOCO,Moderate,High,Low Savings,Moderate Savings
3,CA,Alameda Municipal Power,CISO,Low,Low,High Savings,High Savings
4,MN,City of Alexandria - (MN),MISO,Moderate,Moderate,High Savings,High Savings


In [27]:
energy_efficiency_clean.isna().value_counts()

State  Utility Name  BA Code  Utility Annual Energy Savings  Utility Peak Demand Savings  State_Total_Energy_Savings  State_Total_Peak_Savings
False  False         False    False                          False                        False                       False                       457
Name: count, dtype: int64

In [28]:
len(energy_efficiency_clean['State'].value_counts())

51

In [29]:
# Merge net_gen_reliability and energy_efficiency on the State column
merged_2 = pd.merge(net_gen_reliability, energy_efficiency_clean, on='State', how='outer')

merged_2.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,...,SAIDI Major Events,SAIFI Major Events,SAIDI No Major Events,SAIFI No Major Events,Utility Name,BA Code,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings
0,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Moderate Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings
1,AL,High,High,High,Low,High,Growing,East South Central,80.0,186.9,...,Moderate Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency,PowerSouth Energy Cooperative,SOCO,Moderate,Moderate,Low Savings,Moderate Savings
2,AL,High,High,High,Low,High,Growing,East South Central,80.0,186.9,...,Moderate Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency,Alabama Power Co,SOCO,Moderate,High,Low Savings,Moderate Savings
3,AL,High,High,High,Low,High,Growing,East South Central,80.0,186.9,...,Moderate Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency,Dixie Electric Coop - (AL),SOCO,Low,Moderate,Low Savings,Moderate Savings
4,AL,High,High,High,Low,High,Growing,East South Central,80.0,186.9,...,Moderate Interruption Duration,Moderate Interruption Frequency,Moderate Interruption Duration,Moderate Interruption Frequency,Tennessee Valley Authority,TVA,Moderate,Moderate,Low Savings,Moderate Savings


In [30]:
merged_2[merged_2['BA Code'] == 'Unknown']

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,...,SAIDI Major Events,SAIFI Major Events,SAIDI No Major Events,SAIFI No Major Events,Utility Name,BA Code,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings
0,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Moderate Interruption Duration,High Interruption Frequency,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings
34,CA,High,High,High,High,High,Shrinking,Pacific Contiguous,100.2,248.1,...,Moderate Interruption Duration,Moderate Interruption Frequency,Long Interruption Duration,Moderate Interruption Frequency,City of Redding - (CA),Unknown,Low,Low,High Savings,High Savings
86,FL,High,High,Moderate,High,High,Shrinking,South Atlantic,100.2,157.7,...,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency,Fort Pierce Utilities Authority,Unknown,Low,Low,Moderate Savings,High Savings
91,FL,High,High,Moderate,High,High,Shrinking,South Atlantic,100.2,157.7,...,Short Interruption Duration,Low Interruption Frequency,Short Interruption Duration,Low Interruption Frequency,Kissimmee Utility Authority,Unknown,Low,High,Moderate Savings,High Savings
116,HI,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,99.9,275.1,...,Long Interruption Duration,Moderate Interruption Frequency,Long Interruption Duration,High Interruption Frequency,Kauai Island Utility Cooperative,Unknown,Moderate,Moderate,Low Savings,Low Savings
117,HI,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,99.9,275.1,...,Long Interruption Duration,Moderate Interruption Frequency,Long Interruption Duration,High Interruption Frequency,Hawaii Energy Efficiency Program,Unknown,High,High,Low Savings,Low Savings


In [31]:
merged_2.isna().value_counts()

State  Total_Generation_Rank  Electric_Generation_Rank  Independent_Generation_Rank  Commercial_Generation_Rank  Industrial_Generation_Rank  Capacity_Growth_Status  Census Division  Percent of Customers Reported w/ Major Events  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIDI Major Events  SAIFI Major Events  SAIDI No Major Events  SAIFI No Major Events  Utility Name  BA Code  Utility Annual Energy Savings  Utility Peak Demand Savings  State_Total_Energy_Savings  State_Total_Peak_Savings
False  False                  False                     False                        False                       False                       False                   False            False                                          False                  False                   False               False               False                  False                  False         False    False                          False                        False                       False                       4

In [32]:
merged_2.isin(['Unknown']).sum()

State                                            0
Total_Generation_Rank                            0
Electric_Generation_Rank                         0
Independent_Generation_Rank                      0
Commercial_Generation_Rank                       0
Industrial_Generation_Rank                       0
Capacity_Growth_Status                           0
Census Division                                  0
Percent of Customers Reported w/ Major Events    0
CAIDI w/ Major Events                            0
CAIDI w/o Major Events                           0
SAIDI Major Events                               0
SAIFI Major Events                               0
SAIDI No Major Events                            0
SAIFI No Major Events                            0
Utility Name                                     0
BA Code                                          6
Utility Annual Energy Savings                    0
Utility Peak Demand Savings                      0
State_Total_Energy_Savings     

### Loading in the regional Demand data for joining

In [33]:
demand = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Regional_Demand_2023.csv', index_col=0)
demand.head()

Unnamed: 0,Region,State,Annual_Region_Demand_MWh
0,CAL,CA,726482.230137
1,CAL,CA,726482.230137
2,CAL,CA,726482.230137
3,CAL,CA,726482.230137
4,CAL,CA,726482.230137


In [34]:
# Merge demand with merged 2

merged_3 = pd.merge(merged_2, demand, on='State', how='outer')
merged_3.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,...,SAIDI No Major Events,SAIFI No Major Events,Utility Name,BA Code,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings,Region,Annual_Region_Demand_MWh
0,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247
1,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247
2,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247
3,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247
4,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Long Interruption Duration,High Interruption Frequency,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247


In [35]:
ixp = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_IXP.csv', index_col=0)
ixp.head()

Unnamed: 0,IXP Name,City,State
0,48 IX,Phoenix,AZ
1,ABQIX,Albuquerque,NM
2,AlaskaIX,Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,Portland,OR


In [36]:
ixp.isna().value_counts()

IXP Name  City   State
False     False  False    196
                 True       3
Name: count, dtype: int64

In [37]:
ixp[ixp['State'].isna()]

Unnamed: 0,IXP Name,City,State
21,BroadBand Internet Exchange US-West - BBIX US-...,West Coast,
32,CoreSite - Any2East,Washington DC,
74,Equinix Internet Exchange Seattle - Equinix Se...,Seattle,


In [38]:
ixp.loc[(ixp['City'] == 'Seattle') & (ixp['State'] == 'Washington'), 'City'] = 'Washington'

In [39]:
# Fix Seattle, Washington data
ixp.loc[ixp['City'] == 'Seattle', 'State'] = 'WA'

# Fix Washington DC data
ixp.loc[ixp['City'] == 'Washington DC', ['City', 'State']] = ['Washington', 'DC']

#Drop rows where City is West Coast
ixp = ixp[ixp['City'] != 'West Coast']

In [40]:
ixp[ixp['City'] == 'Seattle']

Unnamed: 0,IXP Name,City,State
74,Equinix Internet Exchange Seattle - Equinix Se...,Seattle,WA
123,Megaport MegaIX Seattle,Seattle,WA
132,Moe Internet Exchange - Seattle - MoeIX SEA,Seattle,WA
160,Pacific Wave Exchange in LA,Seattle,WA
171,Seattle Internet Exchange (MTU 1500) - SIX Sea...,Seattle,WA
172,Seattle Internet Exchange (MTU 9000) - SIX Sea...,Seattle,WA


In [41]:
ixp[ixp['City'] == 'Washington']

Unnamed: 0,IXP Name,City,State
32,CoreSite - Any2East,Washington,DC
85,Global Peer Exchange,Washington,DC


In [42]:
ixp['State'].value_counts()

State
CA       30
TX       21
VA       13
IL       11
NY       11
FL       10
AZ        8
MO        8
OR        7
WA        7
GA        5
CO        5
OH        5
NV        4
MA        4
NC        4
IA        4
VT        3
UT        3
PA        3
HI        3
MI        3
MN        2
MD        2
DC        2
CT        2
WI        2
AL        2
WV        1
KY        1
NE        1
ME        1
AK        1
TN        1
NM        1
IN        1
NJ        1
KS        1
OK        1
ND        1
Texas     1
MT        1
Name: count, dtype: int64

In [43]:
ixp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 0 to 198
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   IXP Name  198 non-null    object
 1   City      198 non-null    object
 2   State     198 non-null    object
dtypes: object(3)
memory usage: 6.2+ KB


In [44]:
# Merge ixp data
merged_4 = pd.merge(merged_3, ixp, on='State', how='outer')

merged_4.isna().value_counts()

State  Total_Generation_Rank  Electric_Generation_Rank  Independent_Generation_Rank  Commercial_Generation_Rank  Industrial_Generation_Rank  Capacity_Growth_Status  Census Division  Percent of Customers Reported w/ Major Events  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIDI Major Events  SAIFI Major Events  SAIDI No Major Events  SAIFI No Major Events  Utility Name  BA Code  Utility Annual Energy Savings  Utility Peak Demand Savings  State_Total_Energy_Savings  State_Total_Peak_Savings  Region  Annual_Region_Demand_MWh  IXP Name  City 
False  False                  False                     False                        False                       False                       False                   False            False                                          False                  False                   False               False               False                  False                  False         False    False                          False                        False 

In [45]:
merged_4.head(10)

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,...,Utility Name,BA Code,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings,Region,Annual_Region_Demand_MWh,IXP Name,City
0,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
1,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
2,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
3,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
4,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
5,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
6,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
7,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
8,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage
9,AK,Low,Low,Low,High,Low,Growing,Pacific Noncontiguous,84.5,138.8,...,Golden Valley Elec Assn Inc,Unknown,Low,Low,Low Savings,Low Savings,NW,983874.734247,AlaskaIX,Anchorage


In [46]:
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096948 entries, 0 to 1096947
Data columns (total 25 columns):
 #   Column                                         Non-Null Count    Dtype  
---  ------                                         --------------    -----  
 0   State                                          1096948 non-null  object 
 1   Total_Generation_Rank                          1096947 non-null  object 
 2   Electric_Generation_Rank                       1096947 non-null  object 
 3   Independent_Generation_Rank                    1096947 non-null  object 
 4   Commercial_Generation_Rank                     1096947 non-null  object 
 5   Industrial_Generation_Rank                     1096947 non-null  object 
 6   Capacity_Growth_Status                         1096947 non-null  object 
 7   Census Division                                1096947 non-null  object 
 8   Percent of Customers Reported w/ Major Events  1096947 non-null  float64
 9   CAIDI w/ Major Events   

In [47]:
missing_states = merged_4[merged_4.isna().any(axis=1)]['State'].unique()
missing_states

array(['AR', 'CO', 'DE', 'HI', 'ID', 'KY', 'LA', 'MS', 'MT', 'NH', 'RI',
       'SC', 'SD', 'TN', 'Texas', 'WV', 'WY'], dtype=object)

In [49]:
# Group by state to get IXP count per state
ixp_counts = merged_4.groupby('State')['IXP Name'].nunique().reset_index()
ixp_counts.columns = ['State', 'IXP_Count']

# Merge back the IXP counts
merged_4 = merged_4.merge(ixp_counts, on='State', how='left')

In [50]:
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096948 entries, 0 to 1096947
Data columns (total 27 columns):
 #   Column                                         Non-Null Count    Dtype  
---  ------                                         --------------    -----  
 0   State                                          1096948 non-null  object 
 1   Total_Generation_Rank                          1096947 non-null  object 
 2   Electric_Generation_Rank                       1096947 non-null  object 
 3   Independent_Generation_Rank                    1096947 non-null  object 
 4   Commercial_Generation_Rank                     1096947 non-null  object 
 5   Industrial_Generation_Rank                     1096947 non-null  object 
 6   Capacity_Growth_Status                         1096947 non-null  object 
 7   Census Division                                1096947 non-null  object 
 8   Percent of Customers Reported w/ Major Events  1096947 non-null  float64
 9   CAIDI w/ Major Events   

In [52]:
merged_4['IXP_Count'].value_counts()

IXP_Count
27    350400
18    160965
7     120085
11     99280
4      87600
10     83950
8      49640
3      36141
5      32950
2      32120
1      24107
0      19710
Name: count, dtype: int64

In [53]:
merged_4['IXP Name'] = merged_4['IXP Name'].fillna('No IXP')
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096948 entries, 0 to 1096947
Data columns (total 27 columns):
 #   Column                                         Non-Null Count    Dtype  
---  ------                                         --------------    -----  
 0   State                                          1096948 non-null  object 
 1   Total_Generation_Rank                          1096947 non-null  object 
 2   Electric_Generation_Rank                       1096947 non-null  object 
 3   Independent_Generation_Rank                    1096947 non-null  object 
 4   Commercial_Generation_Rank                     1096947 non-null  object 
 5   Industrial_Generation_Rank                     1096947 non-null  object 
 6   Capacity_Growth_Status                         1096947 non-null  object 
 7   Census Division                                1096947 non-null  object 
 8   Percent of Customers Reported w/ Major Events  1096947 non-null  float64
 9   CAIDI w/ Major Events   

In [56]:
merged_4[merged_4['City'].isna() == True]

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Independent_Generation_Rank,Commercial_Generation_Rank,Industrial_Generation_Rank,Capacity_Growth_Status,Census Division,Percent of Customers Reported w/ Major Events,CAIDI w/ Major Events,...,Utility Annual Energy Savings,Utility Peak Demand Savings,State_Total_Energy_Savings,State_Total_Peak_Savings,Region,Annual_Region_Demand_MWh,IXP Name,City,IXP_Name,IXP_Count
4015,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,...,High,High,Moderate Savings,Moderate Savings,SE,628665.969863,No IXP,,No IXP,0
4016,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,...,High,High,Moderate Savings,Moderate Savings,SE,628665.969863,No IXP,,No IXP,0
4017,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,...,High,High,Moderate Savings,Moderate Savings,SE,628665.969863,No IXP,,No IXP,0
4018,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,...,High,High,Moderate Savings,Moderate Savings,SE,628665.969863,No IXP,,No IXP,0
4019,AR,Moderate,Moderate,Low,Low,Moderate,Shrinking,West South Central,89.3,433.4,...,High,High,Moderate Savings,Moderate Savings,SE,628665.969863,No IXP,,No IXP,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096943,WY,Low,Moderate,Low,Low,Moderate,Shrinking,Mountain,75.9,127.8,...,Moderate,Moderate,Low Savings,Low Savings,NW,983874.734247,No IXP,,No IXP,0
1096944,WY,Low,Moderate,Low,Low,Moderate,Shrinking,Mountain,75.9,127.8,...,Moderate,Moderate,Low Savings,Low Savings,NW,983874.734247,No IXP,,No IXP,0
1096945,WY,Low,Moderate,Low,Low,Moderate,Shrinking,Mountain,75.9,127.8,...,Moderate,Moderate,Low Savings,Low Savings,NW,983874.734247,No IXP,,No IXP,0
1096946,WY,Low,Moderate,Low,Low,Moderate,Shrinking,Mountain,75.9,127.8,...,Moderate,Moderate,Low Savings,Low Savings,NW,983874.734247,No IXP,,No IXP,0


There's 19,710 rows with no city level data for IXPs. For now I'll keep them as NaNs until after I've merged all the other city-level data.

### Loading Retail Demand