# Joining the Data

It is now time for the scary part. But we never back away from a challenge! Let's say a prayer and get ready to join the data.

In [1]:
import pandas as pd
import numpy as np

# Merge 1

### Loading in Utility Scale Net Generation Data and Reliability of Distribution Systems Date 

In [2]:
net_gen = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Utility_Scale_Net_Generation.csv', index_col=0)

net_gen.head()



Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Commercial_Generation_Rank,Capacity_Growth_Status
0,CT,Moderate,Low,Moderate,Growing
2,MA,Low,Low,High,Growing
4,RI,Low,Low,Low,Shrinking
5,VT,Low,Low,Low,Shrinking
6,NJ,Moderate,Low,High,Shrinking


In [3]:
net_gen['State'].value_counts().sum()

39

In [4]:
reliability = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Reliability_Distribution.csv', index_col=0)

reliability.head()

Unnamed: 0,Census Division,State,Percent of Customers Reported w/ Major Events,SAIDI w/ Major Events,SAIFI w/ Major Events,CAIDI w/ Major Events,SAIDI w/o Major Events,SAIFI w/o Major Events,CAIDI w/o Major Events
0,New England,CT,100.6,164.6,0.9,188.8,70.3,0.7,107.3
1,New England,ME,100.8,1863.0,3.3,562.2,247.4,2.0,125.0
2,New England,MA,88.2,259.4,1.1,245.4,82.2,0.8,105.4
3,New England,NH,99.3,645.8,1.8,368.5,124.6,1.0,126.1
4,New England,RI,98.3,104.6,0.8,133.7,52.2,0.7,78.3


In [5]:
reliability.corr(numeric_only=True)

Unnamed: 0,Percent of Customers Reported w/ Major Events,SAIDI w/ Major Events,SAIFI w/ Major Events,CAIDI w/ Major Events,SAIDI w/o Major Events,SAIFI w/o Major Events,CAIDI w/o Major Events
Percent of Customers Reported w/ Major Events,1.0,0.155055,-0.011626,0.245667,0.005549,-0.077099,0.106543
SAIDI w/ Major Events,0.155055,1.0,0.832496,0.887799,0.65247,0.644317,0.427449
SAIFI w/ Major Events,-0.011626,0.832496,1.0,0.57391,0.859044,0.939123,0.365608
CAIDI w/ Major Events,0.245667,0.887799,0.57391,1.0,0.438757,0.34057,0.490745
SAIDI w/o Major Events,0.005549,0.65247,0.859044,0.438757,1.0,0.91104,0.67892
SAIFI w/o Major Events,-0.077099,0.644317,0.939123,0.34057,0.91104,1.0,0.35766
CAIDI w/o Major Events,0.106543,0.427449,0.365608,0.490745,0.67892,0.35766,1.0


In [6]:
# Drop highly correlated metrics
reliability = reliability.drop(columns=['SAIDI w/ Major Events', 'SAIDI w/o Major Events', 'SAIFI w/o Major Events', 'Percent of Customers Reported w/ Major Events', 'Census Division'])
reliability.head()

Unnamed: 0,State,SAIFI w/ Major Events,CAIDI w/ Major Events,CAIDI w/o Major Events
0,CT,0.9,188.8,107.3
1,ME,3.3,562.2,125.0
2,MA,1.1,245.4,105.4
3,NH,1.8,368.5,126.1
4,RI,0.8,133.7,78.3


In [7]:
reliability.describe()

Unnamed: 0,SAIFI w/ Major Events,CAIDI w/ Major Events,CAIDI w/o Major Events
count,51.0,51.0,51.0
mean,1.386275,226.466667,116.519608
std,0.636873,127.799489,23.595296
min,0.4,76.2,70.8
25%,0.9,134.85,101.35
50%,1.1,186.9,118.0
75%,1.8,282.2,124.95
max,3.3,723.5,206.2


In [8]:
# Bin SAIFI (Average Interruption Frequency Index) w/ Major Events
conditions_saifi = [
    reliability['SAIFI w/ Major Events'] > 1.8,  # 75th percentile
    (reliability['SAIFI w/ Major Events'] >= 1.1) & (reliability['SAIFI w/ Major Events'] <= 1.8),  # 50th-75th
    reliability['SAIFI w/ Major Events'] < 1.1  # Below median
]
choices_saifi = ['High', 'Moderate', 'Low']
reliability['SAIFI Major Events'] = np.select(conditions_saifi, choices_saifi, default='Unknown')

In [9]:
reliability.head()

Unnamed: 0,State,SAIFI w/ Major Events,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events
0,CT,0.9,188.8,107.3,Low
1,ME,3.3,562.2,125.0,High
2,MA,1.1,245.4,105.4,Moderate
3,NH,1.8,368.5,126.1,Moderate
4,RI,0.8,133.7,78.3,Low


In [10]:
reliability = reliability.drop(columns=['SAIFI w/ Major Events'])
reliability.head()

Unnamed: 0,State,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events
0,CT,188.8,107.3,Low
1,ME,562.2,125.0,High
2,MA,245.4,105.4,Moderate
3,NH,368.5,126.1,Moderate
4,RI,133.7,78.3,Low


In [11]:
# Count duplicates of State
reliability['State'].value_counts().sum()

51

In [12]:
# Conduct an outer join on the State column
net_gen_reliability = pd.merge(net_gen, reliability, on='State', how='outer')

net_gen_reliability.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Commercial_Generation_Rank,Capacity_Growth_Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events
0,AK,Low,Low,High,Growing,138.8,137.6,High
1,AL,High,High,Low,Growing,186.9,122.8,Moderate
2,AR,,,,,433.4,144.0,High
3,AZ,High,High,Moderate,Growing,108.6,84.8,Low
4,CA,High,High,High,Shrinking,248.1,140.0,Moderate


In [13]:
net_gen_reliability.isna().value_counts()

State  Total_Generation_Rank  Electric_Generation_Rank  Commercial_Generation_Rank  Capacity_Growth_Status  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIFI Major Events
False  False                  False                     False                       False                   False                  False                   False                 39
       True                   True                      True                        True                    False                  False                   False                 12
Name: count, dtype: int64

In [14]:
net_gen_reliability.fillna('Unknown', inplace=True)
net_gen_reliability.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Commercial_Generation_Rank,Capacity_Growth_Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events
0,AK,Low,Low,High,Growing,138.8,137.6,High
1,AL,High,High,Low,Growing,186.9,122.8,Moderate
2,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High
3,AZ,High,High,Moderate,Growing,108.6,84.8,Low
4,CA,High,High,High,Shrinking,248.1,140.0,Moderate


## Load in the Energy Efficiency data

In [15]:
energy_efficiency = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Efficiency.csv', index_col=0)

energy_efficiency.head()

Unnamed: 0,Utility Number,Utility Name,State,BA Code,Total Annual Savings (MWh),Total Peak Demand Annual Savings (MW),Total Annual Incent Cost (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
2,195,Alabama Power Co,AL,SOCO,5767.0,11.9,1225.0,1095.0,5.0
3,207,Alameda Municipal Power,CA,CISO,146.0,0.1,70.0,321.0,13.269
4,295,City of Alexandria - (MN),MN,MISO,3306.0,0.7,187.0,194.0,12.727
5,554,City of Ames - (IA),IA,MISO,189.0,0.1,197.0,54.0,10.0
6,590,City of Anaheim - (CA),CA,CISO,7242.0,1.5,3338.0,804.0,11.437


In [16]:
energy_efficiency.columns

Index(['Utility Number', 'Utility Name', 'State', 'BA Code',
       'Total Annual Savings (MWh)', 'Total Peak Demand Annual Savings (MW)',
       'Total Annual Incent Cost (Thousand $)',
       'Total Non-Incentive Annual Costs (Thousand $)',
       'Commercial Weighted Avg Life (Yrs)'],
      dtype='object')

In [17]:
energy_efficiency.corr(numeric_only=True)

Unnamed: 0,Utility Number,Total Annual Savings (MWh),Total Peak Demand Annual Savings (MW),Total Annual Incent Cost (Thousand $),Total Non-Incentive Annual Costs (Thousand $),Commercial Weighted Avg Life (Yrs)
Utility Number,1.0,0.021878,0.025283,0.148453,0.083285,0.001019
Total Annual Savings (MWh),0.021878,1.0,0.473292,0.639695,0.812012,0.112121
Total Peak Demand Annual Savings (MW),0.025283,0.473292,1.0,0.278101,0.402326,0.037236
Total Annual Incent Cost (Thousand $),0.148453,0.639695,0.278101,1.0,0.600819,0.024366
Total Non-Incentive Annual Costs (Thousand $),0.083285,0.812012,0.402326,0.600819,1.0,0.018572
Commercial Weighted Avg Life (Yrs),0.001019,0.112121,0.037236,0.024366,0.018572,1.0


In [18]:
# Drop the year and Utility number columns
energy_efficiency = energy_efficiency.drop(columns=['Utility Number', 'Utility Name','Total Non-Incentive Annual Costs (Thousand $)'])

energy_efficiency.head()

Unnamed: 0,State,BA Code,Total Annual Savings (MWh),Total Peak Demand Annual Savings (MW),Total Annual Incent Cost (Thousand $),Commercial Weighted Avg Life (Yrs)
2,AL,SOCO,5767.0,11.9,1225.0,5.0
3,CA,CISO,146.0,0.1,70.0,13.269
4,MN,MISO,3306.0,0.7,187.0,12.727
5,IA,MISO,189.0,0.1,197.0,10.0
6,CA,CISO,7242.0,1.5,3338.0,11.437


In [19]:
energy_efficiency.describe()

Unnamed: 0,Total Annual Savings (MWh),Total Peak Demand Annual Savings (MW),Total Annual Incent Cost (Thousand $),Commercial Weighted Avg Life (Yrs)
count,328.0,328.0,328.0,328.0
mean,72988.2,17.305183,11393.07622,12.505122
std,187904.9,62.95309,32636.461121,4.285681
min,12.0,0.0,0.0,0.0
25%,1214.5,0.2,190.25,11.0
50%,5769.5,1.5,1109.0,12.7
75%,57689.0,11.45,7041.75,14.3125
max,1718664.0,945.0,372401.0,27.525


** Note that MW measures power capacity while MWh measures the amount of electricity delivered overtime

In [20]:
# Bin Total Annual Savings (MWh)
conditions_savings = [
    energy_efficiency['Total Annual Savings (MWh)'] >= energy_efficiency['Total Annual Savings (MWh)'].quantile(0.75),
    (energy_efficiency['Total Annual Savings (MWh)'] >= energy_efficiency['Total Annual Savings (MWh)'].quantile(0.25)) & 
    (energy_efficiency['Total Annual Savings (MWh)'] < energy_efficiency['Total Annual Savings (MWh)'].quantile(0.75)),
    energy_efficiency['Total Annual Savings (MWh)'] < energy_efficiency['Total Annual Savings (MWh)'].quantile(0.25)
]
choices_savings = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Annual Energy Savings (MWh)'] = np.select(conditions_savings, choices_savings, default='Unknown')

# Bin Peak Demand Savings
conditions_peak = [
    energy_efficiency['Total Peak Demand Annual Savings (MW)'] >= energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.75),
    (energy_efficiency['Total Peak Demand Annual Savings (MW)'] >= energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)) & 
    (energy_efficiency['Total Peak Demand Annual Savings (MW)'] < energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.75)),
    energy_efficiency['Total Peak Demand Annual Savings (MW)'] < energy_efficiency['Total Peak Demand Annual Savings (MW)'].quantile(0.25)
]
choices_peak = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Peak Demand Savings (MW)'] = np.select(conditions_peak, choices_peak, default='Unknown')

In [21]:
# Bin Total Annual Incent Cost
conditions_incent = [
    energy_efficiency['Total Annual Incent Cost (Thousand $)'] >= energy_efficiency['Total Annual Incent Cost (Thousand $)'].quantile(0.75),
    (energy_efficiency['Total Annual Incent Cost (Thousand $)'] >= energy_efficiency['Total Annual Incent Cost (Thousand $)'].quantile(0.25)) & 
    (energy_efficiency['Total Annual Incent Cost (Thousand $)'] < energy_efficiency['Total Annual Incent Cost (Thousand $)'].quantile(0.75)),
    energy_efficiency['Total Annual Incent Cost (Thousand $)'] < energy_efficiency['Total Annual Incent Cost (Thousand $)'].quantile(0.25)
]
choices_incent = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Annual Incentive Cost'] = np.select(conditions_incent, choices_incent, default='Unknown')

# Bin Commercial Weighted Avg Life
conditions_life = [
    energy_efficiency['Commercial Weighted Avg Life (Yrs)'] >= energy_efficiency['Commercial Weighted Avg Life (Yrs)'].quantile(0.75),
    (energy_efficiency['Commercial Weighted Avg Life (Yrs)'] >= energy_efficiency['Commercial Weighted Avg Life (Yrs)'].quantile(0.25)) & 
    (energy_efficiency['Commercial Weighted Avg Life (Yrs)'] < energy_efficiency['Commercial Weighted Avg Life (Yrs)'].quantile(0.75)),
    energy_efficiency['Commercial Weighted Avg Life (Yrs)'] < energy_efficiency['Commercial Weighted Avg Life (Yrs)'].quantile(0.25)
]
choices_life = ['High', 'Moderate', 'Low']
energy_efficiency['Overall Commercial Avg Life (Yrs)'] = np.select(conditions_life, choices_life, default='Unknown')


In [22]:
energy_efficiency.head()

Unnamed: 0,State,BA Code,Total Annual Savings (MWh),Total Peak Demand Annual Savings (MW),Total Annual Incent Cost (Thousand $),Commercial Weighted Avg Life (Yrs),Overall Annual Energy Savings (MWh),Overall Peak Demand Savings (MW),Overall Annual Incentive Cost,Overall Commercial Avg Life (Yrs)
2,AL,SOCO,5767.0,11.9,1225.0,5.0,Moderate,High,Moderate,Low
3,CA,CISO,146.0,0.1,70.0,13.269,Low,Low,Low,Moderate
4,MN,MISO,3306.0,0.7,187.0,12.727,Moderate,Moderate,Low,Moderate
5,IA,MISO,189.0,0.1,197.0,10.0,Low,Low,Moderate,Low
6,CA,CISO,7242.0,1.5,3338.0,11.437,Moderate,Moderate,Moderate,Moderate


Let's aggregate some features to simplify the dataframe

In [23]:
# Keep only relevant columns
energy_efficiency_clean = energy_efficiency[['State', 'BA Code', 'Overall Annual Energy Savings (MWh)',
                                            'Overall Peak Demand Savings (MW)', 'Overall Annual Incentive Cost', 'Overall Commercial Avg Life (Yrs)']]

energy_efficiency_clean.head()

Unnamed: 0,State,BA Code,Overall Annual Energy Savings (MWh),Overall Peak Demand Savings (MW),Overall Annual Incentive Cost,Overall Commercial Avg Life (Yrs)
2,AL,SOCO,Moderate,High,Moderate,Low
3,CA,CISO,Low,Low,Low,Moderate
4,MN,MISO,Moderate,Moderate,Low,Moderate
5,IA,MISO,Low,Low,Moderate,Low
6,CA,CISO,Moderate,Moderate,Moderate,Moderate


In [24]:
# Rename columns for more clarity
energy_efficiency_clean = energy_efficiency_clean.rename(columns={'Overall Annual Energy Savings (MWh)': 'Utility Annual Energy Savings (MWh)',
                                                                  'Overall Peak Demand Savings (MW)':'Utility Peak Demand Savings (MW)',
                                                                  'Overall Annual Incentive Cost': 'Utility Annual Incentive Cost ($)',
                                                                  'Overall Commercial Avg Life (Yrs)': 'Utility Commercial Avg Life (Yrs)'})
energy_efficiency_clean.head()

Unnamed: 0,State,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs)
2,AL,SOCO,Moderate,High,Moderate,Low
3,CA,CISO,Low,Low,Low,Moderate
4,MN,MISO,Moderate,Moderate,Low,Moderate
5,IA,MISO,Low,Low,Moderate,Low
6,CA,CISO,Moderate,Moderate,Moderate,Moderate


In [25]:
energy_efficiency_clean.isna().value_counts()

State  BA Code  Utility Annual Energy Savings (MWh)  Utility Peak Demand Savings (MW)  Utility Annual Incentive Cost ($)  Utility Commercial Avg Life (Yrs)
False  False    False                                False                             False                              False                                328
Name: count, dtype: int64

In [26]:
len(energy_efficiency_clean['State'].value_counts())

48

In [27]:
# Merge net_gen_reliability and energy_efficiency on the State column
merged_2 = pd.merge(net_gen_reliability, energy_efficiency_clean, on='State', how='outer')

merged_2.head()

Unnamed: 0,State,Total_Generation_Rank,Electric_Generation_Rank,Commercial_Generation_Rank,Capacity_Growth_Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs)
0,AK,Low,Low,High,Growing,138.8,137.6,High,,,,,
1,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,Moderate,High,Moderate,Low
2,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,Moderate,Moderate,Moderate,Moderate
3,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,MISO,High,High,High,High
4,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,Moderate,Moderate,Moderate


In [28]:
merged_2.rename(columns={'Total_Generation_Rank': 'Total Generation Rank', 'Electric_Generation_Rank': 'Electric Generation Rank', 
                         'Commercial_Generation_Rank': 'Commercial Generation Rank','Capacity_Growth_Status': 'Capacity Growth Status'}, inplace=True)

In [29]:
merged_2 = merged_2.dropna(subset=['BA Code'])

In [30]:
merged_2.isna().value_counts()

State  Total Generation Rank  Electric Generation Rank  Commercial Generation Rank  Capacity Growth Status  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIFI Major Events  BA Code  Utility Annual Energy Savings (MWh)  Utility Peak Demand Savings (MW)  Utility Annual Incentive Cost ($)  Utility Commercial Avg Life (Yrs)
False  False                  False                     False                       False                   False                  False                   False               False    False                                False                             False                              False                                328
Name: count, dtype: int64

In [31]:
merged_2.isin(['Unknown']).sum()

State                                   0
Total Generation Rank                  51
Electric Generation Rank               51
Commercial Generation Rank             51
Capacity Growth Status                 51
CAIDI w/ Major Events                   0
CAIDI w/o Major Events                  0
SAIFI Major Events                      0
BA Code                                 3
Utility Annual Energy Savings (MWh)     0
Utility Peak Demand Savings (MW)        0
Utility Annual Incentive Cost ($)       0
Utility Commercial Avg Life (Yrs)       0
dtype: int64

### Loading in the regional Demand data for joining

In [32]:
demand = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Regional_Demand_2023.csv', index_col=0)
demand.head()

Unnamed: 0,Region,State,Annual_Region_Demand_MWh
0,CAL,CA,726482.230137
1,CAL,CA,726482.230137
2,CAL,CA,726482.230137
3,CAL,CA,726482.230137
4,CAL,CA,726482.230137


In [33]:
demand.describe()

Unnamed: 0,Annual_Region_Demand_MWh
count,16790.0
mean,1029680.0
std,663772.3
min,306856.5
25%,598428.2
50%,769893.6
75%,1920122.0
max,2150311.0


In [34]:
# Bin Annual Region Demand
conditions_demand = [
    demand['Annual_Region_Demand_MWh'] >= demand['Annual_Region_Demand_MWh'].quantile(0.75),
    (demand['Annual_Region_Demand_MWh'] >= demand['Annual_Region_Demand_MWh'].quantile(0.25)) & 
    (demand['Annual_Region_Demand_MWh'] < demand['Annual_Region_Demand_MWh'].quantile(0.75)),
    demand['Annual_Region_Demand_MWh'] < demand['Annual_Region_Demand_MWh'].quantile(0.25)
]
choices_demand = ['High', 'Moderate', 'Low']
demand['Regional Electricity Demand'] = np.select(conditions_demand, choices_demand, default='Unknown')
demand.head()

Unnamed: 0,Region,State,Annual_Region_Demand_MWh,Regional Electricity Demand
0,CAL,CA,726482.230137,Moderate
1,CAL,CA,726482.230137,Moderate
2,CAL,CA,726482.230137,Moderate
3,CAL,CA,726482.230137,Moderate
4,CAL,CA,726482.230137,Moderate


In [35]:
demand = demand.drop(columns=['Annual_Region_Demand_MWh'])

In [36]:
demand.shape

(16790, 3)

In [37]:
demand.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16790 entries, 0 to 16789
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Region                       16790 non-null  object
 1   State                        16790 non-null  object
 2   Regional Electricity Demand  16790 non-null  object
dtypes: object(3)
memory usage: 524.7+ KB


In [38]:
# Drop duplicates of region and state
demand = demand.drop_duplicates(subset=['Region', 'State'])

# Verify the result
print(demand.shape)
demand.info()

(46, 3)
<class 'pandas.core.frame.DataFrame'>
Index: 46 entries, 0 to 16425
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Region                       46 non-null     object
 1   State                        46 non-null     object
 2   Regional Electricity Demand  46 non-null     object
dtypes: object(3)
memory usage: 1.4+ KB


In [39]:
# Merge demand with merged 2

merged_3 = pd.merge(merged_2, demand, on='State', how='outer')
merged_3.head()

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand
0,AK,,,,,,,,,,,,,NW,Moderate
1,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,Moderate,High,Moderate,Low,SE,Moderate
2,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,Moderate,Moderate,Moderate,Moderate,SE,Moderate
3,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,MISO,High,High,High,High,SE,Moderate
4,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,Moderate,Moderate,Moderate,SE,Moderate


In [40]:
merged_3[merged_3['Regional Electricity Demand'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand
37,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Low,Moderate,Moderate,High,,
38,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,High,,
39,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Low,,
40,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
41,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,Moderate,,
42,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
43,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
44,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Low,Low,Low,Moderate,,
45,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,Moderate,,
46,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,High,,


In [41]:
merged_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                338 non-null    object 
 1   Total Generation Rank                336 non-null    object 
 2   Electric Generation Rank             336 non-null    object 
 3   Commercial Generation Rank           336 non-null    object 
 4   Capacity Growth Status               336 non-null    object 
 5   CAIDI w/ Major Events                336 non-null    float64
 6   CAIDI w/o Major Events               336 non-null    float64
 7   SAIFI Major Events                   336 non-null    object 
 8   BA Code                              336 non-null    object 
 9   Utility Annual Energy Savings (MWh)  336 non-null    object 
 10  Utility Peak Demand Savings (MW)     336 non-null    object 
 11  Utility Annual Incentive Cost ($

In [42]:
merged_3[merged_3['Total Generation Rank'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand
0,AK,,,,,,,,,,,,,NW,Moderate
110,KS,,,,,,,,,,,,,CENT,Moderate


In [43]:
merged_3 = merged_3.dropna(subset=['Total Generation Rank'])
merged_3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 336 entries, 1 to 337
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                336 non-null    object 
 1   Total Generation Rank                336 non-null    object 
 2   Electric Generation Rank             336 non-null    object 
 3   Commercial Generation Rank           336 non-null    object 
 4   Capacity Growth Status               336 non-null    object 
 5   CAIDI w/ Major Events                336 non-null    float64
 6   CAIDI w/o Major Events               336 non-null    float64
 7   SAIFI Major Events                   336 non-null    object 
 8   BA Code                              336 non-null    object 
 9   Utility Annual Energy Savings (MWh)  336 non-null    object 
 10  Utility Peak Demand Savings (MW)     336 non-null    object 
 11  Utility Annual Incentive Cost ($)    

In [44]:
merged_3['Region'].value_counts()

Region
MIDW    78
NW      49
CENT    31
CAL     26
MIDA    24
NE      21
SE      19
SW      14
FLA     14
TEX     14
CAR     13
NY      10
Name: count, dtype: int64

In [45]:
merged_3[merged_3['Region'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand
37,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Low,Moderate,Moderate,High,,
38,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,High,,
39,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Low,,
40,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
41,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,Moderate,,
42,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
43,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,PSCO,Moderate,Moderate,Moderate,Moderate,,
44,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Low,Low,Low,Moderate,,
45,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,Moderate,,
46,CO,Moderate,Moderate,Low,Shrinking,108.3,102.2,Low,WACM,Moderate,Moderate,Moderate,High,,


In [46]:
# If State is CO or MT then Region is NW
merged_3.loc[(merged_3['State'] == 'CO') | (merged_3['State'] == 'MT'), 'Region'] = 'NW'

# If State is KY then region is CENT
merged_3.loc[merged_3['State'] == 'KY', 'Region'] = 'CENT'

# If State is TN then region is SE
merged_3.loc[merged_3['State'] == 'TN', 'Region'] = 'SE'


In [47]:
merged_3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 336 entries, 1 to 337
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                336 non-null    object 
 1   Total Generation Rank                336 non-null    object 
 2   Electric Generation Rank             336 non-null    object 
 3   Commercial Generation Rank           336 non-null    object 
 4   Capacity Growth Status               336 non-null    object 
 5   CAIDI w/ Major Events                336 non-null    float64
 6   CAIDI w/o Major Events               336 non-null    float64
 7   SAIFI Major Events                   336 non-null    object 
 8   BA Code                              336 non-null    object 
 9   Utility Annual Energy Savings (MWh)  336 non-null    object 
 10  Utility Peak Demand Savings (MW)     336 non-null    object 
 11  Utility Annual Incentive Cost ($)    

In [48]:
ixp = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_IXP.csv', index_col=0)
ixp.head()

Unnamed: 0,IXP Name,City,State
0,48 IX,Phoenix,AZ
1,ABQIX,Albuquerque,NM
2,AlaskaIX,Anchorage,AK
3,Amateur Radio Internet Exchange - ARIX,Fremont,CA
4,Amateur Radio Internet Exchange - ARIX,Portland,OR


In [49]:
ixp.isna().value_counts()

IXP Name  City   State
False     False  False    196
                 True       3
Name: count, dtype: int64

In [50]:
ixp[ixp['State'].isna()]

Unnamed: 0,IXP Name,City,State
21,BroadBand Internet Exchange US-West - BBIX US-...,West Coast,
32,CoreSite - Any2East,Washington DC,
74,Equinix Internet Exchange Seattle - Equinix Se...,Seattle,


In [51]:
ixp.loc[(ixp['City'] == 'Seattle') & (ixp['State'] == 'Washington'), 'City'] = 'Washington'

In [52]:
# Fix Seattle, Washington data
ixp.loc[ixp['City'] == 'Seattle', 'State'] = 'WA'

# Fix Washington DC data
ixp.loc[ixp['City'] == 'Washington DC', ['City', 'State']] = ['Washington', 'DC']

#Drop rows where City is West Coast
ixp = ixp[ixp['City'] != 'West Coast']

In [53]:
ixp[ixp['City'] == 'Seattle']

Unnamed: 0,IXP Name,City,State
74,Equinix Internet Exchange Seattle - Equinix Se...,Seattle,WA
123,Megaport MegaIX Seattle,Seattle,WA
132,Moe Internet Exchange - Seattle - MoeIX SEA,Seattle,WA
160,Pacific Wave Exchange in LA,Seattle,WA
171,Seattle Internet Exchange (MTU 1500) - SIX Sea...,Seattle,WA
172,Seattle Internet Exchange (MTU 9000) - SIX Sea...,Seattle,WA


In [54]:
ixp[ixp['City'] == 'Washington']

Unnamed: 0,IXP Name,City,State
32,CoreSite - Any2East,Washington,DC
85,Global Peer Exchange,Washington,DC


In [55]:
len(ixp['State'].value_counts())

42

In [56]:
len(ixp['City'].value_counts())

92

In [57]:
ixp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 0 to 198
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   IXP Name  198 non-null    object
 1   City      198 non-null    object
 2   State     198 non-null    object
dtypes: object(3)
memory usage: 6.2+ KB


In [58]:
# Merge ixp data
merged_4 = pd.merge(merged_3, ixp, on='State', how='outer')

merged_4.isna().value_counts()

State  Total Generation Rank  Electric Generation Rank  Commercial Generation Rank  Capacity Growth Status  CAIDI w/ Major Events  CAIDI w/o Major Events  SAIFI Major Events  BA Code  Utility Annual Energy Savings (MWh)  Utility Peak Demand Savings (MW)  Utility Annual Incentive Cost ($)  Utility Commercial Avg Life (Yrs)  Region  Regional Electricity Demand  IXP Name  City 
False  False                  False                     False                       False                   False                  False                   False               False    False                                False                             False                              False                              False   False                        False     False    2169
                                                                                                                                                                                                                                            

In [59]:
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                2291 non-null   object 
 1   Total Generation Rank                2287 non-null   object 
 2   Electric Generation Rank             2287 non-null   object 
 3   Commercial Generation Rank           2287 non-null   object 
 4   Capacity Growth Status               2287 non-null   object 
 5   CAIDI w/ Major Events                2287 non-null   float64
 6   CAIDI w/o Major Events               2287 non-null   float64
 7   SAIFI Major Events                   2287 non-null   object 
 8   BA Code                              2287 non-null   object 
 9   Utility Annual Energy Savings (MWh)  2287 non-null   object 
 10  Utility Peak Demand Savings (MW)     2287 non-null   object 
 11  Utility Annual Incentive Cost 

In [60]:
merged_4[merged_4['City'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,City
5,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,MISO,High,High,High,High,SE,Moderate,,
6,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,Moderate,Moderate,Moderate,SE,Moderate,,
7,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,High,Moderate,High,SE,Moderate,,
903,DE,Unknown,Unknown,Unknown,Unknown,130.3,87.3,Low,PJM,Low,High,Moderate,Low,MIDA,High,,
904,DE,Unknown,Unknown,Unknown,Unknown,130.3,87.3,Low,PJM,Low,Low,Low,High,MIDA,High,,
1129,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,BPAT,Low,Low,Low,Low,NW,Moderate,,
1130,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,PACE,Moderate,Moderate,Moderate,Moderate,NW,Moderate,,
1131,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,IPCO,High,High,High,Moderate,NW,Moderate,,
1132,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,AVA,Low,Low,Low,Moderate,NW,Moderate,,
1133,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,PACE,Moderate,Moderate,Moderate,Moderate,NW,Moderate,,


In [61]:
merged_4['IXP Name'] = merged_4['IXP Name'].fillna('No IXP')

In [62]:
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                2291 non-null   object 
 1   Total Generation Rank                2287 non-null   object 
 2   Electric Generation Rank             2287 non-null   object 
 3   Commercial Generation Rank           2287 non-null   object 
 4   Capacity Growth Status               2287 non-null   object 
 5   CAIDI w/ Major Events                2287 non-null   float64
 6   CAIDI w/o Major Events               2287 non-null   float64
 7   SAIFI Major Events                   2287 non-null   object 
 8   BA Code                              2287 non-null   object 
 9   Utility Annual Energy Savings (MWh)  2287 non-null   object 
 10  Utility Peak Demand Savings (MW)     2287 non-null   object 
 11  Utility Annual Incentive Cost 

In [63]:
merged_4[merged_4['Total Generation Rank'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,City
0,AK,,,,,,,,,,,,,,,AlaskaIX,Anchorage
1198,KS,,,,,,,,,,,,,,,BGP.Exchange - Kansas,Kansas City
2099,Texas,,,,,,,,,,,,,,,DartNode IXP - DartNode IXP - Houston,Houston
2287,WV,,,,,,,,,,,,,,,West Virginia Internet Exchange,South Charleston


In [64]:
# Drop rows where Total Generation Rank is NaN
merged_4 = merged_4.dropna(subset=['Total Generation Rank'])
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2287 entries, 1 to 2290
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                2287 non-null   object 
 1   Total Generation Rank                2287 non-null   object 
 2   Electric Generation Rank             2287 non-null   object 
 3   Commercial Generation Rank           2287 non-null   object 
 4   Capacity Growth Status               2287 non-null   object 
 5   CAIDI w/ Major Events                2287 non-null   float64
 6   CAIDI w/o Major Events               2287 non-null   float64
 7   SAIFI Major Events                   2287 non-null   object 
 8   BA Code                              2287 non-null   object 
 9   Utility Annual Energy Savings (MWh)  2287 non-null   object 
 10  Utility Peak Demand Savings (MW)     2287 non-null   object 
 11  Utility Annual Incentive Cost ($)  

In [65]:
merged_4.loc[merged_4['State'] == 'Texas', 'State'] = 'TX'

merged_4['State'].value_counts()

State
CA    780
TX    294
FL    140
WA    119
NY    110
OR    105
CO     65
MO     64
MA     64
MI     57
IL     55
IA     48
MN     42
AZ     40
VA     39
NC     32
GA     30
OH     30
PA     24
NE     14
WI     14
MD     10
UT      9
NV      8
IN      8
CT      8
VT      6
SD      6
ID      6
HI      6
NJ      5
SC      5
KY      5
NH      4
NM      4
MS      4
LA      4
AL      4
WY      3
AR      3
RI      2
OK      2
DE      2
DC      2
MT      2
TN      1
ND      1
ME      1
Name: count, dtype: int64

In [66]:
merged_4[merged_4['City'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,City
5,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,MISO,High,High,High,High,SE,Moderate,No IXP,
6,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,Moderate,Moderate,Moderate,SE,Moderate,No IXP,
7,AR,Unknown,Unknown,Unknown,Unknown,433.4,144.0,High,SWPP,Moderate,High,Moderate,High,SE,Moderate,No IXP,
903,DE,Unknown,Unknown,Unknown,Unknown,130.3,87.3,Low,PJM,Low,High,Moderate,Low,MIDA,High,No IXP,
904,DE,Unknown,Unknown,Unknown,Unknown,130.3,87.3,Low,PJM,Low,Low,Low,High,MIDA,High,No IXP,
1129,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,BPAT,Low,Low,Low,Low,NW,Moderate,No IXP,
1130,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,PACE,Moderate,Moderate,Moderate,Moderate,NW,Moderate,No IXP,
1131,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,IPCO,High,High,High,Moderate,NW,Moderate,No IXP,
1132,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,AVA,Low,Low,Low,Moderate,NW,Moderate,No IXP,
1133,ID,Low,Moderate,Low,Shrinking,128.4,121.2,Moderate,PACE,Moderate,Moderate,Moderate,Moderate,NW,Moderate,No IXP,


In [67]:
len(merged_4[merged_4['City'].isna()])

39

In [68]:
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2287 entries, 1 to 2290
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                2287 non-null   object 
 1   Total Generation Rank                2287 non-null   object 
 2   Electric Generation Rank             2287 non-null   object 
 3   Commercial Generation Rank           2287 non-null   object 
 4   Capacity Growth Status               2287 non-null   object 
 5   CAIDI w/ Major Events                2287 non-null   float64
 6   CAIDI w/o Major Events               2287 non-null   float64
 7   SAIFI Major Events                   2287 non-null   object 
 8   BA Code                              2287 non-null   object 
 9   Utility Annual Energy Savings (MWh)  2287 non-null   object 
 10  Utility Peak Demand Savings (MW)     2287 non-null   object 
 11  Utility Annual Incentive Cost ($)  

In [69]:
# Replace 'No IXP' with NaN to exclude it from the count
merged_4['IXP Name'] = merged_4['IXP Name'].replace('No IXP', np.nan)

# Group by 'State' and count unique 'IXP Name' values, then reset index
ixp_counts = merged_4.groupby('State')['IXP Name'].nunique().reset_index()

# Rename the column to 'IXP_Count'
ixp_counts = ixp_counts.rename(columns={'IXP Name': 'IXP_Count'})

# Merge the counts back into the original DataFrame
merged_4 = merged_4.merge(ixp_counts, on='State', how='left')

# Fill NaN values in 'IXP Name' back with 'No IXP'
merged_4['IXP Name'] = merged_4['IXP Name'].fillna('No IXP')

In [70]:
merged_4['IXP_Count'].value_counts()

IXP_Count
27    780
18    294
7     224
11    204
4     152
10    140
5     125
8     104
3      96
2      72
1      57
0      39
Name: count, dtype: int64

In [71]:
merged_4[merged_4['Region'].isna()]

Unnamed: 0,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,Utility Annual Energy Savings (MWh),Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,City,IXP_Count
1074,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,Moderate,Moderate,Moderate,Moderate,,,DRFortress Exchange - DRF IX,Honolulu,3
1075,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,Moderate,Moderate,Moderate,Moderate,,,Hawai`i Internet Exchange - HIX,Honolulu,3
1076,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,Moderate,Moderate,Moderate,Moderate,,,Hawaii Pacific Teleport IX-1 - HPTI-IX,Honolulu,3
1077,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,High,Moderate,High,Moderate,,,DRFortress Exchange - DRF IX,Honolulu,3
1078,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,High,Moderate,High,Moderate,,,Hawai`i Internet Exchange - HIX,Honolulu,3
1079,HI,Low,Low,High,Growing,275.1,106.3,Moderate,Unknown,High,Moderate,High,Moderate,,,Hawaii Pacific Teleport IX-1 - HPTI-IX,Honolulu,3


In [72]:
merged_4['Region'] = merged_4['Region'].fillna('Independent')
merged_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2287 entries, 0 to 2286
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                2287 non-null   object 
 1   Total Generation Rank                2287 non-null   object 
 2   Electric Generation Rank             2287 non-null   object 
 3   Commercial Generation Rank           2287 non-null   object 
 4   Capacity Growth Status               2287 non-null   object 
 5   CAIDI w/ Major Events                2287 non-null   float64
 6   CAIDI w/o Major Events               2287 non-null   float64
 7   SAIFI Major Events                   2287 non-null   object 
 8   BA Code                              2287 non-null   object 
 9   Utility Annual Energy Savings (MWh)  2287 non-null   object 
 10  Utility Peak Demand Savings (MW)     2287 non-null   object 
 11  Utility Annual Incentive Cost 

In [73]:
len(merged_4[merged_4['City'].isna() == True])

39

###  Leaving the missing city data for now

In [74]:
# Move City column to beginning of dataframe

cols = merged_4.columns.tolist()

# Remove 'City' from current position and add to beginning
cols.remove('City')
cols = ['City'] + cols

# Reorder the DataFrame
merged_4 = merged_4[cols]

# Verify the new column order
print(merged_4.columns)

Index(['City', 'State', 'Total Generation Rank', 'Electric Generation Rank',
       'Commercial Generation Rank', 'Capacity Growth Status',
       'CAIDI w/ Major Events', 'CAIDI w/o Major Events', 'SAIFI Major Events',
       'BA Code', 'Utility Annual Energy Savings (MWh)',
       'Utility Peak Demand Savings (MW)', 'Utility Annual Incentive Cost ($)',
       'Utility Commercial Avg Life (Yrs)', 'Region',
       'Regional Electricity Demand', 'IXP Name', 'IXP_Count'],
      dtype='object')


### Loading Retail Costs

In [75]:
retail_costs = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Retail_Costs.csv', index_col=0)
retail_costs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1218 entries, 0 to 1238
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   NAME            1218 non-null   object
 1   CITY            1218 non-null   object
 2   STATE           1218 non-null   object
 3   Total_MWh_Rank  1218 non-null   object
 4   Purchased_Rank  1218 non-null   object
dtypes: object(5)
memory usage: 57.1+ KB


In [76]:
retail_costs.head()

Unnamed: 0,NAME,CITY,STATE,Total_MWh_Rank,Purchased_Rank
0,EVERGY METRO,KANSAS CITY,KS,High,High
1,"EVERGY KANSAS SOUTH, INC",TOPEKA,KS,High,High
2,KARNES ELECTRIC COOP INC,KARNES CITY,TX,High,High
3,KAY ELECTRIC COOP,BLACKWELL,OK,Moderate,Moderate
4,FREESTATE ELECTRIC COOP,MCLOUTH,KS,Low,Low


In [77]:
retail_costs = retail_costs.rename(columns={'STATE': 'State', 'NAME': 'Name', 'CITY':'City',
                                'Purchased_Rank': 'Purchased Rank', 'Total_MWh_Rank': 'Total MWh Rank'})
retail_costs['City'].value_counts()

City
COLUMBUS          12
JACKSON            6
AKRON              6
MADISON            5
LEBANON            5
                  ..
MONTE VISTA        1
KINGSTREE          1
SAN DIEGO          1
MOUNT PLEASANT     1
COLDWATER          1
Name: count, Length: 994, dtype: int64

In [78]:
# Check number of states in retail
len(retail_costs['State'].value_counts())

52

In [79]:
retail_costs['State'].value_counts()

# Drop US territories
retail_costs = retail_costs[~retail_costs['State'].isin(['PR', 'GU', 'VI', 'AS', 'CNMI'])]
retail_costs['State'].value_counts()

State
TN    82
TX    75
WI    64
GA    55
NC    47
OH    47
AL    43
IN    43
MN    41
KY    41
MO    41
MS    40
FL    37
WA    37
CA    30
CO    29
OK    29
SC    28
MI    28
AR    26
IL    22
KS    21
LA    21
MA    21
IA    21
NE    20
OR    20
PA    20
AZ    17
VA    17
NY    16
ND    15
SD    15
NM    14
UT    12
ID     9
NV     9
AK     7
MT     7
DE     6
WY     6
CT     6
MD     5
ME     5
NH     5
HI     4
NJ     4
VT     4
RI     3
DC     1
WV     1
Name: count, dtype: int64

In [80]:
len(retail_costs['State'].value_counts())

51

In [81]:
# Check number of states in merged 4

len(merged_4['State'].value_counts())

48

In [82]:
#Lowercase City column
retail_costs['City'] = retail_costs['City'].str.title()
retail_costs['City'].value_counts()

City
Columbus          12
Jackson            6
Akron              6
Madison            5
Lebanon            5
                  ..
Kingstree          1
San Diego          1
Mount Pleasant     1
Livingston         1
Coldwater          1
Name: count, Length: 993, dtype: int64

In [83]:
retail_costs

Unnamed: 0,Name,City,State,Total MWh Rank,Purchased Rank
0,EVERGY METRO,Kansas City,KS,High,High
1,"EVERGY KANSAS SOUTH, INC",Topeka,KS,High,High
2,KARNES ELECTRIC COOP INC,Karnes City,TX,High,High
3,KAY ELECTRIC COOP,Blackwell,OK,Moderate,Moderate
4,FREESTATE ELECTRIC COOP,Mclouth,KS,Low,Low
...,...,...,...,...,...
1234,CLYDE LIGHT & POWER,Clyde,OH,Low,Low
1235,CITY OF FLORESVILLE,Floresville,TX,Moderate,Moderate
1236,CITY OF NATCHITOCHES,Natchitoches,LA,Low,Low
1237,SHAWANO MUNICIPAL UTILITIES,Shawano,WI,Low,Low


In [84]:
# Merge on both State and City
merged_5 = pd.merge(
    merged_4,
    retail_costs,
    on=['State', 'City'],
    how='outer'
)

merged_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3847 entries, 0 to 3846
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 3808 non-null   object 
 1   State                                3847 non-null   object 
 2   Total Generation Rank                2709 non-null   object 
 3   Electric Generation Rank             2709 non-null   object 
 4   Commercial Generation Rank           2709 non-null   object 
 5   Capacity Growth Status               2709 non-null   object 
 6   CAIDI w/ Major Events                2709 non-null   float64
 7   CAIDI w/o Major Events               2709 non-null   float64
 8   SAIFI Major Events                   2709 non-null   object 
 9   BA Code                              2709 non-null   object 
 10  Utility Annual Energy Savings (MWh)  2709 non-null   object 
 11  Utility Peak Demand Savings (M

In [85]:
merged_5 = merged_5.dropna(subset=['Total Generation Rank'])
merged_5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2709 entries, 12 to 3846
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 2670 non-null   object 
 1   State                                2709 non-null   object 
 2   Total Generation Rank                2709 non-null   object 
 3   Electric Generation Rank             2709 non-null   object 
 4   Commercial Generation Rank           2709 non-null   object 
 5   Capacity Growth Status               2709 non-null   object 
 6   CAIDI w/ Major Events                2709 non-null   float64
 7   CAIDI w/o Major Events               2709 non-null   float64
 8   SAIFI Major Events                   2709 non-null   object 
 9   BA Code                              2709 non-null   object 
 10  Utility Annual Energy Savings (MWh)  2709 non-null   object 
 11  Utility Peak Demand Savings (MW)  

In [86]:
merged_5 = merged_5.dropna(subset=['City'])
merged_5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2670 entries, 12 to 3805
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 2670 non-null   object 
 1   State                                2670 non-null   object 
 2   Total Generation Rank                2670 non-null   object 
 3   Electric Generation Rank             2670 non-null   object 
 4   Commercial Generation Rank           2670 non-null   object 
 5   Capacity Growth Status               2670 non-null   object 
 6   CAIDI w/ Major Events                2670 non-null   float64
 7   CAIDI w/o Major Events               2670 non-null   float64
 8   SAIFI Major Events                   2670 non-null   object 
 9   BA Code                              2670 non-null   object 
 10  Utility Annual Energy Savings (MWh)  2670 non-null   object 
 11  Utility Peak Demand Savings (MW)  

In [87]:
merged_5[merged_5['Total MWh Rank'].isna()]

Unnamed: 0,City,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,...,Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,IXP_Count,Name,Total MWh Rank,Purchased Rank
12,Auburn,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,...,High,Moderate,Low,SE,Moderate,Ninja-IX Auburn,2.0,,,
13,Auburn,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,...,Moderate,Moderate,Moderate,SE,Moderate,Ninja-IX Auburn,2.0,,,
34,Montgomery,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,...,High,Moderate,Low,SE,Moderate,Montgomery Internet Exchange - MGMix Montgomery,2.0,,,
35,Montgomery,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,...,Moderate,Moderate,Moderate,SE,Moderate,Montgomery Internet Exchange - MGMix Montgomery,2.0,,,
86,Flagstaff,AZ,High,High,Moderate,Growing,108.6,84.8,Low,AZPS,...,High,High,Moderate,SW,Low,Amateur Radio Internet Exchange - ARIX,8.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3551,Reston,VA,High,High,High,Growing,151.3,124.2,Moderate,PJM,...,High,High,Moderate,MIDA,High,LINX Northern Virginia,11.0,,,
3565,Montpelier,VT,Low,Low,Low,Shrinking,316.5,140.0,High,ISNE,...,Moderate,Moderate,High,NE,Low,Lightboard Burlington IX (Vermont),1.0,,,
3566,Montpelier,VT,Low,Low,Low,Shrinking,316.5,140.0,High,ISNE,...,Moderate,High,Moderate,NE,Low,Lightboard Burlington IX (Vermont),1.0,,,
3567,White River Junction,VT,Low,Low,Low,Shrinking,316.5,140.0,High,ISNE,...,Moderate,Moderate,High,NE,Low,Lightboard Burlington IX (Vermont),1.0,,,


In [88]:
merged_5.isna().sum()

City                                     0
State                                    0
Total Generation Rank                    0
Electric Generation Rank                 0
Commercial Generation Rank               0
Capacity Growth Status                   0
CAIDI w/ Major Events                    0
CAIDI w/o Major Events                   0
SAIFI Major Events                       0
BA Code                                  0
Utility Annual Energy Savings (MWh)      0
Utility Peak Demand Savings (MW)         0
Utility Annual Incentive Cost ($)        0
Utility Commercial Avg Life (Yrs)        0
Region                                   0
Regional Electricity Demand             79
IXP Name                                 0
IXP_Count                                0
Name                                   954
Total MWh Rank                         954
Purchased Rank                         954
dtype: int64

## Loading in Emissions Data

In [89]:
emissions = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Emissions.csv', index_col=0)

emissions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7511 entries, 0 to 7510
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REPORTING YEAR                   7511 non-null   int64  
 1   FACILITY NAME                    7511 non-null   object 
 2   LATITUDE                         7511 non-null   float64
 3   LONGITUDE                        7511 non-null   float64
 4   CITY                             7511 non-null   object 
 5   COUNTY                           7511 non-null   object 
 6   STATE                            7511 non-null   object 
 7   ZIP CODE                         7511 non-null   int64  
 8   PARENT COMPANIES                 7511 non-null   object 
 9   GHG QUANTITY (METRIC TONS CO2e)  7511 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 645.5+ KB


In [90]:
emissions.head()

Unnamed: 0,REPORTING YEAR,FACILITY NAME,LATITUDE,LONGITUDE,CITY,COUNTY,STATE,ZIP CODE,PARENT COMPANIES,GHG QUANTITY (METRIC TONS CO2e)
0,2023,(430) Civitas Resources - Permian Basin,39.74522,-104.989197,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,844548
1,2023,(540) Civitas Resources - Denver Basin,39.74431,-104.98858,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,887487
2,2023,(540) Civitas Resources - GB - Denver Basin,39.745822,-104.989243,Denver,Unknown,CO,80202,CIVITAS RESOURCES INC,110747
3,2023,121 REGIONAL DISPOSAL FACILITY,33.29857,-96.53586,MELISSA,COLLIN COUNTY,TX,75454,NORTH TEXAS MUNICIPAL WATER DISTRICT,288302
4,2023,15-18565/15-18662,37.274127,-83.239034,Hazard,PERRY COUNTY,KY,40701,CAMBRIAN COAL LLC,122327


In [91]:
# Standardize column names
emissions = emissions.rename(columns={'LATITUDE': 'Latitude',
                                      'LONGITUDE': 'Longitude',
                                      'CITY':'City',
                                      'COUNTY':'County',
                                      'STATE':'State',
                                      'ZIP CODE':'Zip Code',
                                      'GHG QUANTITY (METRIC TONS CO2e)':'GHG (MTCO2)'})


#Drop unnecessary columns
emissions =emissions.drop(columns=['REPORTING YEAR', 'FACILITY NAME', 'PARENT COMPANIES'])


In [92]:
emissions.head()

Unnamed: 0,Latitude,Longitude,City,County,State,Zip Code,GHG (MTCO2)
0,39.74522,-104.989197,Denver,Unknown,CO,80202,844548
1,39.74431,-104.98858,Denver,Unknown,CO,80202,887487
2,39.745822,-104.989243,Denver,Unknown,CO,80202,110747
3,33.29857,-96.53586,MELISSA,COLLIN COUNTY,TX,75454,288302
4,37.274127,-83.239034,Hazard,PERRY COUNTY,KY,40701,122327


In [93]:
# Standardize  City and County values as title case
emissions['City'] = emissions['City'].str.title()
emissions['County'] = emissions['County'].str.title()
emissions.head()

Unnamed: 0,Latitude,Longitude,City,County,State,Zip Code,GHG (MTCO2)
0,39.74522,-104.989197,Denver,Unknown,CO,80202,844548
1,39.74431,-104.98858,Denver,Unknown,CO,80202,887487
2,39.745822,-104.989243,Denver,Unknown,CO,80202,110747
3,33.29857,-96.53586,Melissa,Collin County,TX,75454,288302
4,37.274127,-83.239034,Hazard,Perry County,KY,40701,122327


In [94]:
emissions.isna().value_counts()

Latitude  Longitude  City   County  State  Zip Code  GHG (MTCO2)
False     False      False  False   False  False     False          7511
Name: count, dtype: int64

In [95]:
# Group by city and state and take the sum of GHG emissions to create a new column called City Emissions
emissions['City Emissions'] = emissions.groupby(['State', 'City'])['GHG (MTCO2)'].transform('sum')

# Group by state and take the sum of GHG emissions to create a new column called State Emissions
emissions['State Emissions'] = emissions.groupby(['State'])['GHG (MTCO2)'].transform('sum')
emissions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7511 entries, 0 to 7510
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Latitude         7511 non-null   float64
 1   Longitude        7511 non-null   float64
 2   City             7511 non-null   object 
 3   County           7511 non-null   object 
 4   State            7511 non-null   object 
 5   Zip Code         7511 non-null   int64  
 6   GHG (MTCO2)      7511 non-null   int64  
 7   City Emissions   7511 non-null   int64  
 8   State Emissions  7511 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 586.8+ KB


In [96]:
# Drop Longitude and Latitude columns since we do not need facility locations
emissions = emissions.drop(columns=['Longitude', 'Latitude'])
emissions.head()

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions
0,Denver,Unknown,CO,80202,844548,21738141,62456744
1,Denver,Unknown,CO,80202,887487,21738141,62456744
2,Denver,Unknown,CO,80202,110747,21738141,62456744
3,Melissa,Collin County,TX,75454,288302,288302,485547330
4,Hazard,Perry County,KY,40701,122327,164154,65132445


In [97]:
# Group by county and state and take the sum of GHG emissions to create a County Emissions columns
emissions['County Emissions'] = emissions.groupby(['State', 'County'])['GHG (MTCO2)'].transform('sum')
emissions.head()

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions,County Emissions
0,Denver,Unknown,CO,80202,844548,21738141,62456744,12704001
1,Denver,Unknown,CO,80202,887487,21738141,62456744,12704001
2,Denver,Unknown,CO,80202,110747,21738141,62456744,12704001
3,Melissa,Collin County,TX,75454,288302,288302,485547330,1381024
4,Hazard,Perry County,KY,40701,122327,164154,65132445,164154


In [98]:
emissions[emissions['Zip Code'] == 0]

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions,County Emissions
63,Offshore,Unknown,TX,0,180279,349404,485547330,66853798
504,Offshore,Unknown,LA,0,134118,5733285,145975918,7986796
539,Offshore,Unknown,LA,0,256468,5733285,145975918,7986796
623,Offshore,Unknown,CA,0,35125,35125,99305182,6791172
893,Offshore,Unknown,TX,0,38935,349404,485547330,66853798
...,...,...,...,...,...,...,...,...
7268,Offshore,Unknown,LA,0,194726,5733285,145975918,7986796
7269,Offshore,Unknown,LA,0,53572,5733285,145975918,7986796
7322,Offshore,Unknown,LA,0,12452,5733285,145975918,7986796
7323,Offshore,Unknown,LA,0,8883,5733285,145975918,7986796


In [99]:
# Drop rows where Zip Code is 0
emissions = emissions[emissions['Zip Code'] != 0]

In [100]:
emissions[emissions['County'] == 'Unknown'].head(50)

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions,County Emissions
0,Denver,Unknown,CO,80202,844548,21738141,62456744,12704001
1,Denver,Unknown,CO,80202,887487,21738141,62456744,12704001
2,Denver,Unknown,CO,80202,110747,21738141,62456744,12704001
6,Denver,Unknown,CO,80203,27277,21738141,62456744,12704001
7,Denver,Unknown,CO,80203,27277,21738141,62456744,12704001
9,Oklahoma City,Unknown,OK,73102,1166132,13403543,70166208,11908297
11,Houston,Unknown,TX,77079,13382,57927547,485547330,66853798
16,Oklahoma City,Unknown,OK,73142,182531,13403543,70166208,11908297
18,Oklahoma City,Unknown,OK,73102,310278,13403543,70166208,11908297
19,Denver,Unknown,CO,80202,125092,21738141,62456744,12704001


In [101]:
pip install pgeocode

Note: you may need to restart the kernel to use updated packages.


In [102]:
# Map the zip codes to their counties
import pgeocode    

In [103]:
# Convert zip codes to strings first and create the mapping
emissions['Zip Code'] = emissions['Zip Code'].astype(str)

# Create a clean mapping dictionary
zip_to_county = {}
nomi = pgeocode.Nominatim('us')

# Loop through unique zip codes only
for zip_code in emissions['Zip Code'].unique():
    try:
        location = nomi.query_postal_code(zip_code)
        if location is not None and not pd.isna(location['county_name']):
            county = location['county_name'] + ' County'
            zip_to_county[zip_code] = county
    except:
        continue

# Apply the mapping with a safe approach
emissions.loc[emissions['County'] == 'Unknown', 'County'] = \
    emissions.loc[emissions['County'] == 'Unknown', 'Zip Code'].map(zip_to_county).fillna('Unknown')

In [104]:
emissions[emissions['County'].isna()]

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions,County Emissions


In [105]:
emissions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7409 entries, 0 to 7510
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   City              7409 non-null   object
 1   County            7409 non-null   object
 2   State             7409 non-null   object
 3   Zip Code          7409 non-null   object
 4   GHG (MTCO2)       7409 non-null   int64 
 5   City Emissions    7409 non-null   int64 
 6   State Emissions   7409 non-null   int64 
 7   County Emissions  7409 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 520.9+ KB


In [106]:
# Drop VI, Guam from dataframe
emissions = emissions[~emissions['State'].isin(['VI', 'GU'])]


emissions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7395 entries, 0 to 7510
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   City              7395 non-null   object
 1   County            7395 non-null   object
 2   State             7395 non-null   object
 3   Zip Code          7395 non-null   object
 4   GHG (MTCO2)       7395 non-null   int64 
 5   City Emissions    7395 non-null   int64 
 6   State Emissions   7395 non-null   int64 
 7   County Emissions  7395 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 520.0+ KB


In [107]:
# Count duplicates of City State combinations
emissions.groupby(['State', 'City', 'County']).size().sort_values(ascending=False).head(30)


State  City            County                 
TX     Houston         Harris County              231
       Dallas          Dallas County               87
CO     Denver          Denver County               79
OK     Oklahoma City   Oklahoma County             56
TX     Midland         Midland County              39
OK     Tulsa           Tulsa County                38
CA     Bakersfield     Kern County                 29
TX     Fort Worth      Tarrant County              27
       Pasadena        Harris County               24
       Austin          Travis County               24
KY     Louisville      Jefferson County            20
TX     Plano           Collin County               17
       San Antonio     Bexar County                16
LA     Geismar         Ascension Parish            16
NV     Las Vegas       Clark County                15
AK     Prudhoe Bay     North Slope Borough         14
OK     Tulsa           Wagoner County              14
LA     Baton Rouge     East Baton R

In [108]:
# Count duplicates of County State combinations
emissions.groupby(['State', 'County']).size().sort_values(ascending=False).head(30)

State  County               
TX     Harris County            293
       Dallas County            102
CO     Denver County             80
OK     Oklahoma County           67
CA     Kern County               55
       Los Angeles County        52
TX     Midland County            42
OK     Tulsa County              41
TX     Tarrant County            32
CO     Weld County               30
TX     Jefferson County          29
LA     Calcasieu Parish          27
MI     Wayne County              26
TX     Travis County             25
       Collin County             25
CA     San Bernardino County     24
NV     Clark County              24
PA     Allegheny County          24
CA     Contra Costa County       23
IL     Cook County               22
AL     Mobile County             22
KY     Jefferson County          20
TX     Nueces County             20
AZ     Maricopa County           20
TX     Bexar County              20
       Brazoria County           20
LA     Ascension Parish          19

In [109]:
# Function to standardize county names
def standardize_county_name(county):
    # Remove any trailing/leading whitespace
    county = county.strip()
    
    # If it already ends with 'County', return as is
    if county.endswith('County'):
        return county
    # If it ends with 'Parish', return as is (for Louisiana)
    elif county.endswith('Parish'):
        return county
    # If it's not 'Unknown', append 'County'
    elif county != 'Unknown':
        return f"{county} County"
    return county

# Apply the standardization
emissions['County'] = emissions['County'].apply(standardize_county_name)

# Verify the results
print("After standardization:")
print(emissions.groupby(['State', 'County']).size().sort_values(ascending=False).head(30))

After standardization:
State  County               
TX     Harris County            302
       Dallas County            104
CO     Denver County             81
OK     Oklahoma County           69
CA     Los Angeles County        65
       Kern County               58
OK     Tulsa County              44
TX     Midland County            42
CO     Weld County               34
TX     Tarrant County            33
NV     Clark County              31
TX     Jefferson County          30
AZ     Maricopa County           30
MI     Wayne County              30
PA     Allegheny County          29
TX     Travis County             27
LA     Calcasieu Parish          27
CA     San Bernardino County     27
TX     Collin County             26
CA     Contra Costa County       26
AL     Mobile County             25
TX     Bexar County              24
IL     Cook County               24
TX     Nueces County             23
NM     Lea County                22
CA     San Diego County          21
TX     Brazo

In [110]:
# Count duplicates of City County State combinations
emissions.groupby(['State', 'City', 'County']).size().sort_values(ascending=False).head(30)

State  City            County                    
TX     Houston         Harris County                 234
       Dallas          Dallas County                  88
CO     Denver          Denver County                  80
OK     Oklahoma City   Oklahoma County                56
TX     Midland         Midland County                 39
OK     Tulsa           Tulsa County                   39
CA     Bakersfield     Kern County                    29
TX     Fort Worth      Tarrant County                 28
       Pasadena        Harris County                  27
       Austin          Travis County                  26
KY     Louisville      Jefferson County               20
NV     Las Vegas       Clark County                   19
TX     San Antonio     Bexar County                   19
       Plano           Collin County                  17
       Corpus Christi  Nueces County                  17
LA     Geismar         Ascension Parish               16
AK     Prudhoe Bay     North Slope Bor

In [111]:
emissions.describe()

Unnamed: 0,GHG (MTCO2),City Emissions,State Emissions,County Emissions
count,7395.0,7395.0,7395.0,7395.0
mean,347359.9,3826918.0,137598300.0,6412769.0
std,977084.9,10690360.0,163957400.0,15798550.0
min,1.0,1.0,279350.0,2.0
25%,31822.0,75714.0,38090040.0,186294.0
50%,66124.0,298714.0,69044090.0,684000.0
75%,186818.5,2024348.0,109124900.0,3205540.0
max,16558380.0,57927550.0,485547300.0,66853800.0


In [112]:
# Bin GHG quantity based on distribution of city emissions
def categorize_city_ghg(value):
    if value <= 75714:                        # 0-25th percentile
        return 'Low'
    elif 75714 < value <= 298714:             # 25th-50th percentile
        return 'Moderate'
    elif 298714 < value <= 2024348:           # 50th-75th percentile
        return 'High'            
    else:                                     # Above 75th percentile
        return 'Very High'

# Bin GHG quantity based on distribution of county emissions
def categorize_county_ghg(value):
    if value <= 186294:                       # 0-25th percentile
        return 'Low'
    elif 186294 < value <= 684000:            # 25th-50th percentile
        return 'Moderate'
    elif 684000 < value <= 3205540:           # 50th-75th percentile
        return 'High'
    else:                                     # Above 75th percentile
        return 'Very High'

# Bin GHG quantity based on distribution of state emissions
def categorize_state_ghg(value):
    if value <= 38090040:                     # 0-25th percentile
        return 'Low'
    elif 38090040 < value <= 69044090:        # 25th-50th percentile
        return 'Moderate'
    elif 69044090 < value <= 109124900:       # 50th-75th percentile
        return 'High'
    else:                                     # Above 75th percentile
        return 'Very High'

# Apply the categorization
emissions['City Emissions Level'] = emissions['City Emissions'].apply(categorize_city_ghg)
emissions['County Emissions Level'] = emissions['County Emissions'].apply(categorize_county_ghg)
emissions['State Emissions Level'] = emissions['State Emissions'].apply(categorize_state_ghg)


In [113]:
emissions.head()

Unnamed: 0,City,County,State,Zip Code,GHG (MTCO2),City Emissions,State Emissions,County Emissions,City Emissions Level,County Emissions Level,State Emissions Level
0,Denver,Denver County,CO,80202,844548,21738141,62456744,12704001,Very High,Very High,Moderate
1,Denver,Denver County,CO,80202,887487,21738141,62456744,12704001,Very High,Very High,Moderate
2,Denver,Denver County,CO,80202,110747,21738141,62456744,12704001,Very High,Very High,Moderate
3,Melissa,Collin County,TX,75454,288302,288302,485547330,1381024,Moderate,High,Very High
4,Hazard,Perry County,KY,40701,122327,164154,65132445,164154,Moderate,Low,Moderate


In [114]:

# Drop the original numeric columns
emissions = emissions.drop(columns=['City Emissions', 'County Emissions', 'State Emissions', 'GHG (MTCO2)'])
emissions.head()

Unnamed: 0,City,County,State,Zip Code,City Emissions Level,County Emissions Level,State Emissions Level
0,Denver,Denver County,CO,80202,Very High,Very High,Moderate
1,Denver,Denver County,CO,80202,Very High,Very High,Moderate
2,Denver,Denver County,CO,80202,Very High,Very High,Moderate
3,Melissa,Collin County,TX,75454,Moderate,High,Very High
4,Hazard,Perry County,KY,40701,Moderate,Low,Moderate


In [115]:
# Drop duplicates based on State, City, County while keeping the first occurrence 
emissions = emissions.drop_duplicates(subset=['State', 'City', 'County'])


In [116]:
# Verify the results

emissions.groupby(['State', 'City', 'County']).size().sort_values(ascending=False).head(30)

State  City          County               
AK     Akutan        Aleutians East County    1
OH     Toronto       Jefferson County         1
OK     Blackwell     Kay County               1
       Bradley       Grady County             1
       Broken Arrow  Wagoner County           1
       Broken Bow    Mccurtain County         1
       Butler        Custer County            1
       Byars         Mcclain County           1
       Calumet       Canadian County          1
       Calvin        Hughes County            1
       Canute        Washita County           1
       Cashion       Kingfisher County        1
                     Logan County             1
       Chouteau      Mayes County             1
       Claremore     Rogers County            1
       Coalgate      Coal County              1
       Cogar         Caddo County             1
       Countyline    Stephens County          1
       Crawford      Roger Mills County       1
       Custer City   Custer County           

In [117]:
merged_5.head()

Unnamed: 0,City,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,...,Utility Peak Demand Savings (MW),Utility Annual Incentive Cost ($),Utility Commercial Avg Life (Yrs),Region,Regional Electricity Demand,IXP Name,IXP_Count,Name,Total MWh Rank,Purchased Rank
12,Auburn,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,...,High,Moderate,Low,SE,Moderate,Ninja-IX Auburn,2.0,,,
13,Auburn,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,...,Moderate,Moderate,Moderate,SE,Moderate,Ninja-IX Auburn,2.0,,,
34,Montgomery,AL,High,High,Low,Growing,186.9,122.8,Moderate,SOCO,...,High,Moderate,Low,SE,Moderate,Montgomery Internet Exchange - MGMix Montgomery,2.0,,,
35,Montgomery,AL,High,High,Low,Growing,186.9,122.8,Moderate,TVA,...,Moderate,Moderate,Moderate,SE,Moderate,Montgomery Internet Exchange - MGMix Montgomery,2.0,,,
86,Flagstaff,AZ,High,High,Moderate,Growing,108.6,84.8,Low,AZPS,...,High,High,Moderate,SW,Low,Amateur Radio Internet Exchange - ARIX,8.0,,,


In [118]:
# Merge emissions with merged 5
merged_6 = pd.merge(merged_5, emissions, on=['State', 'City'], how='outer')

merged_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7470 entries, 0 to 7469
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 7470 non-null   object 
 1   State                                7470 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

In [119]:
merged_6.head()

Unnamed: 0,City,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,...,IXP Name,IXP_Count,Name,Total MWh Rank,Purchased Rank,County,Zip Code,City Emissions Level,County Emissions Level,State Emissions Level
0,Akutan,AK,,,,,,,,,...,,,,,,Aleutians East County,99553,Low,High,Low
1,Anchor Point,AK,,,,,,,,,...,,,,,,Kenai Peninsula County,99556,Low,High,Low
2,Anchorage,AK,,,,,,,,,...,,,,,,Anchorage Municipality County,99503,Very High,High,Low
3,Anchorage,AK,,,,,,,,,...,,,,,,Kenai Peninsula County,99503,Very High,Moderate,Low
4,Anchorage,AK,,,,,,,,,...,,,,,,Kenai Peninsula Borough County,99503,Very High,High,Low


In [120]:
# Emissions related data that is missing will be filled with unknown
emissions_cols = ['City Emissions Level', 'County Emissions Level', 'State Emissions Level']
merged_6[emissions_cols] = merged_6[emissions_cols].fillna('Unknown')

merged_6.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7470 entries, 0 to 7469
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 7470 non-null   object 
 1   State                                7470 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

In [121]:
# Drop rows where essential ranking columns are missing 
merged_6 = merged_6.dropna(subset=['Total Generation Rank'])
merged_6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3447 entries, 34 to 7312
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 3447 non-null   object 
 1   State                                3447 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (MW)  

In [122]:
print(merged_5.shape)
print(emissions.shape)


(2670, 21)
(4122, 7)


In [123]:
# Check duplicates in merged_6
print("\nDuplicates in merged_6:")
print(merged_6.groupby(['State', 'City', 'County']).size().sort_values(ascending=False).head(30))



Duplicates in merged_6:
State  City           County              
CA     Fremont        Alameda County          286
       San Francisco  San Francisco County    156
       Los Angeles    Los Angeles County      156
TX     Dallas         Dallas County           140
                      Eddy County             140
                      Montgomery County       140
WA     Seattle        King County             102
MA     Boston         Suffolk County           96
NY     New York       Kings County             90
                      New York County          90
OR     Eugene         Lane County              90
FL     Miami          Miami-Dade County        84
CA     San Jose       Santa Clara County       78
CO     Denver         Adams County             65
                      Denver County            65
OR     Portland       Multnomah County         60
OH     Columbus       Washington County        60
                      Franklin County          60
AZ     Phoenix        Maricopa C

# Loading in Broadband Data

In [124]:
broadband = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Broadband_Data.csv', index_col=0)

broadband.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11385 entries, 25125 to 580965
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   County                  11385 non-null  object
 1   City                    11385 non-null  object
 2   State                   11385 non-null  object
 3   Geog_Area_Type          11385 non-null  object
 4   Gigabit_Fiber_Coverage  11385 non-null  object
 5   Business_Density        11385 non-null  object
dtypes: object(6)
memory usage: 622.6+ KB


In [125]:
broadband.head()

Unnamed: 0,County,City,State,Geog_Area_Type,Gigabit_Fiber_Coverage,Business_Density
25125,Autauga County,Autauga,AL,Total,Good,Moderate
25155,Baldwin County,Baldwin,AL,Total,Moderate,High
25185,Barbour County,Barbour,AL,Total,Limited,Moderate
25215,Bibb County,Bibb,AL,Total,Moderate,Low
25245,Blount County,Blount,AL,Total,Limited,Moderate


In [126]:
broadband = broadband[broadband['Geog_Area_Type'] == 'Total']

# Drop Geog Area Type
broadband = broadband.drop(columns=['Geog_Area_Type'])

In [127]:
# Count duplicates of City County State
broadband.groupby(['State', 'City', 'County']).size().sort_values(ascending=False).head(30)


State  City                    County                
AK     Aleutians East Borough  Aleutians East Borough    1
OH     Lake                    Lake County               1
       Licking                 Licking County            1
       Logan                   Logan County              1
       Lorain                  Lorain County             1
       Lucas                   Lucas County              1
       Madison                 Madison County            1
       Mahoning                Mahoning County           1
       Marion                  Marion County             1
       Medina                  Medina County             1
       Meigs                   Meigs County              1
       Mercer                  Mercer County             1
       Miami                   Miami County              1
       Monroe                  Monroe County             1
       Montgomery              Montgomery County         1
       Morgan                  Morgan County             1
  

In [128]:
# Merge Broadband with merged_6
merged_7 = pd.merge(merged_6, broadband, on=['State', 'City', 'County'], how='outer')

merged_7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6577 entries, 0 to 6576
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6577 non-null   object 
 1   State                                6577 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

In [129]:
merged_7.head()

Unnamed: 0,City,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,...,Name,Total MWh Rank,Purchased Rank,County,Zip Code,City Emissions Level,County Emissions Level,State Emissions Level,Gigabit_Fiber_Coverage,Business_Density
0,Aleutians East Borough,AK,,,,,,,,,...,,,,Aleutians East Borough,,,,,Limited,Low
1,Aleutians West Census Area,AK,,,,,,,,,...,,,,Aleutians West Census Area,,,,,Limited,Low
2,Anchorage Municipality,AK,,,,,,,,,...,,,,Anchorage Municipality,,,,,Limited,High
3,Bethel Census Area,AK,,,,,,,,,...,,,,Bethel Census Area,,,,,Limited,Low
4,Bristol Bay Borough,AK,,,,,,,,,...,,,,Bristol Bay Borough,,,,,Limited,Low


In [130]:
merged_7['Region'].value_counts()

Region
CAL            858
TEX            742
NW             536
MIDW           426
NY             200
FLA            140
MIDA           139
CENT           124
SW             108
NE              71
SE              65
CAR             32
Independent      6
Name: count, dtype: int64

In [131]:
# Reapplying the mapping region to states

region_to_state = {
    'CAL':['CA'],
    'NW': ['WA', 'OR', 'ID', 'MT','CO','WY', 'AK'],
    'CAR': ['NC', 'SC'],
    'NY': ['NY'],
    'MIDA': ['PA', 'NJ', 'DE', 'MD', 'VA', 'DC', 'WV'],
    'SE':['GA', 'AL', 'MS', 'LA', 'AR'],
    'FLA': ['FL'],
    'SW': ['AZ', 'NM', 'NV', 'UT'],
    'TN': ['TN'],
    'MIDW': ['IL', 'IN','IA', 'MI', 'MN', 'OH','WI'],
    'TEX': ['TX'],
    'NE': ['ME', 'VT', 'NH', 'MA', 'CT', 'RI'],
    'CENT': ['KS', 'NE', 'SD', 'ND','OK', 'MO', 'KY']
}

# Create reverse mapping (state to region)
state_to_region = {state: region for region, states in region_to_state.items() for state in states}

# Apply mapping
merged_7['Region'] = merged_7['State'].map(state_to_region)
merged_7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6577 entries, 0 to 6576
Data columns (total 28 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6577 non-null   object 
 1   State                                6577 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

## Loading in Disaster Risk Data

In [132]:
disaster = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Cleaned Data/Cleaned_Disaster_Risk.csv', index_col=0)

disaster.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3137 entries, 0 to 3142
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   State        3137 non-null   object 
 1   County       3137 non-null   object 
 2   Population   3137 non-null   int64  
 3   Risk_Score   3137 non-null   float64
 4   Risk_Rating  3137 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 147.0+ KB


In [133]:
disaster.head()

Unnamed: 0,State,County,Population,Risk_Score,Risk_Rating
0,AL,Autauga,58764,49.22049,Relatively Low
1,AL,Baldwin,231365,97.709195,Relatively High
2,AL,Barbour,25160,56.188355,Relatively Low
3,AL,Bibb,22239,32.484887,Very Low
4,AL,Blount,58992,65.128858,Relatively Low


In [134]:
disaster = disaster.drop(columns=['Population'])

disaster.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3137 entries, 0 to 3142
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   State        3137 non-null   object 
 1   County       3137 non-null   object 
 2   Risk_Score   3137 non-null   float64
 3   Risk_Rating  3137 non-null   object 
dtypes: float64(1), object(3)
memory usage: 122.5+ KB


In [135]:
# Merge disaster with merged_7  
merged_8 = pd.merge(merged_7, disaster, on=['State', 'County'], how='outer')

merged_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9713 entries, 0 to 9712
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6577 non-null   object 
 1   State                                9713 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

In [136]:
# Fill missing disaster data with Unknown
disaster_cols = ['Risk_Score', 'Risk_Rating']
merged_8[disaster_cols] = merged_8[disaster_cols].fillna('Unknown')

merged_8.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9713 entries, 0 to 9712
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6577 non-null   object 
 1   State                                9713 non-null   object 
 2   Total Generation Rank                3447 non-null   object 
 3   Electric Generation Rank             3447 non-null   object 
 4   Commercial Generation Rank           3447 non-null   object 
 5   Capacity Growth Status               3447 non-null   object 
 6   CAIDI w/ Major Events                3447 non-null   float64
 7   CAIDI w/o Major Events               3447 non-null   float64
 8   SAIFI Major Events                   3447 non-null   object 
 9   BA Code                              3447 non-null   object 
 10  Utility Annual Energy Savings (MWh)  3447 non-null   object 
 11  Utility Peak Demand Savings (M

In [137]:
len(merged_8['State'].value_counts())


52

In [138]:
# Drop any remaining US territories if they exist
territories = ['PR', 'GU', 'VI', 'AS', 'MP', 'CNMI']
merged_8= merged_8[~merged_8['State'].isin(territories)]

len(merged_8['State'].value_counts())


51

There are 51 states because i included District of Columbia as a state

In [139]:
merged_8.columns

Index(['City', 'State', 'Total Generation Rank', 'Electric Generation Rank',
       'Commercial Generation Rank', 'Capacity Growth Status',
       'CAIDI w/ Major Events', 'CAIDI w/o Major Events', 'SAIFI Major Events',
       'BA Code', 'Utility Annual Energy Savings (MWh)',
       'Utility Peak Demand Savings (MW)', 'Utility Annual Incentive Cost ($)',
       'Utility Commercial Avg Life (Yrs)', 'Region',
       'Regional Electricity Demand', 'IXP Name', 'IXP_Count', 'Name',
       'Total MWh Rank', 'Purchased Rank', 'County', 'Zip Code',
       'City Emissions Level', 'County Emissions Level',
       'State Emissions Level', 'Gigabit_Fiber_Coverage', 'Business_Density',
       'Risk_Score', 'Risk_Rating'],
      dtype='object')

In [140]:
merged_8 = merged_8.replace('None', 'Unknown')

# Propagating State Level Data

In [141]:
# First, create state-level mappings for relevant columns
state_mappings = {
    'Total Generation Rank': merged_8.groupby('State')['Total Generation Rank'].first(),
    'Electric Generation Rank': merged_8.groupby('State')['Electric Generation Rank'].first(),
    'Commercial Generation Rank': merged_8.groupby('State')['Commercial Generation Rank'].first(),
    'Capacity Growth Status': merged_8.groupby('State')['Capacity Growth Status'].first(),
    'CAIDI w/ Major Events': merged_8.groupby('State')['CAIDI w/ Major Events'].first(),
    'CAIDI w/o Major Events': merged_8.groupby('State')['CAIDI w/o Major Events'].first(),
    'SAIFI Major Events': merged_8.groupby('State')['SAIFI Major Events'].first(),
    'BA Code': merged_8.groupby('State')['BA Code'].first(),
    'Utility Annual Energy Savings (MWh)': merged_8.groupby('State')['Utility Annual Energy Savings (MWh)'].first(),
    'Utility Peak Demand Savings (MW)': merged_8.groupby('State')['Utility Peak Demand Savings (MW)'].first(),
    'Utility Annual Incentive Cost ($)': merged_8.groupby('State')['Utility Annual Incentive Cost ($)'].first(),
    'Utility Commercial Avg Life (Yrs)': merged_8.groupby('State')['Utility Commercial Avg Life (Yrs)'].first(),
    'Regional Electricity Demand': merged_8.groupby('State')['Regional Electricity Demand'].first(),
    'State Emissions Level': merged_8.groupby('State')['State Emissions Level'].first(),
    'Region': merged_8.groupby('State')['Region'].first(),
    'IXP_Count': merged_8.groupby('State')['IXP_Count'].sum()
}

# Fill NaN values using state mappings
for col, mapping in state_mappings.items():
    merged_8[col] = merged_8.apply(
        lambda row: mapping[row['State']] if pd.isna(row[col]) else row[col],
        axis=1
    )
merged_8.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9710 entries, 0 to 9712
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6574 non-null   object 
 1   State                                9710 non-null   object 
 2   Total Generation Rank                8494 non-null   object 
 3   Electric Generation Rank             8494 non-null   object 
 4   Commercial Generation Rank           8494 non-null   object 
 5   Capacity Growth Status               8494 non-null   object 
 6   CAIDI w/ Major Events                8494 non-null   float64
 7   CAIDI w/o Major Events               8494 non-null   float64
 8   SAIFI Major Events                   8494 non-null   object 
 9   BA Code                              8494 non-null   object 
 10  Utility Annual Energy Savings (MWh)  8494 non-null   object 
 11  Utility Peak Demand Savings (MW)   

In [208]:
# Map the zip codes to their cities

#def zip_to_city(zip_code):
    
#merged_8['City'] = merged_8['Zip Code'].map(zip_to_city)


IndentationError: expected an indented block after function definition on line 3 (1608546845.py, line 5)

# Propagating the city level data

In [142]:
# City level columns: 
# Total MWh Rank, Purchased Rank, City Emissions Level

# City-level mappings
city_mappings = {
    'City Emissions Level': merged_8.groupby(['State', 'City'])['City Emissions Level'].first(),
    'Total MWh Rank': merged_8.groupby(['State', 'City'])['Total MWh Rank'].first(),
    'Purchased Rank': merged_8.groupby(['State', 'City'])['Purchased Rank'].first()
}

# Fill NaN values using city mappings
for col, mapping in city_mappings.items():
    merged_8[col] = merged_8.apply(
        lambda row: mapping.get((row['State'], row['City'])) if pd.isna(row[col]) or row[col] == 'Unknown' else row[col],
        axis=1
    )

# Verify the results
print("Number of unique cities:", len(merged_8.groupby(['State', 'City'])))
for col in city_mappings.keys():
    print(f"{col}: {len(merged_8[merged_8[col].isna()].groupby(['State', 'City']))} city combinations with NA values")


Number of unique cities: 3264
City Emissions Level: 3172 city combinations with NA values
Total MWh Rank: 3209 city combinations with NA values
Purchased Rank: 3209 city combinations with NA values


In [143]:
merged_8.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9710 entries, 0 to 9712
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   City                                 6574 non-null   object 
 1   State                                9710 non-null   object 
 2   Total Generation Rank                8494 non-null   object 
 3   Electric Generation Rank             8494 non-null   object 
 4   Commercial Generation Rank           8494 non-null   object 
 5   Capacity Growth Status               8494 non-null   object 
 6   CAIDI w/ Major Events                8494 non-null   float64
 7   CAIDI w/o Major Events               8494 non-null   float64
 8   SAIFI Major Events                   8494 non-null   object 
 9   BA Code                              8494 non-null   object 
 10  Utility Annual Energy Savings (MWh)  8494 non-null   object 
 11  Utility Peak Demand Savings (MW)   

In [150]:
minimum_features = merged_8[['State','County', 'Risk_Score', 'Risk_Rating', 'IXP_Count','Region','Regional Electricity Demand','State Emissions Level','Utility Annual Energy Savings (MWh)','SAIFI Major Events', 'CAIDI w/o Major Events','CAIDI w/ Major Events','Commercial Generation Rank','Total Generation Rank','Purchased Rank','City Emissions Level','Total MWh Rank']]

minimum_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9710 entries, 0 to 9712
Data columns (total 17 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                9710 non-null   object 
 1   County                               9488 non-null   object 
 2   Risk_Score                           9710 non-null   object 
 3   Risk_Rating                          9710 non-null   object 
 4   IXP_Count                            9710 non-null   float64
 5   Region                               9695 non-null   object 
 6   Regional Electricity Demand          7672 non-null   object 
 7   State Emissions Level                8494 non-null   object 
 8   Utility Annual Energy Savings (MWh)  8494 non-null   object 
 9   SAIFI Major Events                   8494 non-null   object 
 10  CAIDI w/o Major Events               8494 non-null   float64
 11  CAIDI w/ Major Events              

In [152]:
min_features = merged_8[['State','County', 'Risk_Score', 'Risk_Rating', 'IXP_Count','Region','Regional Electricity Demand','State Emissions Level','Utility Annual Energy Savings (MWh)','SAIFI Major Events', 'CAIDI w/o Major Events','CAIDI w/ Major Events','Commercial Generation Rank','Total Generation Rank']]


min_features.dropna(inplace=True)
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7455 entries, 60 to 9556
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                7455 non-null   object 
 1   County                               7455 non-null   object 
 2   Risk_Score                           7455 non-null   object 
 3   Risk_Rating                          7455 non-null   object 
 4   IXP_Count                            7455 non-null   float64
 5   Region                               7455 non-null   object 
 6   Regional Electricity Demand          7455 non-null   object 
 7   State Emissions Level                7455 non-null   object 
 8   Utility Annual Energy Savings (MWh)  7455 non-null   object 
 9   SAIFI Major Events                   7455 non-null   object 
 10  CAIDI w/o Major Events               7455 non-null   float64
 11  CAIDI w/ Major Events             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_features.dropna(inplace=True)


In [153]:
min_features.to_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Minimum_Features.csv')

In [146]:
merged_8[merged_8['City'].isna()]


Unnamed: 0,City,State,Total Generation Rank,Electric Generation Rank,Commercial Generation Rank,Capacity Growth Status,CAIDI w/ Major Events,CAIDI w/o Major Events,SAIFI Major Events,BA Code,...,Purchased Rank,County,Zip Code,City Emissions Level,County Emissions Level,State Emissions Level,Gigabit_Fiber_Coverage,Business_Density,Risk_Score,Risk_Rating
0,,AK,,,,,,,,,...,,Aleutians East,,,,,,,4.231626,Very Low
2,,AK,,,,,,,,,...,,Aleutians West,,,,,,,35.221126,Very Low
4,,AK,,,,,,,,,...,,Anchorage,,,,,,,94.845689,Relatively Moderate
6,,AK,,,,,,,,,...,,Bethel,,,,,,,16.926503,Very Low
8,,AK,,,,,,,,,...,,Bristol Bay,,,,,,,0.159084,Very Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9703,,WY,,,,,,,,,...,,Sweetwater,,,,,,,6.617881,Very Low
9705,,WY,,,,,,,,,...,,Teton,,,,,,,68.501432,Relatively Low
9707,,WY,,,,,,,,,...,,Uinta,,,,,,,21.699014,Very Low
9709,,WY,,,,,,,,,...,,Washakie,,,,,,,2.513522,Very Low
