In [140]:
import os
import numpy as np
import pandas as pd

In [141]:
# Columns read from each interconnection file (everything else is dropped at load time).
usecols = [
    # Site identity and location.
    'Application Id', 'Utility', 'Service City', 'Service Zip', 'Service County',
    # System description.
    'Technology Type', 'System Size DC', 'System Size AC', 'Inverter Size (kW AC)',
    'Tilt', 'Azimuth', 'Mounting Method', 'Tracking',
    # Customer, approval date, and cost.
    'Customer Sector', 'App Approved Date', 'Total System Cost', 'Itc Cost Basis',
    # Tariff and program.
    'NEM Tariff', 'Interconnection Program', 'VNEM, NEM-V, NEM-Agg',
    'Project is VNEM, NEM-V, NEM-Agg?', 'NEMPV or nonNEMPV',
]

# Utility codes to retain.
utilities = ['PGE', 'SDGE', 'SCE']

# Raw 'Technology Type' labels to retain: solar PV, optionally paired with
# storage and/or "Other". Several spellings of the same label are listed.
# NOTE: should we include sites with storage?
technologies = [
    'Solar PV', 'Solar',
    'Solar PV, Storage', 'Solar PV;Storage',
    'SOLAR PV',
    'Other, Solar PV', 'Other, Solar PV, Storage',
]

In [142]:
# Directory holding the per-utility interconnection files.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# here because other cells may still reference it.
dir = 'data/Interconnected_Project_Sites_2023-03-31/'

# Read data from every file in the directory, keeping only `usecols`.
#  - sorted(): os.listdir() returns entries in arbitrary order, so sorting
#    makes the row order of the combined frame reproducible.
#  - low_memory=False: parse each column in a single pass so its dtype is
#    inferred from the whole file rather than chunk by chunk (chunked
#    inference can leave a column with mixed types and emit DtypeWarning).
#  - Frames are collected in a list and concatenated once; calling pd.concat
#    inside the loop re-copies every previously loaded row on each iteration.
subsets = []
for file in sorted(os.listdir(dir)):

    # Full path to one utility's file.
    path = os.path.join(dir, file)

    # Read data from one utility into a dataframe.
    subset = pd.read_csv(path, usecols=usecols, low_memory=False)
    subsets.append(subset)

# Combined interconnection data from all utilities. An empty directory yields
# an empty dataframe (pd.concat raises on an empty list).
df = pd.concat(subsets) if subsets else pd.DataFrame()

  subset = pd.read_csv(path, usecols=usecols)
  subset = pd.read_csv(path, usecols=usecols)
  subset = pd.read_csv(path, usecols=usecols)


In [143]:
# Row filters: utility, customer sector, tariff structure, and technology type.
served_by_target_utility = df['Utility'].isin(utilities)
is_residential = df['Customer Sector'] == 'Residential'
is_nem_pv = df['NEMPV or nonNEMPV'] == 'NEMPV'
uses_listed_technology = df['Technology Type'].isin(technologies)

# Keep only the rows that satisfy all four conditions.
df = df.loc[
    served_by_target_utility & is_residential & is_nem_pv & uses_listed_technology
]

In [144]:
# Text columns whose values are normalized to uppercase.
uppercase_columns = [
    'Application Id',
    'Utility',
    'Service City',
    'Service County',
    'Technology Type',
    'Mounting Method',
    'Tracking',
    'Customer Sector',
    'Interconnection Program',
    'VNEM, NEM-V, NEM-Agg',
    'Project is VNEM, NEM-V, NEM-Agg?',
    'NEMPV or nonNEMPV',
]

# Uppercase each column in turn so spelling variants that differ only by case collapse.
for column in uppercase_columns:
    df[column] = df[column].str.upper()

In [145]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

Application Id                           0
Utility                                  0
Service City                             0
Service Zip                              0
Service County                           0
Technology Type                          0
System Size DC                           0
System Size AC                           0
Inverter Size (kW AC)               554261
Tilt                                118751
Azimuth                             118751
Mounting Method                     197722
Tracking                            144276
Customer Sector                          0
App Approved Date                        0
Total System Cost                   531230
Itc Cost Basis                      565574
NEM Tariff                               0
Interconnection Program                  0
VNEM, NEM-V, NEM-Agg                     0
Project is VNEM, NEM-V, NEM-Agg?         0
NEMPV or nonNEMPV                        0
dtype: int64

In [146]:
# Fill relevant NaNs: a missing 'Mounting Method' becomes 'ROOFTOP' and a
# missing 'Tracking' becomes 'FIXED'.
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on a `df[...]` selection is chained in-place
# mutation: it is deprecated in pandas 2.x and, with Copy-on-Write enabled,
# modifies a temporary object and leaves `df` unchanged.
df['Mounting Method'] = df['Mounting Method'].fillna('ROOFTOP')
df['Tracking'] = df['Tracking'].fillna('FIXED')

In [147]:
# Check data types.
# Displays the dtype pandas holds for each column at this point in the notebook.
df.dtypes

Application Id                       object
Utility                              object
Service City                         object
Service Zip                         float64
Service County                       object
Technology Type                      object
System Size DC                      float64
System Size AC                      float64
Inverter Size (kW AC)               float64
Tilt                                 object
Azimuth                              object
Mounting Method                      object
Tracking                             object
Customer Sector                      object
App Approved Date                    object
Total System Cost                   float64
Itc Cost Basis                      float64
NEM Tariff                          float64
Interconnection Program              object
VNEM, NEM-V, NEM-Agg                 object
Project is VNEM, NEM-V, NEM-Agg?     object
NEMPV or nonNEMPV                    object
dtype: object

In [148]:
# Label recodes applied in order, as (column, old label, new label).
# They merge spelling variants of the same category into one label.
label_recodes = [
    ('Technology Type', 'SOLAR', 'SOLAR PV'),
    ('Technology Type', 'OTHER, SOLAR PV', 'SOLAR PV'),
    ('Technology Type', 'SOLAR PV;STORAGE', 'SOLAR PV, STORAGE'),
    ('Technology Type', 'OTHER, SOLAR PV, STORAGE', 'SOLAR PV, STORAGE'),
    ('Mounting Method', 'MULTIPLE', 'MIXED'),
    ('Tracking', 'MULTIPLE', 'MIXED'),
    ('Tracking', 'TRACKING', 'SINGLE-AXIS'),
]
for column, old_label, new_label in label_recodes:
    df.loc[df[column] == old_label, [column]] = new_label

# Replace negative sizes with their absolute value.
for size_column in ['System Size DC', 'System Size AC', 'Inverter Size (kW AC)']:
    df[size_column] = df[size_column].abs()

In [149]:
# Describe categorical variables.
# Restricts the summary to object-dtype columns; for each one it reports the
# non-null count, number of unique values, most frequent value, and its frequency.
df.describe(include=object)

Unnamed: 0,Application Id,Utility,Service City,Service County,Technology Type,Tilt,Azimuth,Mounting Method,Tracking,Customer Sector,App Approved Date,Interconnection Program,"VNEM, NEM-V, NEM-Agg","Project is VNEM, NEM-V, NEM-Agg?",NEMPV or nonNEMPV
count,1500393,1500393,1500393,1500393,1500393,1381642.0,1381642.0,1500393,1500393,1500393,1500393,1500393,1500393,1500393,1500393
unique,1500393,3,1182,54,2,268.0,639.0,3,4,1,6694,39,4,2,1
top,PGE-INT-106960619,PGE,SAN DIEGO,SAN DIEGO,SOLAR PV,18.0,180.0,ROOFTOP,FIXED,RESIDENTIAL,2022-10-31,SNEM,NONE,NO,NEMPV
freq,1,700084,79179,235666,1437192,280770.0,312715.0,1476832,1493433,1500393,1568,656767,1492022,1492022,1500393


In [150]:
# Check values.
# Tabulates each distinct 'Technology Type' label with its row count.
df['Technology Type'].value_counts()

SOLAR PV             1437192
SOLAR PV, STORAGE      63201
Name: Technology Type, dtype: int64

In [151]:
# Check values.
# Tabulates each distinct 'Mounting Method' label with its row count;
# dropna=False means any remaining NaN would appear as its own row.
df['Mounting Method'].value_counts(dropna=False)

ROOFTOP    1476832
GROUND       21654
MIXED         1907
Name: Mounting Method, dtype: int64

In [152]:
# Check values.
# Tabulates each distinct 'Tracking' label with its row count;
# dropna=False means any remaining NaN would appear as its own row.
df['Tracking'].value_counts(dropna=False)

FIXED          1493433
SINGLE-AXIS       3828
MIXED             2249
DUAL-AXIS          883
Name: Tracking, dtype: int64

In [153]:
# Describe numeric variables.
# Restricts the summary to numeric columns (count, mean, std, min, quartiles, max).
df.describe(include=np.number)

Unnamed: 0,Service Zip,System Size DC,System Size AC,Inverter Size (kW AC),Total System Cost,Itc Cost Basis,NEM Tariff
count,1500393.0,1500393.0,1500393.0,946132.0,969163.0,934819.0,1500393.0
mean,93353.8,6.412854,6.03356,5.171258,24035.79,10230.01,1.642279
std,1435.875,5.444554,5.141241,13.589377,24035.28,17279.01,0.4793296
min,90001.0,0.0,0.002,0.0,0.0,0.0,1.0
25%,92129.0,4.015,3.779,3.0,12667.0,0.0,1.0
50%,93250.0,5.67,5.32,4.6,22000.0,0.0,2.0
75%,94565.0,7.92,7.459,7.2,32988.22,18270.0,2.0
max,96137.0,1106.56,998.8,8149.412,3129000.0,2271734.0,2.0


In [154]:
# Sanity check: none of the size columns may contain a negative value.
for size_column in ('System Size DC', 'System Size AC', 'Inverter Size (kW AC)'):
    if (df[size_column] < 0).any():
        raise ValueError(f'{size_column} cannot be negative.')