In [272]:
import os
import numpy as np
import pandas as pd

In [273]:
# Columns to keep.
usecols = [
    'Application Id',
    'Utility',
    'Service City',
    'Service Zip',
    'Service County',
    'Technology Type',
    'System Size DC',
    'System Size AC',
    'Inverter Size (kW AC)',
    'Tilt',
    'Azimuth',
    'Mounting Method',
    'Tracking',
    'Customer Sector',
    'App Approved Date',
    'Total System Cost',
    'Itc Cost Basis',
    'NEM Tariff',
    'Interconnection Program',
    'VNEM, NEM-V, NEM-Agg',
    'Project is VNEM, NEM-V, NEM-Agg?',
    'NEMPV or nonNEMPV'
]

In [274]:
# CAISO interconnection data. 
pge_sites_file = 'data/Interconnected_Project_Sites_2023-03-31/PGE_Interconnected_Project_Sites_2023-03-31.csv'

# Import sites.
df = pd.read_csv(pge_sites_file, usecols=usecols)

  df = pd.read_csv(pge_sites_file, usecols=usecols)


In [275]:
# Filter data by customer sector, technology type, and tariff structure.
df = df.loc[(df['Customer Sector'] == 'Residential') & (df['Technology Type'] == 'Solar PV') & (df['NEMPV or nonNEMPV'] == 'NEMPV')]

In [276]:
# Check NaNs.
df.isna().sum()

Application Id                           0
Utility                                  0
Service City                             0
Service Zip                              0
Service County                           0
Technology Type                          0
System Size DC                           0
System Size AC                           0
Inverter Size (kW AC)                    0
Tilt                                     0
Azimuth                                  0
Mounting Method                     150798
Tracking                             97360
Customer Sector                          0
App Approved Date                        0
Total System Cost                   150723
Itc Cost Basis                      150723
NEM Tariff                               0
Interconnection Program                  0
VNEM, NEM-V, NEM-Agg                     0
Project is VNEM, NEM-V, NEM-Agg?         0
NEMPV or nonNEMPV                        0
dtype: int64

In [277]:
# Fill NaNs.
df['Mounting Method'].fillna(value='Rooftop', inplace=True)
df['Tracking'].fillna(value='Mixed', inplace=True)
df['Total System Cost'].fillna(value=0.0, inplace=True)
df['Itc Cost Basis'].fillna(value=0.0, inplace=True)

In [278]:
# Confirm NaNs.
df.isna().sum()

Application Id                      0
Utility                             0
Service City                        0
Service Zip                         0
Service County                      0
Technology Type                     0
System Size DC                      0
System Size AC                      0
Inverter Size (kW AC)               0
Tilt                                0
Azimuth                             0
Mounting Method                     0
Tracking                            0
Customer Sector                     0
App Approved Date                   0
Total System Cost                   0
Itc Cost Basis                      0
NEM Tariff                          0
Interconnection Program             0
VNEM, NEM-V, NEM-Agg                0
Project is VNEM, NEM-V, NEM-Agg?    0
NEMPV or nonNEMPV                   0
dtype: int64

In [279]:
# Check data types.
df.dtypes

Application Id                       object
Utility                              object
Service City                         object
Service Zip                         float64
Service County                       object
Technology Type                      object
System Size DC                      float64
System Size AC                      float64
Inverter Size (kW AC)               float64
Tilt                                 object
Azimuth                              object
Mounting Method                      object
Tracking                             object
Customer Sector                      object
App Approved Date                    object
Total System Cost                   float64
Itc Cost Basis                      float64
NEM Tariff                          float64
Interconnection Program              object
VNEM, NEM-V, NEM-Agg                 object
Project is VNEM, NEM-V, NEM-Agg?     object
NEMPV or nonNEMPV                    object
dtype: object

In [280]:
# Clean data.
df.loc[df['Mounting Method'] == 'multiple', ['Mounting Method']] = 'Mixed'
df.loc[df['Tracking'] == 'multiple', ['Tracking']] = 'Mixed'
df.loc[df['Tracking'] == 'Tracking', ['Tracking']] = 'Single-Axis'

# Remove negative values.
df['System Size DC'] = df['System Size DC'].abs()
df['System Size AC'] = df['System Size AC'].abs()
df['Inverter Size (kW AC)'] = df['Inverter Size (kW AC)'].abs()

In [281]:
# Describe categorical variables.
df.describe(include=object)

Unnamed: 0,Application Id,Utility,Service City,Service County,Technology Type,Tilt,Azimuth,Mounting Method,Tracking,Customer Sector,App Approved Date,Interconnection Program,"VNEM, NEM-V, NEM-Agg","Project is VNEM, NEM-V, NEM-Agg?",NEMPV or nonNEMPV
count,659977,659977,659977,659977,659977,659977.0,659977.0,659977,659977,659977,659977,659977,659977.0,659977,659977
unique,659977,1,1397,55,1,104.0,376.0,3,4,1,6025,9,4.0,2,1
top,PGE-INT-107036420,PGE,BAKERSFIELD,FRESNO,Solar PV,18.0,180.0,Rooftop,Fixed,Residential,2022-07-15,SNEM,,No,NEMPV
freq,1,659977,45911,72844,659977,117829.0,122822.0,645420,559748,659977,850,656691,655513.0,655513,659977


In [282]:
# Check values.
df['Mounting Method'].value_counts(dropna=False)

Rooftop    645420
Ground      13583
Mixed         974
Name: Mounting Method, dtype: int64

In [283]:
# Check values.
df['Tracking'].value_counts(dropna=False)

Fixed          559748
Mixed           98214
Single-Axis      1411
Dual-Axis         604
Name: Tracking, dtype: int64

In [284]:
# Describe numeric variables.
df.describe(include=np.number)

Unnamed: 0,Service Zip,System Size DC,System Size AC,Inverter Size (kW AC),Total System Cost,Itc Cost Basis,NEM Tariff
count,659977.0,659977.0,659977.0,659977.0,659977.0,659977.0,659977.0
mean,94646.504366,6.216334,6.042783,5.710074,15195.52,6143.954,1.630489
std,852.846767,4.597361,4.484551,11.486298,20955.02,13657.96,0.482673
min,92263.0,0.0,0.002,0.0,0.0,0.0,1.0
25%,93727.0,3.822,3.722,3.747,0.0,0.0,1.0
50%,94583.0,5.481,5.326,5.0,11154.0,0.0,2.0
75%,95361.0,7.722,7.517,7.494,25920.0,0.0,2.0
max,96137.0,766.08,751.005,8149.412,2995626.0,1182000.0,2.0


In [288]:
# Check numbers.
if len(df.loc[df['System Size DC'] < 0]) > 0: raise ValueError('System Size DC cannot be negative.')
if len(df.loc[df['System Size AC'] < 0]) > 0: raise ValueError('System Size AC cannot be negative.')
if len(df.loc[df['Inverter Size (kW AC)'] < 0]) > 0: raise ValueError('Inverter Size (kW AC) cannot be negative.')