# Solar Electric Programs Reported by NYSERDA
---


**Purpose:**

* End-to-end exploratory data analysis
* Explore challenges with using dates

**Data Source:**

https://data.ny.gov/

In [None]:
# system libraries
import os
import sys

# report the interpreter build, then the directory the notebook runs from
python_version = sys.version
print('Python Information', python_version)
working_directory = os.getcwd()
print('This is your current directory', working_directory)

In [None]:
# datetime libraries
import datetime

# assign current date and time
currentDate = datetime.date.today()
currentTime = datetime.datetime.now()

# check datetime information
print('Today is {}'.format(currentDate))
# strftime is called on the objects themselves: the unbound
# datetime.datetime.strftime only accepts datetime instances and raises
# TypeError when it is handed a plain datetime.date such as currentDate
print('Today is', currentDate.strftime('%m/%d/%Y'))
print('The time is', currentTime.strftime('%H:%M:%S'))

In [None]:
import pandas as pd
# show floats with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.precision', 2)
# use fully-qualified option keys: set_option treats its argument as a
# pattern, and the bare 'max_rows' / 'max_columns' can match more than one
# option (e.g. the styler.render.* ones), which raises OptionError
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns
# dark background for every seaborn figure in the notebook
sns.set_style('dark')

# report the versions of the analysis libraries, one per line
for library in (pd, np, sns):
    print(library.__version__)

In [None]:
from dateutil import parser
import glob

In [None]:
# sample date strings written in assorted layouts
dates = [
    '07/31/2016',
    '7/31/16',
    '31-07-2016',
    '31-JUL-2016',
    '2016-07-31',
    'July 31st, 2016',
    'Jul.16',
    'July.2016',
]

# no format is supplied: parser.parse works out each layout by itself
for text in dates:
    parsed = parser.parse(text)
    print(parsed)

In [None]:
# CSV download endpoint on data.ny.gov
filename = 'https://data.ny.gov/api/views/3x8r-34rs/rows.csv?accessType=DOWNLOAD&sorting=true'

# columns that should be parsed as datetimes while the file is read
dates = [
    'Reporting Period',
    'Date Application Received',
    'Date Completed',
]

df = pd.read_csv(filepath_or_buffer=filename, parse_dates=dates)

# column names, dtypes and non-null counts
df.info()

In [None]:
# normalise the headers: lower-case, spaces -> underscores, "$" removed.
# regex=False makes both replacements literal regardless of the pandas
# version's default ("$" is the end-of-string anchor when read as a regex)
df.columns = (
    df.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("$", "", regex=False)
)
df.columns

In [None]:
# first ten rows of the frame
df.iloc[:10]

In [None]:
# first three rows, transposed so every column becomes a row
df.iloc[:3].transpose()

In [None]:
# rows per electric utility, missing values included
df.loc[:, 'electric_utility'].value_counts(dropna=False)

In [None]:
# built once at definition time instead of on every call (the function is
# applied row by row, so the literal used to be rebuilt for each row)
_UTILITY_CODES = {
    'Consolidated Edison': 'CONED',
    'National Grid': 'NATGRID',
    'PSEG Long Island': 'PSEGLI',
    'Orange and Rockland Utilities': 'ORU',
}


def assignments(x):
    """Return the short code for an electric utility name.

    Args:
        x: Utility name to look up. Matching is exact and case-sensitive.

    Returns:
        'CONED', 'NATGRID', 'PSEGLI' or 'ORU' for the four known names,
        and 'Other' for any other value (including missing values).
    """
    return _UTILITY_CODES.get(x, 'Other')

In [None]:
# translate every utility name into its short code, then count rows per code
utility_codes = df['electric_utility'].map(assignments)
df['utility'] = utility_codes
df['utility'].value_counts()

In [None]:
# number of missing values in every column whose name contains "date"
date_columns = df.filter(like='date')
date_columns.isna().sum()

In [None]:
# split the application date into separate year and month columns
received = df['date_application_received'].dt
df['application_year'] = received.year
df['application_month'] = received.month

In [None]:
# total project cost per application year (rows) and month (columns);
# year/month combinations without any rows show 0.
# aggfunc is given by name: passing the np.sum callable is deprecated in
# recent pandas releases and emits a FutureWarning
pivot = df.pivot_table(
    index='application_year',
    columns='application_month',
    values='project_cost',
    aggfunc='sum',
    fill_value=0,
)
pivot

In [None]:
idx = 'application_year'

# yearly totals of project cost and incentive, drawn as stacked bars
yearly_totals = df.groupby(idx)[['project_cost', 'incentive']].sum()
yearly_totals.plot.bar(stacked=True, figsize=(14, 6), width=.75, rot=0);

In [None]:
# keep only rows that have a completion date
df.dropna(subset=['date_completed'], inplace=True)

# reduce both date columns to the first ten characters of their text form
for column in ('date_application_received', 'date_completed'):
    df[column] = df[column].astype(str).str[:10]

In [None]:
# business days between application and completion, computed for the whole
# column in a single np.busday_count call instead of one call per row.
# pd.to_datetime accepts the columns whether they currently hold date strings
# or datetimes; busday_count works at day resolution, hence datetime64[D]
received_days = pd.to_datetime(df['date_application_received']).values.astype('datetime64[D]')
completed_days = pd.to_datetime(df['date_completed']).values.astype('datetime64[D]')
business_days = np.busday_count(received_days, completed_days)

df['business_days'] = business_days

df.head()

In [None]:
# distribution of the business_days column across all rows
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version still provides it before re-running
sns.distplot(df['business_days']);

In [None]:
# one business_days distribution panel per utility code
g = sns.FacetGrid(data=df, col='utility').map(sns.distplot, 'business_days')

In [None]:
# spread of business_days for each utility code
sns.boxenplot(
    x='utility',
    y='business_days',
    data=df,
    color='red',
);

In [None]:
# rows whose completion date falls before the application date
df[df['business_days'] < 0]

In [None]:
# summary statistics of the business_days column
df.loc[:, 'business_days'].describe()

In [None]:
# bucket the durations; the first interval (-1000, 0] collects the
# zero and negative values
cut_days = pd.cut(df['business_days'], [-1000, 0, 10, 20, 30, 60, 90, 360, 3000])

# total project cost per duration bucket and utility code.
# aggfunc is given by name: passing the np.sum callable is deprecated in
# recent pandas releases and emits a FutureWarning
df.pivot_table(
    index=cut_days,
    columns='utility',
    values='project_cost',
    aggfunc='sum',
    fill_value=0,
)

In [None]:
# sum cost and expected production per project, then correlate the two totals
per_project = df.groupby('project_number')[['project_cost', 'expected_kwh_annual_production']].sum()
per_project.corr()

In [None]:
# project cost against expected annual production, with a fitted regression line
ax = sns.regplot(
    data=df,
    x='project_cost',
    y='expected_kwh_annual_production',
    color="b",
    dropna=True,
);

In [None]:
# Restrict the plot to applications received in 2019
applications_2019 = df[df['application_year'] == 2019]

# One regression panel per utility code, with a separate fit per sector
sns.lmplot(
    data=applications_2019,
    x='project_cost',
    y='expected_kwh_annual_production',
    hue='sector',
    col='utility',
    col_wrap=3,
    height=4,
    palette='muted',
    ci=None,
    scatter_kws={'s': 50, 'alpha': .4},
);

In [None]:
# distinct utility codes, in order of first appearance
pd.unique(df['utility'])

In [None]:
# write one CSV per utility code. groupby splits the frame in a single pass
# and avoids building a query string from the data values; index=False states
# the intent that index=None only achieved by being falsy
for utility, utility_rows in df.groupby('utility'):
    utility_rows.to_csv(f"solar_df_{utility}.csv", index=False)

In [None]:
# files in the working directory that match the export naming pattern
list(glob.iglob('solar_df*.csv'))

In [None]:
# rows per county (missing counties included) as a two-column frame.
# The columns are named explicitly rather than by renaming whatever
# reset_index produces: from pandas 2.0 on, value_counts names the index
# 'county' and the values 'count', so the old rename turned the county-name
# column into 'total' and left the counts as 'count'
counties = (
    df['county']
    .value_counts(dropna=False)
    .rename_axis('county')
    .reset_index(name='total')
)
counties

In [None]:
# second CSV download endpoint on data.ny.gov
filename = 'https://data.ny.gov/api/views/4xc7-bukh/rows.csv?accessType=DOWNLOAD&sorting=true'

cs = pd.read_csv(filename)

# lower-case the headers and turn spaces into hyphens
cs.columns = [header.replace(' ', '-').lower() for header in cs.columns]

cs.info()

In [None]:
# inner join on the columns the two frames share; unmatched rows are dropped
counties = counties.merge(right=cs, how='inner')
counties

In [None]:
import folium

In [None]:
# centre the map on the median latitude/longitude of the merged counties
median_lat = counties['latitude'].median()
median_lon = counties['longitude'].median()
map_center = (median_lat, median_lon)

# build the base map
m = folium.Map(location=map_center, zoom_start=7, tiles='CartoDB Positron')

# one marker per row: green when the total is above 7500, blue otherwise
for index, row in counties.iterrows():
    if row['total'] > 7500:
        color_match = "green"
    else:
        color_match = 'blue'

    tool_tip = "County: {}, Total: {:,}".format(row['county'], row['total'])
    position = (row['latitude'], row['longitude'])
    marker_icon = folium.Icon(color=color_match, icon='info-sign')

    marker = folium.Marker(
        location=position,
        tooltip=tool_tip,
        icon=marker_icon,
        opacity=.4,
    )
    marker.add_to(m)

# render the map as the cell output
m