# Summary statistics and kernel density estimation for weather data pulled using the Dark Sky API

This notebook performs light cleaning and exploratory data analysis on Washington, DC weather data covering 2010 through 2017.


* Variable definitions - https://darksky.net/dev/docs#response-format



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
import seaborn as sns

# Load the Dark Sky export; the first column (date) is parsed as datetime.
# Forward slashes keep the path valid on every OS (the backslash form only resolves
# on Windows), and pandas expands the leading '~' to the user's home directory.
# infer_datetime_format is omitted: it is deprecated and has no effect in pandas >= 2.0.
df = pd.read_csv('~/git/Bikeshare-DC/data/Dark_Sky_2010_2017.csv', parse_dates=[0])
df.shape

(2922, 27)

In [2]:
# creating weather dummies
# The indicator columns get their final names before being attached, and any copies
# left by an earlier run of this cell are dropped first, so re-running the cell never
# appends duplicate rain/snow dummy columns.
precip_dummies = pd.get_dummies(df['precipType']).rename(
    columns={'rain': 'rain_dummy', 'snow': 'snow_dummy'})
df = pd.concat(
    [df.drop(columns=precip_dummies.columns, errors='ignore'), precip_dummies],
    axis=1)

# converting Unix time to human-readable time
# NOTE(review): unit='s' decodes epoch seconds into timezone-naive UTC timestamps,
# not DC local time - confirm that is what downstream analysis expects.
timevars = ['apparentTemperatureHighTime','apparentTemperatureLowTime','precipIntensityMaxTime',
           'sunriseTime','sunsetTime','temperatureHighTime','temperatureLowTime', 'time']
for var in timevars:
    df[var] = pd.to_datetime(df[var], unit='s')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 29 columns):
date                           2922 non-null datetime64[ns]
apparentTemperatureHigh        2922 non-null float64
apparentTemperatureHighTime    2922 non-null datetime64[ns]
apparentTemperatureLow         2922 non-null float64
apparentTemperatureLowTime     2922 non-null datetime64[ns]
cloudCover                     2916 non-null float64
dewPoint                       2922 non-null float64
humidity                       2922 non-null float64
moonPhase                      2922 non-null float64
precipAccumulation             124 non-null float64
precipIntensity                2922 non-null float64
precipIntensityMax             2922 non-null float64
precipIntensityMaxTime         1590 non-null datetime64[ns]
precipProbability              2922 non-null float64
precipType                     1453 non-null object
pressure                       2922 non-null float64
sunriseTime        

In [None]:
# summary statistics
# A bare last expression renders the table once through the notebook's rich display;
# the additional print() only repeated the same table as plain text.
df.describe()

In [None]:
# KDE with Scikit-Learn
def kde_sklearn(x, x_grid, bandwidth=0.2):
    """Evaluate a kernel density estimate of ``x`` at the points in ``x_grid``.

    Parameters
    ----------
    x : array-like, 1-D
        Observed samples (NumPy array, list, or pandas Series). NaN entries are
        dropped before fitting.
    x_grid : array-like, 1-D
        Points at which the fitted density is evaluated.
    bandwidth : float, default 0.2
        Bandwidth passed to ``KernelDensity``.

    Returns
    -------
    numpy.ndarray
        Density value for each point of ``x_grid``.

    Raises
    ------
    ValueError
        If ``x`` contains no non-NaN samples.
    """
    # Convert to plain NumPy first: indexing a pandas Series with [:, np.newaxis]
    # is multi-dimensional indexing, which recent pandas versions no longer allow.
    samples = np.asarray(x, dtype=float).ravel()
    samples = samples[~np.isnan(samples)]
    if samples.size == 0:
        raise ValueError('kde_sklearn needs at least one non-NaN sample in x')
    grid = np.asarray(x_grid, dtype=float).ravel()

    kde_skl = KernelDensity(bandwidth=bandwidth)
    # The estimator expects 2-D input of shape (n_samples, n_features).
    kde_skl.fit(samples[:, np.newaxis])
    # score_samples returns log-likelihood of samples
    log_pdf = kde_skl.score_samples(grid[:, np.newaxis])
    return np.exp(log_pdf)

def density(column, bandwidth=0.2, ax=None, data=None):
    """Plot a kernel density estimate for one column and show the figure.

    Parameters
    ----------
    column : str
        Name of the column to estimate the density of.
    bandwidth : float, default 0.2
        Bandwidth forwarded to ``kde_sklearn``.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current Axes (``plt.gca()``), i.e. the
        one most recently created by ``plt.subplots()`` in the calling cell.
    data : pandas.DataFrame, optional
        Frame holding ``column``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    frame = df if data is None else data
    target_ax = plt.gca() if ax is None else ax

    # Drop missing values so columns with gaps (e.g. cloudCover) can still be fit.
    values = frame.loc[:, column].dropna()
    # np.linspace defaults to 50 evenly spaced evaluation points over the observed range.
    x_grid = np.linspace(values.min(), values.max())
    # Pass a plain NumPy array so kde_sklearn never has to index a pandas Series.
    pdf = kde_sklearn(np.asarray(values, dtype=float), x_grid, bandwidth=bandwidth)

    target_ax.plot(x_grid, pdf, color='blue', alpha=0.5, lw=1)
    target_ax.set_title('KDE for {}'.format(column))
    target_ax.set_xlabel(column)
    target_ax.set_ylabel('density')
    plt.show()

In [None]:
%matplotlib inline
# The density function can produce a KDE for any single variable:
# a nonparametric estimate of that variable's pdf.

# Create the figure/Axes that density() will draw on, then plot the
# daylightHours column with bandwidth 1.
fig, ax = plt.subplots()
density('daylightHours', bandwidth=1)

In [None]:
%matplotlib inline
# KDE of the apparentTemperatureHigh column on a fresh figure, bandwidth 4.
fig, ax = plt.subplots()
density('apparentTemperatureHigh', bandwidth=4)

In [None]:
%matplotlib inline
# KDE of the apparentTemperatureLow column on a fresh figure, bandwidth 4.
fig, ax = plt.subplots()
density('apparentTemperatureLow', bandwidth=4)

In [None]:
%matplotlib inline
# KDE of the visibility column on a fresh figure, bandwidth 1.
fig, ax = plt.subplots()
density('visibility', bandwidth=1)
# visibility values are in miles

In [None]:
%matplotlib inline
# KDE of the windSpeed column on a fresh figure, bandwidth 1.
fig, ax = plt.subplots()
density('windSpeed', bandwidth=1)

In [None]:
%matplotlib inline
# Pairwise relationship grid over the whole frame.
# NOTE(review): no `vars=` subset is passed, so the grid has one row and one column
# per plotted variable and may be slow to render - consider restricting it.
sns.pairplot(df)