# Unsupervised Analysis of Days of Week

Treating the crossings on each day as features to learn about the relationships between various days.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


## Downloading Data

We'll start by downloading the data (available on [seattle.gov](http://www.seattle.gov/transportation/bikecounter_fremont.htm)).

In [None]:
from pathlib import Path
from urllib import request

FREMONT_URL = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'
FREMONT_CSV = 'Fremont.csv'  # local copy that the read_csv cell loads

# Re-running the notebook should not re-download the whole file every time:
# only go to the network when no local copy exists. Delete Fremont.csv to
# force a fresh download.
if not Path(FREMONT_CSV).exists():
    request.urlretrieve(FREMONT_URL, FREMONT_CSV)

In [None]:
# IPython pager magic: page through the raw CSV to check its header and
# layout before parsing it
%more Fremont.csv

In [None]:
import pandas as pd

# Load the downloaded CSV into a DataFrame; column dtypes are whatever
# read_csv infers at this point (Date is converted in a later cell).
df = pd.read_csv('Fremont.csv')
df.head()  # preview the first rows

In [None]:
# Inspect the inferred column dtypes (Date has not been converted yet)
df.dtypes

In [None]:
# Convert the Date column to pandas datetimes so it can serve as a time
# index. No explicit format is passed, so pandas infers it.
# NOTE(review): passing format=... would likely speed this up — confirm the
# exact timestamp layout in the CSV first.
df['Date'] = pd.to_datetime(df['Date'])
df.head()

In [None]:
# Re-check dtypes: Date should now be a datetime column
df.dtypes

In [None]:
# Use the timestamps as the row index so the time-based operations further
# down (resample, index.time, index.date) can work off df.index.
df = df.set_index('Date')
df.head()

In [None]:
# Count missing values per column (vectorised; no Python-level loop over rows)
df.isnull().sum()

In [None]:
# Clean the data: drop every row that has a missing value in any column
df = df.dropna()

In [None]:
# Verify the clean-up: every per-column missing-value count should now be 0
df.isnull().sum()

In [None]:
# Show the column labels as they came from the CSV header
df.columns

In [None]:
# Quick look at the raw counts for every column over the full record
df.plot()

In [None]:
# Aggregate to weekly totals so the long-term pattern is readable
df.resample('W').sum().plot()

In [None]:
# Replace the CSV's column labels with short names.
# NOTE(review): this assigns by position, so it assumes exactly two count
# columns in West, East order — verify against the df.columns output.
df.columns=['West', 'East']

In [None]:
# Weekly totals again, now with the short West/East legend labels.
# 'W' is the documented weekly offset alias and matches the earlier weekly
# plot; the lowercase spelling is not a documented alias.
df.resample('W').sum().plot()

In [None]:
# Look for an annual trend: total the rides per day, then take a trailing
# 365-day sum so each point is the number of rides in the previous 365 days.
(
    df.resample('D').sum()
    .rolling(365).sum()
    .plot()
)

In [None]:
# The default y-axis does not start at 0, which exaggerates the trend;
# redraw the same trailing-365-day total with the axis anchored at zero.
ax = (
    df.resample('D').sum()
    .rolling(365).sum()
    .plot()
)
ax.set_ylim(0, None)

In [None]:
# DatetimeIndex.time returns a NumPy array of datetime.time objects (the
# time-of-day part of each timestamp). Grouping on it and averaging gives
# the mean number of rides at each time of day.
(
    df.groupby(df.index.time)
    .mean()
    .plot()
)

In [None]:
# Build a time-of-day x date table of total crossings so each column is the
# profile of one day. pivot_table combines any duplicate (time, date) pairs
# with its default aggregation (mean).
df['Total'] = df['West'].add(df['East'])
pivoted = pd.pivot_table(
    df,
    values='Total',
    index=df.index.time,
    columns=df.index.date,
)
pivoted.head()

In [None]:
# Rows are times of day, columns are calendar dates
pivoted.shape

In [None]:
# Keep only the dates (columns) that have a value for every time of day
pivoted = pivoted.dropna(axis='columns')
pivoted.shape

In [None]:
# One line per date, all overlaid; the legend is suppressed because there
# is one entry per date
pivoted.plot(legend=False)

In [None]:
# With one opaque line per date the plot saturates; a very low alpha lets
# the dense, common daily shapes show through the overlap.
pivoted.plot(legend=False, alpha=0.01)

## Principal Component Analysis

In [None]:
# Build the feature matrix: one row per date (observation), one column per
# time of day (measurement).
X = pivoted.values.T
X.shape

In [None]:
# Peek at the raw feature matrix (NumPy abbreviates the repr of large arrays)
X

In [None]:
from sklearn.decomposition import PCA

# Project each day onto its first two principal components, using the
# exact ('full') SVD solver.
pca = PCA(n_components=2, svd_solver='full')
X2 = pca.fit_transform(X)

In [None]:
# Each row is one day expressed in the two principal-component coordinates
X2

In [None]:
# Expect (number of days, 2)
X2.shape

In [None]:
# Scatter the days in the 2-D PCA space to look for visible groupings
plt.scatter(X2[:, 0], X2[:, 1])

In [None]:
# use cluster algorithm Gaussian mixture model
from sklearn.mixture import GaussianMixture

# Cluster the days into two groups with a Gaussian mixture model. The fit
# starts from a random initialisation, so fix the seed to make the notebook
# reproducible under Restart & Run All.
gmm = GaussianMixture(2, random_state=0)
labels = gmm.fit(X).predict(X)

# Component numbering is arbitrary, but the cells below treat label 1 as the
# weekday cluster. Pin that convention by making 1 the larger cluster
# (assumes weekdays outnumber weekend/holiday days — they are 5 of every 7).
# After a swap, `labels` is the mirror image of gmm.predict(X).
if (labels == 1).sum() < (labels == 0).sum():
    labels = 1 - labels
labels


In [None]:
# Colour each day by its GMM cluster label using the default colormap
# (pass cmap='rainbow' to plt.scatter for a higher-contrast alternative).
plt.scatter(X2[:, 0], X2[:, 1], c=labels)
plt.colorbar()

In [None]:
# Cluster label (0 or 1) assigned to each day, in the same order as the rows of X
labels

In [None]:
# Daily profiles of the days in cluster 1, read here as the weekday pattern.
# NOTE(review): GMM component numbering is arbitrary — confirm against the
# cluster scatter plot that label 1 really is the weekday group.
pivoted.loc[:, labels == 1].plot(legend=False, alpha=0.01)

In [None]:
# Daily profiles of the days in cluster 0, read here as the weekend/holiday
# pattern.
# NOTE(review): GMM component numbering is arbitrary — confirm against the
# cluster scatter plot that label 0 really is the weekend/holiday group.
pivoted.loc[:, labels == 0].plot(legend=False, alpha=0.1)

## Comparing with Day of Week

In [None]:
# Convert the date column labels into a DatetimeIndex so the date accessors
# (such as dayofweek) become available
pd.DatetimeIndex(pivoted.columns)

In [None]:
# DatetimeIndex.dayofweek gives the day of the week as an integer
# (Monday=0 ... Sunday=6), one entry per date column.
dayofweek = pd.DatetimeIndex(pivoted.columns).dayofweek
dayofweek

In [None]:
# Colour each day in the PCA scatter by its day of week to check whether the
# two clusters line up with the weekday/weekend split
plt.scatter(X2[:, 0], X2[:, 1], c=dayofweek)
plt.colorbar() 

In [None]:
# List the dates that fall on Monday-Friday (dayofweek < 5) yet were placed
# in cluster 0 — i.e. weekdays whose traffic did not look like a weekday.
dates = pd.DatetimeIndex(pivoted.columns)
in_cluster_0 = labels == 0
is_mon_to_fri = dayofweek < 5
dates[in_cluster_0 & is_mon_to_fri]

What's up with Feb 6, 2017?

[Snow Storm](https://www.seattletimes.com/seattle-news/weather/weather-service-predicts-3-to-6-inches-of-snow-in-seattle-area/)