## Resampling

Summarize Time Series Data

In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
%%time
# Import our data into pandas from CSV
bike_trip_data_path = 'Resources/trip.csv'
# Parse date column, drop Nan, drop duplicates
bike_trips_df = pd.read_csv(bike_trip_data_path, low_memory=False, parse_dates=['stoptime']).dropna().drop_duplicates()

bike_trips_df.head()

Wall time: 15 s


Unnamed: 0,stoptime,bikeid,tripduration,from_station_name,to_station_name,from_station_id,to_station_id,usertype,gender,birthyear
0,2014-10-13 10:48:00,SEA00298,985.935,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1960
1,2014-10-13 10:48:00,SEA00195,926.375,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1970
2,2014-10-13 10:48:00,SEA00486,883.831,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1988
3,2014-10-13 10:48:00,SEA00333,865.937,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1977
4,2014-10-13 10:49:00,SEA00202,923.923,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1971


In [3]:
# Dimensions (rows, columns) of the dataframe after dropna/drop_duplicates
bike_trips_df.shape

(146168, 10)

In [72]:
# Number of records: shape is (rows, columns), so the first value is the row count
bike_trips_df.shape

(146168, 10)

In [4]:
# Column dtypes; 'stoptime' is the column that was passed to parse_dates when loading
bike_trips_df.dtypes

stoptime             datetime64[ns]
bikeid                       object
tripduration                float64
from_station_name            object
to_station_name              object
from_station_id              object
to_station_id                object
usertype                     object
gender                       object
birthyear                    object
dtype: object

In [6]:
# Show full cell contents instead of truncated text (max column width = None)
pd.options.display.max_colwidth = None

In [10]:
# Flip the first two records so each field is a row and each record a column
bike_trips_df.iloc[:2].transpose()

Unnamed: 0,0,1
stoptime,2014-10-13 10:48:00,2014-10-13 10:48:00
bikeid,SEA00298,SEA00195
tripduration,985.935,926.375
from_station_name,2nd Ave & Spring St,2nd Ave & Spring St
to_station_name,Occidental Park / Occidental Ave S & S Washington St,Occidental Park / Occidental Ave S & S Washington St
from_station_id,CBD-06,CBD-06
to_station_id,PS-04,PS-04
usertype,Member,Member
gender,Male,Male
birthyear,1960,1970


## Goal

Simplify our data: reduce the individual trip records to ridership counts per gender over time.

![images/ridership_goal.jpg](images/ridership_goal.jpg)

## Step 1

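Group by gender and stoptime and count the trips in each group, then reset the index to end up with a dataframe of gender, stoptime, and number_of_trips columns (number_of_trips is the tripduration count, renamed).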

![Group by gender and stoptime](images/group_by_gender_stoptime.jpg)

In [16]:
# Trips per (gender, stoptime) pair: count the non-null 'tripduration'
# values within each group, which yields a Series
gender_stoptime = bike_trips_df.groupby(["gender", "stoptime"])["tripduration"].count()

In [22]:
# Move the group keys back into columns to get a dataframe; the count
# column is named 'number_of_trips'
gender_stoptime = gender_stoptime.reset_index(name="number_of_trips")

In [28]:
# Preview 5 randomly chosen rows (no random_state is passed, so the rows vary between runs)
gender_stoptime.sample(5)

Unnamed: 0,gender,stoptime,number_of_trips
51028,Male,2015-03-30 18:45:00,2
64663,Male,2015-06-15 16:05:00,1
74122,Male,2015-07-31 08:11:00,2
43840,Male,2015-02-07 02:25:00,1
4138,Female,2015-02-13 15:21:00,1


In [87]:
# Verify count from original dataframe: the rows returned for this timestamp,
# tallied per gender, should equal number_of_trips in gender_stoptime
bike_trips_df.query("stoptime=='2016-04-12 15:57:00'")

Unnamed: 0,stoptime,bikeid,tripduration,from_station_name,to_station_name,from_station_id,to_station_id,usertype,gender,birthyear
233035,2016-04-12 15:57:00,SEA00489,526.221,Terry Ave & Stewart St,Dexter Ave N & Aloha St,SLU-20,SLU-02,Member,Female,1987
233036,2016-04-12 15:57:00,SEA00385,472.005,Pier 69 / Alaskan Way & Clay St,Occidental Park / Occidental Ave S & S Washington St,WF-01,PS-04,Member,Male,1964
233037,2016-04-12 15:57:00,SEA00171,454.8,Union St & 4th Ave,REI / Yale Ave N & John St,CBD-04,SLU-01,Member,Male,1983


### Step 2 - Create date indexed dataframe

Create a dataframe with stoptime as index and columns with counts for each gender

![Pivot](images/pivot_stoptime_gender.jpg)

In [29]:
# Look again at the long-format frame that is pivoted next (random rows; varies between runs)
gender_stoptime.sample(5)

Unnamed: 0,gender,stoptime,number_of_trips
108122,Male,2016-04-04 17:48:00,1
42980,Male,2015-01-30 16:59:00,1
11658,Female,2015-06-29 17:31:00,1
76808,Male,2015-08-13 10:46:00,1
121892,Male,2016-07-19 17:03:00,1


In [30]:
# Pivot from long to wide format: 'stoptime' becomes the index, each gender
# becomes a column, and the cells hold number_of_trips (NaN where a gender
# has no trips at that stoptime).
gender_stoptime_pivot = gender_stoptime.pivot(index="stoptime", columns="gender", values="number_of_trips")
# Nothing needs to be dropped afterwards: 'stoptime' is the index of the
# pivoted frame, not one of its columns, so
# `del gender_stoptime_pivot['stoptime']` would raise KeyError.
# display a random sample of rows
gender_stoptime_pivot.sample(10)

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-05-14 09:39:00,,1.0,
2015-08-23 01:54:00,,1.0,
2014-11-23 14:48:00,,1.0,
2015-07-13 19:22:00,,2.0,
2015-02-02 10:23:00,,1.0,
2016-06-27 13:28:00,,1.0,
2015-07-22 10:39:00,,1.0,
2014-10-28 21:30:00,1.0,,
2016-03-08 12:02:00,,1.0,
2014-12-06 16:07:00,,1.0,


In [31]:
# Verify index is datetime (the cells below call .resample() on this frame)
gender_stoptime_pivot.index

DatetimeIndex(['2014-10-13 10:47:00', '2014-10-13 10:48:00',
               '2014-10-13 10:49:00', '2014-10-13 11:45:00',
               '2014-10-13 11:47:00', '2014-10-13 11:49:00',
               '2014-10-13 11:51:00', '2014-10-13 11:52:00',
               '2014-10-13 11:55:00', '2014-10-13 11:59:00',
               ...
               '2016-08-31 21:00:00', '2016-08-31 21:15:00',
               '2016-08-31 21:30:00', '2016-08-31 21:32:00',
               '2016-08-31 21:46:00', '2016-08-31 22:04:00',
               '2016-08-31 22:17:00', '2016-08-31 22:25:00',
               '2016-08-31 22:31:00', '2016-08-31 22:39:00'],
              dtype='datetime64[ns]', name='stoptime', length=122845, freq=None)

### Step 3 - Resample data

Change the time series data to a different frequency.

- H : hourly frequency
- D : daily frequency
- W : weekly frequency
- M : month end frequency
- SM : semi-month end frequency (15th and end of month)
- Q : quarter end frequency
- Y : year end frequency

![images/resampling_example.jpg](images/resampling_example.jpg)

In [32]:
# Total trips per gender in each hourly bin
gender_stoptime_pivot.resample(rule='H').sum()

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-10-13 10:00:00,2.0,4.0,0.0
2014-10-13 11:00:00,10.0,20.0,0.0
2014-10-13 12:00:00,16.0,36.0,2.0
2014-10-13 13:00:00,6.0,20.0,1.0
2014-10-13 14:00:00,14.0,18.0,2.0
...,...,...,...
2016-08-31 18:00:00,4.0,23.0,0.0
2016-08-31 19:00:00,1.0,13.0,0.0
2016-08-31 20:00:00,1.0,3.0,1.0
2016-08-31 21:00:00,1.0,4.0,0.0


In [153]:
# Total trips per gender in each daily bin
gender_stoptime_pivot.resample(rule='D').sum()

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-10-13,67.0,207.0,10.0
2014-10-14,65.0,194.0,8.0
2014-10-15,35.0,174.0,4.0
2014-10-16,63.0,199.0,4.0
2014-10-17,47.0,143.0,7.0
...,...,...,...
2016-08-27,10.0,68.0,4.0
2016-08-28,28.0,62.0,6.0
2016-08-29,50.0,189.0,6.0
2016-08-30,49.0,214.0,4.0


In [154]:
# Total trips per gender in each weekly bin
gender_stoptime_pivot.resample(rule='W').sum()

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-10-19,366.0,1228.0,50.0
2014-10-26,280.0,1114.0,26.0
2014-11-02,296.0,1196.0,33.0
2014-11-09,323.0,1207.0,17.0
2014-11-16,316.0,1004.0,27.0
...,...,...,...
2016-08-07,267.0,1098.0,42.0
2016-08-14,270.0,1197.0,31.0
2016-08-21,296.0,1216.0,38.0
2016-08-28,270.0,1241.0,37.0


In [156]:
# Total trips per gender in each monthly bin
gender_stoptime_pivot.resample(rule='M').sum()

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-10-31,869.0,3275.0,103.0
2014-11-30,1058.0,4119.0,77.0
2014-12-31,749.0,3599.0,30.0
2015-01-31,1172.0,4581.0,61.0
2015-02-28,992.0,4054.0,86.0
2015-03-31,1554.0,4923.0,108.0
2015-04-30,1615.0,5805.0,117.0
2015-05-31,2174.0,6381.0,127.0
2015-06-30,2062.0,6966.0,159.0
2015-07-31,2071.0,7751.0,215.0


In [157]:
# Total trips per gender in each yearly bin
gender_stoptime_pivot.resample(rule='Y').sum()

gender,Female,Male,Other
stoptime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-31,2676.0,10993.0,210.0
2015-12-31,18444.0,68277.0,1541.0
2016-12-31,9206.0,33670.0,1150.0
