In [2]:
# Import Dependencies
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [3]:
# Import data file
divvy_df = pd.read_csv('Divvy_Trips_2019_Q3.csv')
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,23479388,2019-07-01 00:00:27,2019-07-01 00:20:41,3591,1214.0,117,Wilton Ave & Belmont Ave,497,Kimball Ave & Belmont Ave,Subscriber,Male,1992.0
1,23479389,2019-07-01 00:01:16,2019-07-01 00:18:44,5353,1048.0,381,Western Ave & Monroe St,203,Western Ave & 21st St,Customer,,
2,23479390,2019-07-01 00:01:48,2019-07-01 00:27:42,6180,1554.0,313,Lakeview Ave & Fullerton Pkwy,144,Larrabee St & Webster Ave,Customer,,
3,23479391,2019-07-01 00:02:07,2019-07-01 00:27:10,5540,1503.0,313,Lakeview Ave & Fullerton Pkwy,144,Larrabee St & Webster Ave,Customer,,
4,23479392,2019-07-01 00:02:13,2019-07-01 00:22:26,6014,1213.0,168,Michigan Ave & 14th St,62,McCormick Place,Customer,,


In [5]:
divvy_df.shape

(1640718, 12)

In [8]:
# Remove null rows (if needed)
divvy_df.dropna(inplace=True)
divvy_df.shape

(1353368, 13)

In [10]:
# Convert birthyear to age
divvy_df['age'] = (2021 - divvy_df['birthyear'])
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age
0,23479388,2019-07-01 00:00:27,2019-07-01 00:20:41,3591,1214.0,117,Wilton Ave & Belmont Ave,497,Kimball Ave & Belmont Ave,Subscriber,Male,1992.0,29.0
5,23479393,2019-07-01 00:02:21,2019-07-01 00:07:31,4941,310.0,300,Broadway & Barry Ave,232,Pine Grove Ave & Waveland Ave,Subscriber,Male,1990.0,31.0
18,23479406,2019-07-01 00:06:51,2019-07-01 00:26:22,2758,1171.0,624,Dearborn St & Van Buren St,237,MLK Jr Dr & 29th St,Subscriber,Male,1995.0,26.0
20,23479408,2019-07-01 00:08:24,2019-07-01 00:20:07,2447,703.0,239,Western Ave & Leland Ave,455,Maplewood Ave & Peterson Ave,Subscriber,Male,1970.0,51.0
21,23479409,2019-07-01 00:08:40,2019-07-01 00:27:50,6336,1150.0,257,Lincoln Ave & Waveland Ave,295,Broadway & Argyle St,Subscriber,Male,1993.0,28.0


In [11]:
# Create bins for age
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]

# Create the names for the five bins
group_names = ["11-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89"]
divvy_df["Age Group"] = pd.cut(divvy_df["age"], bins, labels=group_names, include_lowest=True)
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group
0,23479388,2019-07-01 00:00:27,2019-07-01 00:20:41,3591,1214.0,117,Wilton Ave & Belmont Ave,497,Kimball Ave & Belmont Ave,Subscriber,Male,1992.0,29.0,20-29
5,23479393,2019-07-01 00:02:21,2019-07-01 00:07:31,4941,310.0,300,Broadway & Barry Ave,232,Pine Grove Ave & Waveland Ave,Subscriber,Male,1990.0,31.0,30-39
18,23479406,2019-07-01 00:06:51,2019-07-01 00:26:22,2758,1171.0,624,Dearborn St & Van Buren St,237,MLK Jr Dr & 29th St,Subscriber,Male,1995.0,26.0,20-29
20,23479408,2019-07-01 00:08:24,2019-07-01 00:20:07,2447,703.0,239,Western Ave & Leland Ave,455,Maplewood Ave & Peterson Ave,Subscriber,Male,1970.0,51.0,50-59
21,23479409,2019-07-01 00:08:40,2019-07-01 00:27:50,6336,1150.0,257,Lincoln Ave & Waveland Ave,295,Broadway & Argyle St,Subscriber,Male,1993.0,28.0,20-29


In [12]:
#Review bins
divvy_df['Age Group'].value_counts()

30-39    513181
20-29    494762
40-49    170650
50-59    118804
60-69     45411
11-19      5431
70-79      4601
80-89       178
Name: Age Group, dtype: int64

In [13]:
# Plot bins as bar graph

In [14]:
# Convert date columns to datetime format
divvy_df['start_time'] = pd.to_datetime(divvy_df['start_time'])
divvy_df['end_time'] = pd.to_datetime(divvy_df['end_time'])

In [15]:
# Split date from time and create columns
divvy_df['start_date'] = divvy_df['start_time'].dt.date
divvy_df['start_time'] = divvy_df['start_time'].dt.time
divvy_df['end_date'] = divvy_df['end_time'].dt.date
divvy_df['end_time'] = divvy_df['end_time'].dt.time
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group,start_date,end_date
0,23479388,00:00:27,00:20:41,3591,1214.0,117,Wilton Ave & Belmont Ave,497,Kimball Ave & Belmont Ave,Subscriber,Male,1992.0,29.0,20-29,2019-07-01,2019-07-01
5,23479393,00:02:21,00:07:31,4941,310.0,300,Broadway & Barry Ave,232,Pine Grove Ave & Waveland Ave,Subscriber,Male,1990.0,31.0,30-39,2019-07-01,2019-07-01
18,23479406,00:06:51,00:26:22,2758,1171.0,624,Dearborn St & Van Buren St,237,MLK Jr Dr & 29th St,Subscriber,Male,1995.0,26.0,20-29,2019-07-01,2019-07-01
20,23479408,00:08:24,00:20:07,2447,703.0,239,Western Ave & Leland Ave,455,Maplewood Ave & Peterson Ave,Subscriber,Male,1970.0,51.0,50-59,2019-07-01,2019-07-01
21,23479409,00:08:40,00:27:50,6336,1150.0,257,Lincoln Ave & Waveland Ave,295,Broadway & Argyle St,Subscriber,Male,1993.0,28.0,20-29,2019-07-01,2019-07-01


In [16]:
# Organize columns in dataframe
divvy_df = divvy_df[["trip_id", "bikeid", "start_date", "start_time", "end_date", "end_time", "tripduration", "from_station_id", "from_station_name", "to_station_id", "to_station_name", "usertype", "gender", "birthyear", "age", "Age Group"]]
divvy_df.head()

Unnamed: 0,trip_id,bikeid,start_date,start_time,end_date,end_time,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group
0,23479388,3591,2019-07-01,00:00:27,2019-07-01,00:20:41,1214.0,117,Wilton Ave & Belmont Ave,497,Kimball Ave & Belmont Ave,Subscriber,Male,1992.0,29.0,20-29
5,23479393,4941,2019-07-01,00:02:21,2019-07-01,00:07:31,310.0,300,Broadway & Barry Ave,232,Pine Grove Ave & Waveland Ave,Subscriber,Male,1990.0,31.0,30-39
18,23479406,2758,2019-07-01,00:06:51,2019-07-01,00:26:22,1171.0,624,Dearborn St & Van Buren St,237,MLK Jr Dr & 29th St,Subscriber,Male,1995.0,26.0,20-29
20,23479408,2447,2019-07-01,00:08:24,2019-07-01,00:20:07,703.0,239,Western Ave & Leland Ave,455,Maplewood Ave & Peterson Ave,Subscriber,Male,1970.0,51.0,50-59
21,23479409,6336,2019-07-01,00:08:40,2019-07-01,00:27:50,1150.0,257,Lincoln Ave & Waveland Ave,295,Broadway & Argyle St,Subscriber,Male,1993.0,28.0,20-29


In [17]:
# Number of unique bikes in circulation
divvy_df['bikeid'].nunique()

5783

In [18]:
# Bike IDs with most activity

divvy_df['bikeid'].value_counts()

5344    462
3770    457
717     453
5634    445
3333    443
       ... 
5098      1
6254      1
1806      1
2292      1
3310      1
Name: bikeid, Length: 5783, dtype: int64

In [19]:
# Most popular starting stations
divvy_df['from_station_name'].value_counts()


Canal St & Adams St               19106
Streeter Dr & Grand Ave           18363
Clinton St & Madison St           17965
Clinton St & Washington Blvd      16068
Lake Shore Dr & North Blvd        13944
                                  ...  
South Chicago Ave & Elliot Ave        4
Racine Ave & 65th St                  3
Rhodes Ave & 71st St                  3
Racine Ave & 61st St                  1
Carpenter St & 63rd St                1
Name: from_station_name, Length: 612, dtype: int64

In [20]:
# Most popular ending stations

divvy_df['to_station_name'].value_counts()

Streeter Dr & Grand Ave         20143
Lake Shore Dr & North Blvd      18248
Clinton St & Washington Blvd    17347
Canal St & Adams St             16149
Clinton St & Madison St         15792
                                ...  
Rhodes Ave & 71st St                3
Michigan Ave & 71st St              3
Damen Ave & 59th St                 3
Racine Ave & 61st St                1
Vincennes Ave & 75th St             1
Name: to_station_name, Length: 612, dtype: int64

In [21]:
# Most popular trips *still in-progress*
popular_stations = divvy_df.groupby(['from_station_name','to_station_name'])
popular_stations

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd6caea5d00>

In [22]:
# Groupby usertype for ridership breakdown

usertype_df = divvy_df.groupby('usertype')
usertype_df.count()

Unnamed: 0_level_0,trip_id,bikeid,start_date,start_time,end_date,end_time,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,gender,birthyear,age,Age Group
usertype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Customer,214116,214116,214116,214116,214116,214116,214116,214116,214116,214116,214116,214116,214116,214116,214096
Subscriber,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1139252,1138922


In [None]:
# Create calculation for average trip duration



In [None]:
# Summary statistics by each user type:
# - Gender %
# - Age groups and bins for histogram
# - Average trip duration 