In [2]:
# Import Dependencies
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [3]:
# Import data file
divvy_df = pd.read_csv('Divvy_Trips_2019_Q4.csv')
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,25223640,2019-10-01 00:01:39,2019-10-01 00:17:20,2215,940.0,20,Sheffield Ave & Kingsbury St,309,Leavitt St & Armitage Ave,Subscriber,Male,1987.0
1,25223641,2019-10-01 00:02:16,2019-10-01 00:06:34,6328,258.0,19,Throop (Loomis) St & Taylor St,241,Morgan St & Polk St,Subscriber,Male,1998.0
2,25223642,2019-10-01 00:04:32,2019-10-01 00:18:43,3003,850.0,84,Milwaukee Ave & Grand Ave,199,Wabash Ave & Grand Ave,Subscriber,Female,1991.0
3,25223643,2019-10-01 00:04:32,2019-10-01 00:43:43,3275,2350.0,313,Lakeview Ave & Fullerton Pkwy,290,Kedzie Ave & Palmer Ct,Subscriber,Male,1990.0
4,25223644,2019-10-01 00:04:34,2019-10-01 00:35:42,5294,1867.0,210,Ashland Ave & Division St,382,Western Ave & Congress Pkwy,Subscriber,Male,1987.0


In [4]:
divvy_df.shape

(704054, 12)

In [5]:
# Remove null rows (if needed)
divvy_df.dropna()
divvy_df.shape

(704054, 12)

In [104]:
# Convert birthyear to age
divvy_df['age'] = (2021 - divvy_df['birthyear'])
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age
0,25223640,2019-10-01 00:01:39,2019-10-01 00:17:20,2215,940.0,20,Sheffield Ave & Kingsbury St,309,Leavitt St & Armitage Ave,Subscriber,Male,1987.0,34.0
1,25223641,2019-10-01 00:02:16,2019-10-01 00:06:34,6328,258.0,19,Throop (Loomis) St & Taylor St,241,Morgan St & Polk St,Subscriber,Male,1998.0,23.0
2,25223642,2019-10-01 00:04:32,2019-10-01 00:18:43,3003,850.0,84,Milwaukee Ave & Grand Ave,199,Wabash Ave & Grand Ave,Subscriber,Female,1991.0,30.0
3,25223643,2019-10-01 00:04:32,2019-10-01 00:43:43,3275,2350.0,313,Lakeview Ave & Fullerton Pkwy,290,Kedzie Ave & Palmer Ct,Subscriber,Male,1990.0,31.0
4,25223644,2019-10-01 00:04:34,2019-10-01 00:35:42,5294,1867.0,210,Ashland Ave & Division St,382,Western Ave & Congress Pkwy,Subscriber,Male,1987.0,34.0


In [105]:
# Create bins for age
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]

# Create the names for the five bins
group_names = ["11-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89"]
divvy_df["Age Group"] = pd.cut(divvy_df["age"], bins, labels=group_names, include_lowest=True)
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group
0,25223640,2019-10-01 00:01:39,2019-10-01 00:17:20,2215,940.0,20,Sheffield Ave & Kingsbury St,309,Leavitt St & Armitage Ave,Subscriber,Male,1987.0,34.0,30-39
1,25223641,2019-10-01 00:02:16,2019-10-01 00:06:34,6328,258.0,19,Throop (Loomis) St & Taylor St,241,Morgan St & Polk St,Subscriber,Male,1998.0,23.0,20-29
2,25223642,2019-10-01 00:04:32,2019-10-01 00:18:43,3003,850.0,84,Milwaukee Ave & Grand Ave,199,Wabash Ave & Grand Ave,Subscriber,Female,1991.0,30.0,20-29
3,25223643,2019-10-01 00:04:32,2019-10-01 00:43:43,3275,2350.0,313,Lakeview Ave & Fullerton Pkwy,290,Kedzie Ave & Palmer Ct,Subscriber,Male,1990.0,31.0,30-39
4,25223644,2019-10-01 00:04:34,2019-10-01 00:35:42,5294,1867.0,210,Ashland Ave & Division St,382,Western Ave & Congress Pkwy,Subscriber,Male,1987.0,34.0,30-39


In [94]:
#Review bins
divvy_df['Age Group'].value_counts()

30-39    243138
20-29    207020
40-49     93019
50-59     65825
60-69     26771
11-19      3945
70-79      2279
80-89        82
Name: Age Group, dtype: int64

In [106]:
# Plot bins as bar graph

In [96]:
# Convert date columns to datetime format
divvy_df['start_time'] = pd.to_datetime(divvy_df['start_time'])
divvy_df['end_time'] = pd.to_datetime(divvy_df['end_time'])

In [97]:
# Split date from time and create columns
divvy_df['start_date'] = divvy_df['start_time'].dt.date
divvy_df['start_time'] = divvy_df['start_time'].dt.time
divvy_df['end_date'] = divvy_df['end_time'].dt.date
divvy_df['end_time'] = divvy_df['end_time'].dt.time
divvy_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group,start_date,end_date
0,25223640,00:01:39,00:17:20,2215,940.0,20,Sheffield Ave & Kingsbury St,309,Leavitt St & Armitage Ave,Subscriber,Male,1987.0,34.0,30-39,2019-10-01,2019-10-01
1,25223641,00:02:16,00:06:34,6328,258.0,19,Throop (Loomis) St & Taylor St,241,Morgan St & Polk St,Subscriber,Male,1998.0,23.0,20-29,2019-10-01,2019-10-01
2,25223642,00:04:32,00:18:43,3003,850.0,84,Milwaukee Ave & Grand Ave,199,Wabash Ave & Grand Ave,Subscriber,Female,1991.0,30.0,20-29,2019-10-01,2019-10-01
3,25223643,00:04:32,00:43:43,3275,2350.0,313,Lakeview Ave & Fullerton Pkwy,290,Kedzie Ave & Palmer Ct,Subscriber,Male,1990.0,31.0,30-39,2019-10-01,2019-10-01
4,25223644,00:04:34,00:35:42,5294,1867.0,210,Ashland Ave & Division St,382,Western Ave & Congress Pkwy,Subscriber,Male,1987.0,34.0,30-39,2019-10-01,2019-10-01


In [99]:
# Organize columns in dataframe
divvy_df = divvy_df[["trip_id", "bikeid", "start_date", "start_time", "end_date", "end_time", "tripduration", "from_station_id", "from_station_name", "to_station_id", "to_station_name", "usertype", "gender", "birthyear", "age", "Age Group"]]
divvy_df.head()

Unnamed: 0,trip_id,bikeid,start_date,start_time,end_date,end_time,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear,age,Age Group
0,25223640,2215,2019-10-01,00:01:39,2019-10-01,00:17:20,940.0,20,Sheffield Ave & Kingsbury St,309,Leavitt St & Armitage Ave,Subscriber,Male,1987.0,34.0,30-39
1,25223641,6328,2019-10-01,00:02:16,2019-10-01,00:06:34,258.0,19,Throop (Loomis) St & Taylor St,241,Morgan St & Polk St,Subscriber,Male,1998.0,23.0,20-29
2,25223642,3003,2019-10-01,00:04:32,2019-10-01,00:18:43,850.0,84,Milwaukee Ave & Grand Ave,199,Wabash Ave & Grand Ave,Subscriber,Female,1991.0,30.0,20-29
3,25223643,3275,2019-10-01,00:04:32,2019-10-01,00:43:43,2350.0,313,Lakeview Ave & Fullerton Pkwy,290,Kedzie Ave & Palmer Ct,Subscriber,Male,1990.0,31.0,30-39
4,25223644,5294,2019-10-01,00:04:34,2019-10-01,00:35:42,1867.0,210,Ashland Ave & Division St,382,Western Ave & Congress Pkwy,Subscriber,Male,1987.0,34.0,30-39


In [50]:
# Number of unique bikes in circulation
divvy_df['bikeid'].nunique()

5670

In [9]:
# Bike IDs with most activity

divvy_df['bikeid'].value_counts()

5886    283
4832    273
1889    270
4848    267
1100    266
       ... 
805       1
3909      1
2985      1
6710      1
786       1
Name: bikeid, Length: 5670, dtype: int64

In [10]:
# Most popular starting stations
divvy_df['from_station_name'].value_counts()


Canal St & Adams St             12937
Clinton St & Madison St         10580
Clinton St & Washington Blvd     9834
Columbus Dr & Randolph St        7723
Kingsbury St & Kinzie St         7326
                                ...  
Wabash Ave & 83rd St                2
Elizabeth St & 59th St              1
Ashland Ave & 66th St               1
Seeley Ave & Garfield Blvd          1
Carpenter St & 63rd St              1
Name: from_station_name, Length: 610, dtype: int64

In [11]:
# Most popular ending stations

divvy_df['to_station_name'].value_counts()

Canal St & Adams St               12812
Clinton St & Washington Blvd      11051
Clinton St & Madison St           10360
Streeter Dr & Grand Ave            8820
Kingsbury St & Kinzie St           7172
                                  ...  
Wabash Ave & 83rd St                  3
South Chicago Ave & 83rd St           2
Seeley Ave & Garfield Blvd            2
Elizabeth St & 59th St                2
South Chicago Ave & Elliot Ave        1
Name: to_station_name, Length: 608, dtype: int64

In [62]:
# Most popular trips *still in-progress*
popular_stations = divvy_df.groupby(['from_station_name','to_station_name'])
popular_stations

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8107291310>

In [30]:
# Groupby usertype for ridership breakdown

usertype_df = divvy_df.groupby('usertype')
usertype_df.count()

Unnamed: 0_level_0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,gender,birthyear
usertype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Customer,106194,106194,106194,106194,106194,106194,106194,106194,106194,44658,45265
Subscriber,597860,597860,597860,597860,597860,597860,597860,597860,597860,592805,597108


In [6]:
# Create calculation for average trip duration
divvy_df.index()


TypeError: 'RangeIndex' object is not callable

In [None]:
# Summary statistics by each user type:
# - Gender %
# - Age groups and bins for histogram
# - Average trip duration 