In [2]:
from datetime import timedelta
from pathlib import Path

import pandas as pd


#### Reading data 
First, I read the data from 12 CSV files, one per month (July 2022 to June 2023). <br>
I convert the two fields `started_at` and `ended_at` to their proper type, datetime, <br>
and then combine all the files into a single dataframe called `df`.

In [3]:
# Monthly trip exports, July 2022 through June 2023.
# NOTE(review): the September file is named 'publictripdata' rather than
# 'tripdata' -- presumably this matches the file as published; verify.
csv_files = ['202207-divvy-tripdata.csv', '202208-divvy-tripdata.csv','202209-divvy-publictripdata.csv', 
             '202210-divvy-tripdata.csv', '202211-divvy-tripdata.csv', '202212-divvy-tripdata.csv', 
             '202301-divvy-tripdata.csv', '202302-divvy-tripdata.csv', '202303-divvy-tripdata.csv', 
             '202304-divvy-tripdata.csv', '202305-divvy-tripdata.csv', '202306-divvy-tripdata.csv']

# Columns that hold timestamps and must be parsed before any time arithmetic.
datetime_columns = ['started_at', 'ended_at']

# Read each month into its own frame. A distinct loop name keeps `df`
# reserved for the combined result instead of shadowing it per iteration.
dfs = []
for filename in csv_files:
    monthly_df = pd.read_csv(filename, index_col=None, header=0)
    monthly_df[datetime_columns] = monthly_df[datetime_columns].apply(pd.to_datetime)
    dfs.append(monthly_df)

# ignore_index=True gives the combined frame a fresh, unique RangeIndex.
df = pd.concat(dfs, axis=0, ignore_index=True)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5779379 entries, 0 to 5779378
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
dtypes: datetime64[ns](2), float64(4), object(7)
memory usage: 573.2+ MB
None


#### Processing
To process the data, I delete rows that contain NaN values, duplicated rows, and rows whose start time is equal to or later than their end time. I then add the ride length and the weekday on which each ride started. <br>


In [4]:
# Remove incomplete and duplicated records first.
df = df.dropna().drop_duplicates()

# A ride is invalid when it does not end strictly after it starts.
invalid_rides = df['started_at'] >= df['ended_at']

# Keep the valid rides and derive the two analysis columns in one step:
# the ride duration and the name of the weekday on which the ride began.
df = df.loc[~invalid_rides].assign(
    ride_length=lambda rides: rides['ended_at'] - rides['started_at'],
    start_weekday=lambda rides: rides['started_at'].dt.day_name(),
)

To remove outliers, I apply the empirical rule and drop rows whose ride length lies more than 3 standard deviations away from the mean. 

In [5]:
# Mean and spread of the ride durations (both are Timedelta values).
ride_lengths = df['ride_length']
mean_ride_len = ride_lengths.mean()
standard_deviation = ride_lengths.std()

# Three-standard-deviation band around the mean.
upper_bound = mean_ride_len + standard_deviation * 3
lower_bound = mean_ride_len - standard_deviation * 3

# One value per line: the standard deviation, then the upper and lower bounds.
print(standard_deviation, upper_bound, lower_bound, sep='\n')

0 days 00:36:59.850994962
0 days 02:07:04.000579893
-1 days +22:25:04.894610121


Since ride lengths can only be positive and the lower bound is negative, only the upper bound is used to remove outliers.

In [6]:
# Flag the rides that last longer than the upper bound ...
exceeds_upper_bound = df['ride_length'] > upper_bound

# ... and drop them by index label; the remaining rows keep their labels.
df = df.drop(index=df.index[exceeds_upper_bound.to_numpy()])

# Quick look at the cleaned data and its final size.
print(df.head())
print(df.shape)

            ride_id rideable_type          started_at            ended_at  \
0  954144C2F67B1932  classic_bike 2022-07-05 08:12:47 2022-07-05 08:24:32   
1  292E027607D218B6  classic_bike 2022-07-26 12:53:38 2022-07-26 12:55:31   
2  57765852588AD6E0  classic_bike 2022-07-03 13:58:49 2022-07-03 14:06:32   
3  B5B6BE44314590E6  classic_bike 2022-07-31 17:44:21 2022-07-31 18:42:50   
4  A4C331F2A00E79E0  classic_bike 2022-07-13 19:49:06 2022-07-13 20:15:24   

           start_station_name start_station_id  \
0  Ashland Ave & Blackhawk St            13224   
1  Buckingham Fountain (Temp)            15541   
2  Buckingham Fountain (Temp)            15541   
3  Buckingham Fountain (Temp)            15541   
4      Wabash Ave & Grand Ave     TA1307000117   

                 end_station_name end_station_id  start_lat  start_lng  \
0        Kingsbury St & Kinzie St   KA1503000043  41.907066 -87.667252   
1           Michigan Ave & 8th St            623  41.869621 -87.623981   
2           Mi

#### Analyzing data
First, I split `df` into two sub-dataframes, one for member and one for casual customers, to get statistics such as the total number of rides, the total ride length, and the average and standard deviation of the ride time in each dataset. 

In [7]:
# Split the rides by customer type.
customer_type = df['member_casual']
member_df = df[customer_type == 'member']
casual_df = df[customer_type == 'casual']

# Ride-length summary for members, casual riders, and all rides together.
for rides in (member_df, casual_df, df):
    print(rides['ride_length'].describe())

# Total time ridden by each customer type, in seconds.
for label, rides in (('member:', member_df), ('casual:', casual_df)):
    print(label, rides['ride_length'].dt.total_seconds().sum())

count                      2726023
mean     0 days 00:11:40.134662473
std      0 days 00:10:14.781596613
min                0 days 00:00:01
25%                0 days 00:05:01
50%                0 days 00:08:42
75%                0 days 00:14:53
max                0 days 02:07:01
Name: ride_length, dtype: object
count                      1650222
mean     0 days 00:19:14.679873374
std      0 days 00:19:31.489456068
min                0 days 00:00:01
25%                0 days 00:07:13
50%                0 days 00:12:42
75%                0 days 00:23:20
max                0 days 02:07:04
Name: ride_length, dtype: object
count                      4376245
mean     0 days 00:14:31.537430605
std      0 days 00:14:55.250770576
min                0 days 00:00:01
25%                0 days 00:05:44
50%                0 days 00:10:00
75%                0 days 00:17:40
max                0 days 02:07:04
Name: ride_length, dtype: object
member: 1908583193.0
casual: 1905478130.0


Then I create a dataframe grouped by month, bike type, and customer type, and calculate the ride count and the average ride time of each group.<br>
I then save it to an Excel sheet. 

In [22]:
# Calendar month (1-12) in which each ride started.
df['month'] = df['started_at'].dt.month

# Build the grouping once and derive both statistics from it, so the ride
# count and the average ride length are guaranteed to share the same keys.
month_keys = ['month', 'rideable_type', 'member_casual']
rides_by_month = df.groupby(month_keys)['ride_length']
month_count = rides_by_month.size().reset_index(name='count')
avg_ride_len = (
    rides_by_month.mean()
    .dt.total_seconds()  # Timedelta -> plain seconds for the spreadsheet
    .reset_index(name='average ride length(s)')
)

month_df = pd.merge(month_count, avg_ride_len, on=month_keys)
print(month_df.head(5))
print(month_df.shape)

# Append mode fails with FileNotFoundError when the workbook does not exist
# yet, and `if_sheet_exists` is only accepted in append mode. Create the file
# on the first run and replace just this sheet on later runs.
# openpyxl is named explicitly because it is the engine that supports
# appending to an existing workbook.
workbook_path = Path('cyclistics_data.xlsx')
if workbook_path.exists():
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}
with pd.ExcelWriter(workbook_path, engine='openpyxl', **writer_options) as writer:
    month_df.to_excel(writer, sheet_name='bike_type_month', index=False)

   month  rideable_type member_casual  count  average ride length(s)
0      1   classic_bike        casual  13684              789.100628
1      1   classic_bike        member  76267              602.018920
2      1    docked_bike        casual   1611             1345.608318
3      1  electric_bike        casual  14068              579.808573
4      1  electric_bike        member  42230              507.729600
(60, 5)


Then I create a dataframe grouped by hour, weekday, and customer type, and calculate the ride count and the average ride time of each group.<br>
I then save the data to an Excel sheet.

In [21]:
# Hour of day (0-23) in which each ride started.
df['hour'] = df['started_at'].dt.hour

# Build the grouping once and derive both statistics from it, so the ride
# count and the average ride length are guaranteed to share the same keys.
hour_keys = ['hour', 'start_weekday', 'member_casual']
rides_by_hour = df.groupby(hour_keys)['ride_length']
customer_hour = rides_by_hour.size().reset_index(name='count')
customer_hour['hour'] = customer_hour['hour'].astype(int)
avg_ride_len = (
    rides_by_hour.mean()
    .dt.total_seconds()  # Timedelta -> plain seconds for the spreadsheet
    .reset_index(name='average ride length(s)')
)

customer_hour = pd.merge(customer_hour, avg_ride_len, on=hour_keys)

print(customer_hour.head(5))
print(customer_hour.shape)

# Append mode fails with FileNotFoundError when the workbook does not exist
# yet, and `if_sheet_exists` is only accepted in append mode. Create the file
# on the first run and replace just this sheet on later runs.
# openpyxl is named explicitly because it is the engine that supports
# appending to an existing workbook.
workbook_path = Path('cyclistics_data.xlsx')
if workbook_path.exists():
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}
with pd.ExcelWriter(workbook_path, engine='openpyxl', **writer_options) as writer:
    customer_hour.to_excel(writer, sheet_name='hour_weekday', index=False)

   hour start_weekday member_casual  count  average ride length(s)
0     0        Friday        casual   3578              862.206260
1     0        Friday        member   3339              629.187781
2     0        Monday        casual   2369             1038.304348
3     0        Monday        member   2004              679.911178
4     0      Saturday        casual   8156              920.457700
(336, 5)


Then I find the top 10 stations with the most rides started by casual customers. These stations may be good places for membership advertising, so that casual customers can see the membership upgrade plans and membership benefits.  

In [10]:
# Number of top start stations to report.
top_station_count = 10

# Count casual rides per start station and keep the busiest ones.
# The station column is named explicitly via rename_axis: the label that
# value_counts()/reset_index() produce on their own differs between pandas
# versions, which made the old rename of an 'index' column a silent no-op
# and left the exported header dependent on the installed pandas.
casual_station_count = (
    casual_df['start_station_name']
    .value_counts()
    .nlargest(top_station_count)
    .rename_axis('start_station_name')
    .reset_index(name='count')
)

print(casual_station_count)

# Append mode fails with FileNotFoundError when the workbook does not exist
# yet, and `if_sheet_exists` is only accepted in append mode. Create the file
# on the first run and replace just this sheet on later runs.
# openpyxl is named explicitly because it is the engine that supports
# appending to an existing workbook.
workbook_path = Path('cyclistics_data.xlsx')
if workbook_path.exists():
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}
with pd.ExcelWriter(workbook_path, engine='openpyxl', **writer_options) as writer:
    casual_station_count.to_excel(writer, sheet_name='top_casual_start_station', index=False)

                   start_station_name  count
0             Streeter Dr & Grand Ave  48615
1   DuSable Lake Shore Dr & Monroe St  28496
2               Michigan Ave & Oak St  21820
3                     Millennium Park  21139
4  DuSable Lake Shore Dr & North Blvd  19964
5                      Shedd Aquarium  18247
6                 Theater on the Lake  15830
7               Wells St & Concord Ln  13405
8                      Dusable Harbor  12904
9          Indiana Ave & Roosevelt Rd  11692
