In [1]:
import pandas as pd
import glob

# Assemble Full Year of Data

In [2]:
# Collect the CSV files that make up the full year.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to make the concatenated row order reproducible across runs/machines.
path = '../data/full-year'
all_files = sorted(glob.glob(path + "/*.csv"))
# Holds one DataFrame per CSV file once the files have been loaded.
all_dataframes = []

In [3]:
# Load every CSV in `all_files` into its own DataFrame.
# The list is rebuilt in a single expression (rather than appended to a list that was
# created in another cell) so this cell is idempotent: re-running it no longer adds a
# second copy of every file and duplicates rows in the concatenated output.
all_dataframes = [
    pd.read_csv(filename, index_col=None, header=0)
    for filename in all_files
]

In [4]:
# Stack the per-file frames row-wise into one table with a fresh 0..n-1 index.
concatenated_df = pd.concat(
    objs=all_dataframes,
    axis=0,
    ignore_index=True,
)

In [5]:
# Persist the combined table; the row index is left out of the file.
concatenated_df.to_csv(path_or_buf='../data/full-year.csv', index=False)

# Read Full Year

In [6]:
# Load the combined full-year file.
# The station id columns mix purely numeric ids (e.g. 13379) with alphanumeric ones
# (e.g. TA1309000030). Reading them explicitly as strings keeps every id the same
# type instead of relying on dtype inference over a multi-million-row file, which
# can leave a column holding a mix of ints and strs.
df = pd.read_csv(
    "../data/full-year.csv",
    dtype={'start_station_id': str, 'end_station_id': str},
)

In [7]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CBCD0D7777F0E45F,classic_bike,2023-02-14 11:59:42,2023-02-14 12:13:38,Southport Ave & Clybourn Ave,TA1309000030,Clark St & Schiller St,TA1309000024,41.920771,-87.663712,41.907993,-87.631501,casual
1,F3EC5FCE5FF39DE9,electric_bike,2023-02-15 13:53:48,2023-02-15 13:59:08,Clarendon Ave & Gordon Ter,13379,Sheridan Rd & Lawrence Ave,TA1309000041,41.957879,-87.649584,41.969517,-87.654691,casual
2,E54C1F27FA9354FF,classic_bike,2023-02-19 11:10:57,2023-02-19 11:35:01,Southport Ave & Clybourn Ave,TA1309000030,Aberdeen St & Monroe St,13156,41.920771,-87.663712,41.880419,-87.655519,member
3,3D561E04F739CC45,electric_bike,2023-02-26 16:12:05,2023-02-26 16:39:55,Southport Ave & Clybourn Ave,TA1309000030,Franklin St & Adams St (Temp),TA1309000008,41.920873,-87.663733,41.879434,-87.635504,member
4,0CB4B4D53B2DBE05,electric_bike,2023-02-20 11:55:23,2023-02-20 12:05:48,Prairie Ave & Garfield Blvd,TA1307000160,Cottage Grove Ave & 63rd St,KA1503000054,41.794827,-87.618795,41.780531,-87.60597,member


In [8]:
# Total number of rows (one per ride) in the full-year table.
df.shape[0]

5674449

### Prep Data

In [9]:
# Parse the ride start/end timestamp strings into datetime values.
for timestamp_col in ('started_at', 'ended_at'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [10]:
# Derive string-formatted pieces of the ride start timestamp:
#   start_date -> 'YYYY-MM-DD', start_time -> 'HH:MM', year_month -> 'YYYY-MM'
derived_formats = {
    'start_date': '%Y-%m-%d',
    'start_time': '%H:%M',
    'year_month': '%Y-%m',
}
for new_column, fmt in derived_formats.items():
    df[new_column] = df['started_at'].dt.strftime(fmt)

In [11]:
# Check the new start_date / start_time / year_month columns on the first five rows.
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,year_month
0,CBCD0D7777F0E45F,classic_bike,2023-02-14 11:59:42,2023-02-14 12:13:38,Southport Ave & Clybourn Ave,TA1309000030,Clark St & Schiller St,TA1309000024,41.920771,-87.663712,41.907993,-87.631501,casual,2023-02-14,11:59,2023-02
1,F3EC5FCE5FF39DE9,electric_bike,2023-02-15 13:53:48,2023-02-15 13:59:08,Clarendon Ave & Gordon Ter,13379,Sheridan Rd & Lawrence Ave,TA1309000041,41.957879,-87.649584,41.969517,-87.654691,casual,2023-02-15,13:53,2023-02
2,E54C1F27FA9354FF,classic_bike,2023-02-19 11:10:57,2023-02-19 11:35:01,Southport Ave & Clybourn Ave,TA1309000030,Aberdeen St & Monroe St,13156,41.920771,-87.663712,41.880419,-87.655519,member,2023-02-19,11:10,2023-02
3,3D561E04F739CC45,electric_bike,2023-02-26 16:12:05,2023-02-26 16:39:55,Southport Ave & Clybourn Ave,TA1309000030,Franklin St & Adams St (Temp),TA1309000008,41.920873,-87.663733,41.879434,-87.635504,member,2023-02-26,16:12,2023-02
4,0CB4B4D53B2DBE05,electric_bike,2023-02-20 11:55:23,2023-02-20 12:05:48,Prairie Ave & Garfield Blvd,TA1307000160,Cottage Grove Ave & 63rd St,KA1503000054,41.794827,-87.618795,41.780531,-87.60597,member,2023-02-20,11:55,2023-02


In [12]:
# Flag rides that started in January 2024. start_date is an ISO 'YYYY-MM-DD' string,
# so the (inclusive) string range comparison orders the same way as the dates.
df['is_january'] = df['start_date'].between('2024-01-01', '2024-01-31')

In [13]:
# Flag rides that started on 14-16 January 2024 (both end dates inclusive).
# start_date is an ISO 'YYYY-MM-DD' string, so string comparison follows date order.
df['is_jan14_16'] = df['start_date'].between('2024-01-14', '2024-01-16')

In [14]:
# Check the two boolean January flags on the first five rows.
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,year_month,is_january,is_jan14_16
0,CBCD0D7777F0E45F,classic_bike,2023-02-14 11:59:42,2023-02-14 12:13:38,Southport Ave & Clybourn Ave,TA1309000030,Clark St & Schiller St,TA1309000024,41.920771,-87.663712,41.907993,-87.631501,casual,2023-02-14,11:59,2023-02,False,False
1,F3EC5FCE5FF39DE9,electric_bike,2023-02-15 13:53:48,2023-02-15 13:59:08,Clarendon Ave & Gordon Ter,13379,Sheridan Rd & Lawrence Ave,TA1309000041,41.957879,-87.649584,41.969517,-87.654691,casual,2023-02-15,13:53,2023-02,False,False
2,E54C1F27FA9354FF,classic_bike,2023-02-19 11:10:57,2023-02-19 11:35:01,Southport Ave & Clybourn Ave,TA1309000030,Aberdeen St & Monroe St,13156,41.920771,-87.663712,41.880419,-87.655519,member,2023-02-19,11:10,2023-02,False,False
3,3D561E04F739CC45,electric_bike,2023-02-26 16:12:05,2023-02-26 16:39:55,Southport Ave & Clybourn Ave,TA1309000030,Franklin St & Adams St (Temp),TA1309000008,41.920873,-87.663733,41.879434,-87.635504,member,2023-02-26,16:12,2023-02,False,False
4,0CB4B4D53B2DBE05,electric_bike,2023-02-20 11:55:23,2023-02-20 12:05:48,Prairie Ave & Garfield Blvd,TA1307000160,Cottage Grove Ave & 63rd St,KA1503000054,41.794827,-87.618795,41.780531,-87.60597,member,2023-02-20,11:55,2023-02,False,False


# Export Dataset for Community Area Spatial Join
I'm exporting the entire dataset (5.7 million rides) because every ride here has lat/lng coordinates,
so it can be geocoded in QGIS and rolled up to community areas.

In [None]:
### export dataset: the cells below write the prepared tables to CSV under ../data/

In [18]:
# Write the full prepared rides table for the community-area spatial join.
# index=False matches the other exports in this notebook and keeps the meaningless
# 0..n-1 row index from being written as an extra unnamed first column.
df.to_csv("../data/divvy-rides-2023-02-to-2024-01.csv", index=False)

In [21]:
# Write `df_stations` (presumably the stations table) to CSV without the row index.
# NOTE(review): `df_stations` is not created in any cell visible in this part of the
# notebook, and the execution counts jump from 14 to 18 to 21 around here. Confirm
# the cell that builds it still exists; otherwise Restart & Run All will fail here
# with a NameError.
df_stations.to_csv("../data/divvy-stations-2023-02-to-2024-01.csv", index=False)

# Review totals

### monthly

In [27]:
# Count rides per year_month (rows with a non-null ride_id), returned as a flat
# two-column frame: year_month, rides_all.
df_monthly = (
    df.groupby('year_month')['ride_id']
    .count()
    .rename('rides_all')
    .reset_index()
)

In [28]:
# Display the per-month ride counts.
df_monthly

Unnamed: 0,year_month,rides_all
0,2023-02,190445
1,2023-03,258678
2,2023-04,426590
3,2023-05,604827
4,2023-06,719618
5,2023-07,767650
6,2023-08,771693
7,2023-09,666371
8,2023-10,537113
9,2023-11,362518


In [30]:
# Sanity check: the monthly counts should add up to the total number of rows in df.
df_monthly.rides_all.sum()

5674449