In [1]:
# Dependencies
import requests
import os
import json
import zipfile

import pandas as pd
import numpy as np

# Extract

## Station Information

In [2]:
# # Citi Bike publishes real-time system data in GBFS format.
# gbfs_response = requests.get('http://gbfs.citibikenyc.com/gbfs/gbfs.json').json()
# station_information_json = requests.get(gbfs_response['data']['en']['feeds'][1]['url']).json()

# # Export the station information as a JSON file to avoid requesting the real-time data every time this notebook is run.
# # Station info reflects the latest info upon calling the station API.  
# with open(os.path.join('data', 'station_info.json'), 'w') as f:
#     json.dump(station_information_json['data']['stations'], f)

## Trip Data

In [3]:
# # Much like extraction of the station information, the following code needs to be executed only once to extract the trip data
# # of interest. The code has been written with downloading data for multiple months and combining them into one CSV file in mind.
# base_url = "https://s3.amazonaws.com/tripdata/"
# df_li = []

# # Limit the scope of the study to June 2022.
# year = '2022'
# # Loop through the period of interest by month. For a whole year of data, specify np.arange(1,13).
# for i in np.arange(6,7):
#     if i not in [6, 7]:
#         csv_name = f'{year}{i:02d}-citibike-tripdata.csv'
#     else:
#         # The file names for June and July of 2022 are different from those for the other months due to typo.
#         csv_name = f'{year}{i:02d}-citbike-tripdata.csv'
    
#     # The Citi Bike system data are stored as zip files.
#     zip_name = csv_name + '.zip'
#     zip_url = base_url + zip_name
    
#     # Request the zip file and extract its content.
#     zip_response = requests.get(zip_url)
#     with open(zip_name, 'wb') as f:
#         f.write(zip_response.content)
#     with zipfile.ZipFile(zip_name, 'r') as zip:
#         zip.extractall(year)
    
#     # Import the extracted CSV file into a DataFrame.
#     df = pd.read_csv(os.path.join(year, csv_name), dtype={'start_station_id': str, 'end_station_id': str})
#     # 'ride_id' can be dropped immediately: it only identifies trips, which the DataFrame index already does.
#     df.drop(columns=['ride_id'], inplace=True)
    
#     df_li.append(df)
#     os.remove(zip_name)
#     os.remove(os.path.join(year, csv_name))
    
# # Concatenate all monthly data.
# df = pd.concat(df_li, axis=0, ignore_index=True)

# # Edited: import the additional JC (Jersey City) data for June 2022.
# # Upon close investigation of 202206-citibike-tripdata.csv, it is noted that a few trips that ended in JC are included in the
# # dataset. Because Citi Bike extends to Jersey City and Hoboken in New Jersey, the JC data, which share the same base URL with 
# # the NYC data, are included for the time period investigated.
# csv_name = f'JC-{year}06-citibike-tripdata.csv'
# zip_name = csv_name + '.zip'
# zip_response = requests.get(base_url + zip_name)

# with open(zip_name, 'wb') as f:
#     f.write(zip_response.content)
# with zipfile.ZipFile(zip_name, 'r') as zip:
#     zip.extractall(year)
    
# df_jc = pd.read_csv(os.path.join(year, csv_name), dtype={'start_station_id': str, 'end_station_id': str})
# df_jc.drop(columns=['ride_id'], inplace=True)

# os.remove(zip_name)
# os.remove(os.path.join(year, csv_name))

# df = pd.concat([df, df_jc], axis=0, ignore_index=True)
# df.to_csv(os.path.join(year, f'{year}06-citibike-tripdata.csv'), index=False)

# Transform

In [4]:
# Load the combined June 2022 ridership file. Station ids are kept as strings: they are compared as text
# (e.g. '5382.07') in the cleaning steps that follow.
trip_csv_path = os.path.join('2022', '202206-citibike-tripdata.csv')
station_id_dtypes = {'start_station_id': str, 'end_station_id': str}
df = pd.read_csv(trip_csv_path, dtype=station_id_dtypes)

## Trip Duration Outliers

In [5]:
# Build the working frame from the raw import, parsing both timestamps so the trip length can be derived.
df_1 = df.assign(
    started_at=pd.to_datetime(df['started_at']),
    ended_at=pd.to_datetime(df['ended_at']),
)
df_1['trip_duration'] = (df_1['ended_at'] - df_1['started_at']).dt.total_seconds()

# Trips shorter than 60 seconds are removed as per https://citibikenyc.com/system-data (the published data has not
# been filtered yet). Trips longer than one day are treated as outliers. Both bounds are inclusive.
min_seconds = 60
max_seconds = 60*60*24
within_bounds = df_1['trip_duration'].ge(min_seconds) & df_1['trip_duration'].le(max_seconds)

# 'trip_duration' was only needed for the filter (it is recomputed in Tableau) and the bike type is out of scope,
# so both are dropped. 'member_casual' is renamed to 'user_type'.
df_1 = (
    df_1[within_bounds]
    .drop(columns=['rideable_type', 'trip_duration'])
    .rename(columns={'member_casual': 'user_type'})
)

df_1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3441935 entries, 0 to 3536181
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   started_at          3441935 non-null  datetime64[ns]
 1   ended_at            3441935 non-null  datetime64[ns]
 2   start_station_name  3441935 non-null  object        
 3   start_station_id    3441935 non-null  object        
 4   end_station_name    3439610 non-null  object        
 5   end_station_id      3439610 non-null  object        
 6   start_lat           3441935 non-null  float64       
 7   start_lng           3441935 non-null  float64       
 8   end_lat             3441654 non-null  float64       
 9   end_lng             3441654 non-null  float64       
 10  user_type           3441935 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(5)
memory usage: 315.1+ MB


## Station Names and Ids
To ensure accurate groupby operations downstream, each station name should be related to its id one-to-one. However, for both start and end stations, the number of unique names is greater than the number of unique ids, indicating multiple station names referring to the same id and possibly vice versa.

### Start Stations

In [6]:
# Count the distinct start station names and ids before cleaning. Differing counts show that names and ids are
# not yet mapped one-to-one.
print(f"Number of unique start station names: {df_1['start_station_name'].nunique()}")
print(f"Number of unique start station ids: {df_1['start_station_id'].nunique()}")

Number of unique start station names: 1646
Number of unique start station ids: 1638


In [7]:
# Find the start station ids that carry more than one station name.
# Every distinct (name, id) pair is kept once, at the row where it first appears. If names and ids were one-to-one,
# those rows would be exactly the rows where each id first appears. Any pair left over after removing the
# first-occurrence rows of the ids is an additional name for an id that was already seen.
start_pairs = df_1[['start_station_name', 'start_station_id']].drop_duplicates()
first_rows_per_id = df_1['start_station_id'].drop_duplicates()
diff_id = start_pairs.index.difference(first_rows_per_id.index)
diff = df_1.loc[diff_id, 'start_station_id'].sort_values()

# Print each affected id with every name recorded for it.
# repr() keeps escape sequences such as a tab visible instead of rendering them.
for station_id in diff:
    names_for_id = df_1.loc[df_1['start_station_id'] == station_id, 'start_station_name'].unique()
    print(station_id + ': ' + ', '.join(map(repr, names_for_id)))

4488.09: 'Boerum Pl\\t& Pacific St', 'Boerum Pl\t& Pacific St'
4781.05: 'Nassau St\\t& Duffield St', 'Nassau St\t& Duffield St'
5323.06: 'Sharon St & Olive St', 'Sharon St & Olive St_new'
5329.08: 'Murray St\\t& West St', 'Murray St\t& West St'
5382.07: 'Forsyth St\t& Grand St', 'Forsyth St\\t& Grand St'
5883.06: 'Van Dam St & Greenpoint Ave', 'Van Dam St & Review Ave'
6300.04: 'Skillman Ave & 43 Ave', 'Skillman Ave & 32 Pl'
6535.04: 'W 34 St &\\tHudson Blvd E', 'W 34 St &\tHudson Blvd E'
6560.14: 'W 40 St & 7 Ave', 'W 40 St & 8 Ave'
6708.04: 'Broadway\\t& W 48 St', 'Broadway\t& W 48 St'


In [8]:
# Six of the conflicts differ only in how the tab is written: a literal backslash followed by 't' versus a real
# tab character. First turn the literal sequence into a tab, then turn every tab into a single space, so both
# spellings collapse to the same name. The remaining conflicts are resolved with the station information.
start_names = df_1['start_station_name']
start_names = start_names.str.replace(r'\\t', r'\t', regex=True)
df_1['start_station_name'] = start_names.str.replace(r'\t', ' ', regex=True)

In [9]:
# Load the saved station information and keep only the fields used as the reference for names, ids and
# coordinates. 'short_name' holds the station id used in the trip data.
station_info = (
    pd.read_json(os.path.join('data', 'station_info.json'))
    .loc[:, ['short_name', 'name', 'lat', 'lon']]
    .rename(columns={'short_name': 'station_id'})
)

# Ids whose name conflicts are not tab-related. The list can be generated by the same code used to determine the
# start station ids that multiple station names refer to.
for i in ['5323.06', '5883.06', '6300.04', '6560.14']:
    name_matches = station_info.loc[station_info['station_id'] == i, 'name']
    # Require exactly one reference name. Joining the matches blindly would write '' over every trip of this
    # station when the id is absent from station_info, and a run-together string when it appears more than once.
    if len(name_matches) != 1:
        raise ValueError(f'Expected exactly one station name for id {i!r} in station_info, found {len(name_matches)}.')
    correct_name = name_matches.iloc[0]
    print(i + ': ' + correct_name)
    df_1.loc[df_1['start_station_id'] == i, 'start_station_name'] = correct_name

5323.06: Sharon St & Olive St
5883.06: Van Dam St & Greenpoint Ave
6300.04: Skillman Ave & 43 Ave
6560.14: W 40 St & 7 Ave


In [10]:
# Find the start station names that more than one station id refers to: take the distinct (name, id) pairs and
# remove the rows where each name first appears. Whatever remains is an extra id for a name already seen.
start_pairs = df_1[['start_station_name', 'start_station_id']].drop_duplicates()
first_rows_per_name = df_1['start_station_name'].drop_duplicates()
diff_id = start_pairs.index.difference(first_rows_per_name.index)
diff = df_1.loc[diff_id, 'start_station_name'].sort_values()

# Print each affected name with every id recorded for it.
for name in diff:
    ids_for_name = df_1.loc[df_1['start_station_name'] == name, 'start_station_id'].unique()
    print(name + ': ' + ', '.join(ids_for_name))

30 Ave & 12 St: 7034.08, 7034.09
Sharon St & Olive St: 5323.05, 5323.06


In [11]:
# Once again, use the station information to resolve the conflicting station ids.
for name in diff:
    id_matches = station_info.loc[station_info['name'] == name, 'station_id']
    # Require exactly one reference id. Joining the matches blindly would write '' when the name is absent from
    # station_info, and a run-together string when it appears more than once.
    if len(id_matches) != 1:
        raise ValueError(f'Expected exactly one station id for {name!r} in station_info, found {len(id_matches)}.')
    correct_id = id_matches.iloc[0]
    print(name + ': ' + correct_id)
    df_1.loc[df_1['start_station_name'] == name, 'start_station_id'] = correct_id

30 Ave & 12 St: 7034.09
Sharon St & Olive St: 5323.06


In [12]:
# Re-count after cleaning. Equal counts are consistent with a one-to-one mapping but do not prove it on their own.
print(f"Number of unique start station names: {df_1['start_station_name'].nunique()}")
print(f"Number of unique start station ids: {df_1['start_station_id'].nunique()}")

Number of unique start station names: 1636
Number of unique start station ids: 1636


### End Stations

In [13]:
# Count the distinct end station names and ids before cleaning. Differing counts show that names and ids are not
# yet mapped one-to-one.
print(f"Number of unique end station names: {df_1['end_station_name'].nunique()}")
print(f"Number of unique end station ids: {df_1['end_station_id'].nunique()}")

Number of unique end station names: 1650
Number of unique end station ids: 1642


In [14]:
# Find the end station ids that carry more than one station name: take the distinct (name, id) pairs and remove
# the rows where each id first appears. Whatever remains is an extra name for an id already seen.
end_pairs = df_1[['end_station_name', 'end_station_id']].drop_duplicates()
first_rows_per_id = df_1['end_station_id'].drop_duplicates()
diff_id = end_pairs.index.difference(first_rows_per_id.index)
diff = df_1.loc[diff_id, 'end_station_id'].sort_values()

# Print each affected id with every name recorded for it; repr() keeps escape sequences visible.
for station_id in diff:
    names_for_id = df_1.loc[df_1['end_station_id'] == station_id, 'end_station_name'].unique()
    print(station_id + ': ' + ', '.join(map(repr, names_for_id)))

4488.09: 'Boerum Pl\\t& Pacific St', 'Boerum Pl\t& Pacific St'
4781.05: 'Nassau St\\t& Duffield St', 'Nassau St\t& Duffield St'
5323.06: 'Sharon St & Olive St', 'Sharon St & Olive St_new'
5329.08: 'Murray St\\t& West St', 'Murray St\t& West St'
5382.07: 'Forsyth St\\t& Grand St', 'Forsyth St\t& Grand St'
5883.06: 'Van Dam St & Review Ave', 'Van Dam St & Greenpoint Ave'
6300.04: 'Skillman Ave & 43 Ave', 'Skillman Ave & 32 Pl'
6535.04: 'W 34 St &\\tHudson Blvd E', 'W 34 St &\tHudson Blvd E'
6560.14: 'W 40 St & 7 Ave', 'W 40 St & 8 Ave'
6708.04: 'Broadway\\t& W 48 St', 'Broadway\t& W 48 St'


In [15]:
# The affected end station ids are the same as the start station ids identified above, so the same clean-up
# applies: turn a literal backslash-t into a tab, then turn every tab into a single space.
end_names = df_1['end_station_name'].str.replace(r'\\t', r'\t', regex=True)
df_1['end_station_name'] = end_names.str.replace(r'\t', ' ', regex=True)

# Resolve the remaining conflicts with the reference name from station_info.
for i in ['5323.06', '5883.06', '6300.04', '6560.14']:
    name_matches = station_info.loc[station_info['station_id'] == i, 'name']
    # Require exactly one reference name. Joining the matches blindly would write '' over every trip ending at this
    # station when the id is absent from station_info, and a run-together string when it appears more than once.
    if len(name_matches) != 1:
        raise ValueError(f'Expected exactly one station name for id {i!r} in station_info, found {len(name_matches)}.')
    correct_name = name_matches.iloc[0]
    df_1.loc[df_1['end_station_id'] == i, 'end_station_name'] = correct_name

In [16]:
# Find the end station names that more than one station id refers to: take the distinct (name, id) pairs and
# remove the rows where each name first appears. Whatever remains is an extra id for a name already seen.
end_pairs = df_1[['end_station_name', 'end_station_id']].drop_duplicates()
first_rows_per_name = df_1['end_station_name'].drop_duplicates()
diff_id = end_pairs.index.difference(first_rows_per_name.index)
diff = df_1.loc[diff_id, 'end_station_name'].sort_values()

# Print each affected name with every id recorded for it.
for name in diff:
    ids_for_name = df_1.loc[df_1['end_station_name'] == name, 'end_station_id'].unique()
    print(name + ': ' + ', '.join(ids_for_name))

30 Ave & 12 St: 7034.09, 7034.08
Sharon St & Olive St: 5323.06, 5323.05


In [17]:
# The affected end station names are the same as the start station names identified above, so the conflicting ids
# are resolved with the same approach.
for name in diff:
    id_matches = station_info.loc[station_info['name'] == name, 'station_id']
    # Require exactly one reference id. Joining the matches blindly would write '' when the name is absent from
    # station_info, and a run-together string when it appears more than once.
    if len(id_matches) != 1:
        raise ValueError(f'Expected exactly one station id for {name!r} in station_info, found {len(id_matches)}.')
    correct_id = id_matches.iloc[0]
    df_1.loc[df_1['end_station_name'] == name, 'end_station_id'] = correct_id

In [18]:
# Re-count after cleaning. Equal counts are consistent with a one-to-one mapping but do not prove it on their own.
print(f"Number of unique end station names: {df_1['end_station_name'].nunique()}")
print(f"Number of unique end station ids: {df_1['end_station_id'].nunique()}")

Number of unique end station names: 1640
Number of unique end station ids: 1640


## Station Coordinates

In [19]:
# Continue on a separate copy so the name/id-cleaned frame df_1 stays available.
df_2 = df_1.copy()

# Show every distinct start coordinate pair recorded for trips that began at station 5382.07.
from_5382_07 = df_2['start_station_id'] == '5382.07'
df_2.loc[from_5382_07, ['start_lat', 'start_lng']].drop_duplicates()

Unnamed: 0,start_lat,start_lng
320718,40.717798,-73.993161
337173,40.717798,-73.993161
369955,40.717444,-73.993426
386524,40.717684,-73.993301
408448,40.717780,-73.993254
...,...,...
3266132,40.717781,-73.993242
3286564,40.717516,-73.993466
3395379,40.717567,-73.993388
3428005,40.717710,-73.993271


Multiple pairs of latitude and longitude are tied to the same station name, but they are in close proximity, indicating that they are the start coordinates of trips made from the same station. In order to map the incoming and outgoing traffic from bike stations, it is easier to work with a single set of coordinates for each station. To that end, merge `station_info` into `df_2` (the working copy of `df_1`). Coordinates for stations that have no match in `station_info` can be imputed by averaging the trip coordinates associated with each station.

As proof of concept, compare the latitude and longitude of `5382.07` from the station information with the averaged trip coordinates tied to the station: they are similar down to the fourth decimal place, which implies accuracy of about 10 m in physical space.

In [20]:
# Reference coordinates of station 5382.07 according to the station information.
station_info.loc[station_info['station_id'] == '5382.07', ['lat', 'lon']]

Unnamed: 0,lat,lon
1123,40.717798,-73.993161


In [21]:
# Average of the trip start coordinates recorded for station 5382.07, for comparison with the reference above.
df_2.loc[df_2['start_station_id'] == '5382.07', ['start_lat', 'start_lng']].mean()

start_lat    40.717800
start_lng   -73.993165
dtype: float64

In [22]:
# Build the lookup to merge onto the trips: one row per station with its id and reference coordinates, with the
# columns renamed for the start side of a trip.
start_stations = station_info[['station_id', 'lat', 'lon']].rename(
    columns={
        'station_id': 'start_station_id',
        'lat': 'start_station_lat',
        'lon': 'start_station_lng',
    }
)

In [23]:
# Left outer merge of start_stations into df_2: every trip row is kept, and trips whose start station id has no
# match in start_stations get null station coordinates.
df_2 = df_2.merge(start_stations, how='left', on='start_station_id')

In [24]:
# Inspect the result of the left merge; null start_station_lat/lng mark ids with no match in station_info.
df_2.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3441935 entries, 0 to 3441934
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   started_at          3441935 non-null  datetime64[ns]
 1   ended_at            3441935 non-null  datetime64[ns]
 2   start_station_name  3441935 non-null  object        
 3   start_station_id    3441935 non-null  object        
 4   end_station_name    3439610 non-null  object        
 5   end_station_id      3439610 non-null  object        
 6   start_lat           3441935 non-null  float64       
 7   start_lng           3441935 non-null  float64       
 8   end_lat             3441654 non-null  float64       
 9   end_lng             3441654 non-null  float64       
 10  user_type           3441935 non-null  object        
 11  start_station_lat   3354913 non-null  float64       
 12  start_station_lng   3354913 non-null  float64       
dtypes: datetime6

In [25]:
# Fill in missing start station coordinates with the average trip start coordinates of the same station.
# The filled column is assigned back to df_2. Calling fillna(..., inplace=True) on df_2['col'] is chained
# assignment: it is not guaranteed to write through to df_2 and has no effect under pandas Copy-on-Write.
for station_col, trip_col in [('start_station_lat', 'start_lat'), ('start_station_lng', 'start_lng')]:
    # The mean is taken only over the trips whose station coordinate is missing, grouped by station id.
    is_missing = df_2[station_col].isna()
    trip_mean = df_2[is_missing].groupby('start_station_id')[trip_col].transform('mean')
    df_2[station_col] = df_2[station_col].fillna(trip_mean)

In [26]:
# Repeat for end stations: build the lookup with end-side column names and left-merge it onto the trips.
end_stations = station_info[['station_id', 'lat', 'lon']].rename(
    columns={
        'station_id': 'end_station_id',
        'lat': 'end_station_lat',
        'lon': 'end_station_lng',
    }
)

df_2 = pd.merge(df_2, end_stations, on='end_station_id', how='left')

# Fill in missing end station coordinates with the average trip end coordinates of the same station.
# The filled column is assigned back to df_2. Calling fillna(..., inplace=True) on df_2['col'] is chained
# assignment: it is not guaranteed to write through to df_2 and has no effect under pandas Copy-on-Write.
for station_col, trip_col in [('end_station_lat', 'end_lat'), ('end_station_lng', 'end_lng')]:
    # The mean is taken only over the trips whose station coordinate is missing, grouped by station id.
    is_missing = df_2[station_col].isna()
    trip_mean = df_2[is_missing].groupby('end_station_id')[trip_col].transform('mean')
    df_2[station_col] = df_2[station_col].fillna(trip_mean)

In [27]:
# Inspect the frame after the end-station merge and the coordinate fills.
df_2.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3441935 entries, 0 to 3441934
Data columns (total 15 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   started_at          3441935 non-null  datetime64[ns]
 1   ended_at            3441935 non-null  datetime64[ns]
 2   start_station_name  3441935 non-null  object        
 3   start_station_id    3441935 non-null  object        
 4   end_station_name    3439610 non-null  object        
 5   end_station_id      3439610 non-null  object        
 6   start_lat           3441935 non-null  float64       
 7   start_lng           3441935 non-null  float64       
 8   end_lat             3441654 non-null  float64       
 9   end_lng             3441654 non-null  float64       
 10  user_type           3441935 non-null  object        
 11  start_station_lat   3441935 non-null  float64       
 12  start_station_lng   3441935 non-null  float64       
 13  end_station_

In [28]:
# Drop the trip coordinates now that a single coordinate pair per station has been determined, then drop rows with
# any missing value (rows without end station info, as the study focuses on trips that start and end at bike
# stations). Finally put the columns in their reporting order.
# The order is given by column name rather than by position, so it does not depend on the order in which the
# earlier merges appended their columns.
column_order = [
    'started_at', 'ended_at',
    'start_station_name', 'start_station_id', 'start_station_lat', 'start_station_lng',
    'end_station_name', 'end_station_id', 'end_station_lat', 'end_station_lng',
    'user_type',
]
df_2 = (
    df_2
    .drop(columns=['start_lat', 'start_lng', 'end_lat', 'end_lng'])
    .dropna()
    .loc[:, column_order]
)

In [29]:
# Create a DataFrame that summarizes the station info as a result of the data wrangling documented in Station
# Coordinates: one row per station id with its name and coordinates.
# Both trip ends contribute. A station that only ever appears as a trip start would otherwise be missing from this
# lookup. End-station rows come first, so a station seen on both sides keeps its end-side values.
end_columns = {'end_station_id': 'station_id',
               'end_station_name': 'station_name',
               'end_station_lat': 'station_lat',
               'end_station_lng': 'station_lng'}
start_columns = {'start_station_id': 'station_id',
                 'start_station_name': 'station_name',
                 'start_station_lat': 'station_lat',
                 'start_station_lng': 'station_lng'}
stations = (
    pd.concat([df_2[list(end_columns)].rename(columns=end_columns),
               df_2[list(start_columns)].rename(columns=start_columns)])
    .drop_duplicates(subset='station_id')
    .reset_index(drop=True)
)

In [30]:
# Confirm the final column order and that no null values remain after dropna().
df_2.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3439610 entries, 0 to 3441934
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   started_at          3439610 non-null  datetime64[ns]
 1   ended_at            3439610 non-null  datetime64[ns]
 2   start_station_name  3439610 non-null  object        
 3   start_station_id    3439610 non-null  object        
 4   start_station_lat   3439610 non-null  float64       
 5   start_station_lng   3439610 non-null  float64       
 6   end_station_name    3439610 non-null  object        
 7   end_station_id      3439610 non-null  object        
 8   end_station_lat     3439610 non-null  float64       
 9   end_station_lng     3439610 non-null  float64       
 10  user_type           3439610 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(5)
memory usage: 314.9+ MB


## Net Traffic
The incoming and outgoing traffic at bike stations is important for monitoring the balance of the system, so a separate DataFrame is created that summarizes the hourly traffic at each bike station for the time period of the study.

In [31]:
# # Check all start stations are in the list of end stations.
# start_stations = df_2['start_station_id'].unique().tolist()
# end_stations = df_2['end_station_id'].unique().tolist()
# all(station in end_stations for station in start_stations)

In [40]:
# Derive the calendar date and the hour of day from both timestamps. Station id, date and hour are the keys the
# traffic tables below are grouped by. assign() returns a new frame, so df_2 is left unchanged.
trip_start = df_2['started_at']
trip_end = df_2['ended_at']
df_3 = df_2.assign(
    started_at_date=trip_start.dt.date,
    started_at_hour=trip_start.dt.hour,
    ended_at_date=trip_end.dt.date,
    ended_at_hour=trip_end.dt.hour,
)

In [41]:
# Hourly outgoing traffic: the number of trips started at each station, per date and hour, over the period
# investigated (June 2022). The result is indexed by (station_id, date, hour) with one column, 'outgoing_count'.
outgoing = (
    df_3.groupby(['start_station_id', 'started_at_date', 'started_at_hour'])['started_at']
    .count()
    .reset_index(name='outgoing_count')
    .rename(columns={'start_station_id': 'station_id',
                     'started_at_date': 'date',
                     'started_at_hour': 'hour'})
    .set_index(['station_id', 'date', 'hour'])
)

In [42]:
# Hourly incoming traffic: the number of trips ended at each station, per date and hour, over the period
# investigated. The result is indexed by (station_id, date, hour) with one column, 'incoming_count'.
incoming = (
    df_3.groupby(['end_station_id', 'ended_at_date', 'ended_at_hour'])['ended_at']
    .count()
    .reset_index(name='incoming_count')
    .rename(columns={'end_station_id': 'station_id',
                     'ended_at_date': 'date',
                     'ended_at_hour': 'hour'})
    .set_index(['station_id', 'date', 'hour'])
)

In [43]:
# Hourly net traffic per station: incoming minus outgoing trips. A (station, date, hour) slot present on only one
# side counts as 0 on the other. Negative net traffic means more bikes left than arrived within the hour;
# depending on the station capacity, a strongly negative value can indicate a need for supply.
net = (
    incoming['incoming_count']
    .sub(outgoing['outgoing_count'], fill_value=0)
    .astype('int64')
    .rename('net_traffic')
)
net

station_id  date        hour
2733.03     2022-06-01  1       1
                        6      -1
                        7       1
                        9       0
                        11      0
                               ..
SYS038      2022-06-24  23     -1
            2022-06-29  6       1
                        15     -1
                        19     -1
            2022-06-30  21     -1
Name: net_traffic, Length: 797776, dtype: int64

In [44]:
# Hourly total traffic per station: incoming plus outgoing trips, a measure of a station's popularity.
# A (station, date, hour) slot present on only one side counts as 0 on the other.
total = (
    incoming['incoming_count']
    .add(outgoing['outgoing_count'], fill_value=0)
    .astype('int64')
    .rename('total_traffic')
)
total

station_id  date        hour
2733.03     2022-06-01  1       1
                        6       1
                        7       1
                        9       2
                        11      2
                               ..
SYS038      2022-06-24  23      1
            2022-06-29  6       1
                        15      1
                        19      1
            2022-06-30  21      1
Name: total_traffic, Length: 797776, dtype: int64

In [46]:
# Put net and total traffic side by side, turn the (station_id, date, hour) index into columns, and attach each
# station's name and coordinates. 'station_id' is the only column the two frames share, so it is the join key.
# This is an inner join.
hourly_traffic = pd.concat([net, total], axis=1).reset_index()
df_traffic = hourly_traffic.merge(stations, on='station_id')

In [47]:
# Preview the merged traffic table.
df_traffic.head()

Unnamed: 0,station_id,date,hour,net_traffic,total_traffic,station_name,station_lat,station_lng
0,2733.03,2022-06-01,1,1,1,67 St & Erik Pl,40.633385,-74.016562
1,2733.03,2022-06-01,6,-1,1,67 St & Erik Pl,40.633385,-74.016562
2,2733.03,2022-06-01,7,1,1,67 St & Erik Pl,40.633385,-74.016562
3,2733.03,2022-06-01,9,0,2,67 St & Erik Pl,40.633385,-74.016562
4,2733.03,2022-06-01,11,0,2,67 St & Erik Pl,40.633385,-74.016562


In [48]:
# Reorganize the columns for export: station attributes first, then the time keys, then the traffic measures.
# Columns are selected by name, so running this cell again leaves the order unchanged. A positional reorder would
# shuffle the columns again on every run.
export_columns = ['station_id', 'station_name', 'station_lat', 'station_lng',
                  'date', 'hour', 'net_traffic', 'total_traffic']
df_traffic = df_traffic.loc[:, export_columns]

# Load

In [38]:
# df_2[['started_at', 'ended_at', 'user_type']].to_csv(os.path.join('data', '202206-citibike-tripdata-cleaned.csv'), index=False)
# df_traffic.to_csv(os.path.join('data', '202206-citibike-citibike-station-traffic.csv'), index=False)