In [18]:
import boto3
import io
import numpy as np
import pandas as pd
import zipfile

Pull in the raw ride CSVs for each month from the S3 bucket, and concatenate all months into a single dataframe.

In [25]:
s3 = boto3.resource('s3')
bucket = s3.Bucket("capitalbikeshare-data")

# Only list objects whose key starts with '202'.
objs = bucket.objects.filter(Prefix='202')

# Earliest archive to load; compared against the first six characters of each
# key (presumably a YYYYMM stamp -- confirm against the bucket listing).
first_month = 202106

df_rides = []
for obj in objs:
    month_prefix = obj.key[:6]
    # Skip keys without a six-digit prefix instead of crashing in int().
    if not month_prefix.isdigit() or int(month_prefix) < first_month:
        continue

    obj_bytes = obj.get()['Body'].read()
    with zipfile.ZipFile(io.BytesIO(obj_bytes), "r") as temp_zip:
        members = temp_zip.namelist()
        # Prefer a real CSV member: archives may also contain directory
        # entries or '__MACOSX/' metadata files, and the first member is not
        # guaranteed to be the data file. Fall back to the first member so
        # archives without a '.csv' name behave as before.
        csv_members = [
            member for member in members
            if member.lower().endswith('.csv') and not member.startswith('__MACOSX')
        ]
        csv_filename = csv_members[0] if csv_members else members[0]
        with temp_zip.open(csv_filename) as csv_file:
            temp_df = pd.read_csv(csv_file, encoding='utf8')
    df_rides.append(temp_df)

In [31]:
# Stack the monthly frames into one. ignore_index=True gives every ride a
# unique row label; without it each monthly frame keeps its own 0..n-1 labels,
# so any later label-based operation (e.g. DataFrame.drop(index=...)) would
# hit rows from every month that share a label.
df_rides_all = pd.concat(df_rides, ignore_index=True)

Clean up the data, and engineer some boolean and datetime features.

In [32]:
# Parse the ride start/end timestamp columns into datetimes, one column at a time.
time_cols = ['started_at', 'ended_at']
for col in time_cols:
    df_rides_all[col] = pd.to_datetime(df_rides_all[col])

In [33]:
# Flag rides whose end location is missing: either coordinate is NaN or
# recorded as exactly 0.
end_location_missing = (
    df_rides_all['end_lat'].isna()
    | df_rides_all['end_lng'].isna()
    | (df_rides_all['end_lat'] == 0)
    | (df_rides_all['end_lng'] == 0)
)
missing_end_location = df_rides_all.loc[end_location_missing]

# Filter with the boolean mask rather than dropping by index label: if the
# index labels are not unique (e.g. the frame came from a concat without
# ignore_index), a label-based drop would also remove unrelated rides that
# happen to share a label. The ride ID column is dropped as well.
rides_clean = df_rides_all.loc[~end_location_missing].drop(columns='ride_id')

# Boolean features
rides_clean['is_ebike'] = rides_clean['rideable_type'] == 'electric_bike'
rides_clean['is_member'] = rides_clean['member_casual'] == 'member'
rides_clean['start_docked'] = rides_clean['start_station_id'].notna()
rides_clean['end_docked'] = rides_clean['end_station_id'].notna()

# Datetime features: start/end timestamps floored to the hour
rides_clean['start_hour'] = rides_clean['started_at'].dt.floor("H")
rides_clean['end_hour'] = rides_clean['ended_at'].dt.floor("H")

# Ride duration in minutes, rounded to 2 decimals. Computed in one step so the
# column is float from the start, instead of storing a timedelta column and
# then overwriting it in place with floats.
ride_duration = rides_clean['ended_at'] - rides_clean['started_at']
rides_clean['duration_mins'] = np.round(ride_duration.dt.total_seconds() / 60, 2)

In [34]:
# Write the cleaned rides to disk, leaving the row index out of the file.
rides_clean.to_csv(path_or_buf="data/df_rides.csv", index=False)

Drop all rides with a missing start or end station. These are a small fraction (5-10%) of all rides.
Looking at the dataframe, these seem to be almost exclusively e-bike rides, because e-bikes are dockless. They would be interesting to look at in another context, but since the resolution of their start and end locations is clearly different (they are simply assigned to a grid location), it's easier to exclude them from this project.
Given their dockless nature, their rental patterns may be markedly different, and the operational requirements or priorities will also be different.

In [13]:
# Keep only rides that have both a start and an end station id.
no_stations = rides_clean[['start_station_id', 'end_station_id']].isna().any(axis=1)
rides_clean = rides_clean[~no_stations]

# Time-based split. Training rides must start AND end on or before the cutoff;
# test rides start strictly after it. A ride that starts on or before the
# cutoff but ends after it lands in neither set.
train_cutoff = pd.Timestamp("2023-06-01")
started_by_cutoff = rides_clean['started_at'].le(train_cutoff)
ended_by_cutoff = rides_clean['ended_at'].le(train_cutoff)
rides_train = rides_clean[started_by_cutoff & ended_by_cutoff]
rides_test = rides_clean[rides_clean['started_at'].gt(train_cutoff)]

# Write both raw splits to disk without the row index.
rides_train.to_csv("data/rides_train_clean.csv", index=False)
rides_test.to_csv("data/rides_test_clean.csv", index=False)

In [8]:
def haversine_distance(X, Y, radius=3958.8):
    """
    Calculate the Haversine (great-circle) distance between pairs of points.

    Parameters
    ----------
    X, Y : array-like of shape (n, 2)
        Coordinates in degrees with columns ordered (lon, lat). Row i of X is
        paired with row i of Y. A single point may be given as a length-2
        sequence.
    radius : float, default 3958.8
        Sphere radius; the result is in the same unit. The default is the
        Earth's radius in miles (use 6371 for kilometers).

    Returns
    -------
    numpy.ndarray of shape (n,)
        Distance between each pair of points, in the unit of `radius`.
    """
    X = np.radians(np.atleast_2d(np.asarray(X, dtype=float)))
    Y = np.radians(np.atleast_2d(np.asarray(Y, dtype=float)))

    dlon = Y[:, 0] - X[:, 0]
    dlat = Y[:, 1] - X[:, 1]

    a = np.sin(dlat / 2) ** 2 + np.cos(X[:, 1]) * np.cos(Y[:, 1]) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make the
    # square roots below return NaN.
    a = np.clip(a, 0.0, 1.0)
    # c is the central angle in radians.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Arc length = angle * radius (the angle must be multiplied, not divided).
    return c * radius

# Great-circle distance of each ride, from its start and end coordinates
# (columns ordered lon, lat as haversine_distance expects).
start_locs = rides_clean[['start_lng', 'start_lat']].values
end_locs = rides_clean[['end_lng', 'end_lat']].values
# No unit conversion happens here: the value is whatever haversine_distance
# returns (the column name expects miles). assign() builds a new frame, which
# avoids a SettingWithCopyWarning when rides_clean is a filtered slice.
rides_clean = rides_clean.assign(
    ride_hav_dist_mi=haversine_distance(start_locs, end_locs)
)

Use boto3 to upload the train and test CSV files to AWS S3.

In [2]:
import boto3

s3 = boto3.client('s3')

# One entry per file to push: (local path, destination bucket, object key).
# NOTE(review): the train and test files go to different buckets, and these
# paths differ from the rides_*_clean.csv files written earlier in the
# notebook -- confirm both are intended.
uploads = [
    ('data/rides_train.csv', "sagemaker-cabi-data", 'rides_train.csv'),
    ('data/rides_test.csv', "cabi-train-test", 'rides_test.csv'),
]

# Upload each file to S3
for local_file_path, s3_bucket, s3_key in uploads:
    s3.upload_file(Filename=local_file_path, Bucket=s3_bucket, Key=s3_key)

In [1]:
import boto3

# Upload the gzipped model artefact file to its S3 bucket.
s3 = boto3.client('s3')

local_file_path, s3_bucket, s3_key = (
    'model_artefacts/darts_xgb_gzip.gz',
    "cabi-model-artefacts",
    'darts_xgb_gzip.gz',
)

s3.upload_file(local_file_path, s3_bucket, s3_key)