In [1]:
import boto3
import io
import numpy as np
import pandas as pd

In [2]:
# Handle on the S3 bucket that holds the trip-data exports.
s3 = boto3.resource('s3')
bucket = s3.Bucket("cabi-tripdata")

# Download every object in the bucket and parse its bytes as a UTF-8 CSV.
# df_rides ends up as a list with one DataFrame per object, in listing order.
df_rides = [
    pd.read_csv(io.BytesIO(obj.get()['Body'].read()), encoding='utf8')
    for obj in bucket.objects.all()
]

#df_rides = pd.concat(df_rides, ignore_index=True)

In [3]:
# Combine the per-file frames into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# file keeps its own 0..n-1 row labels, so labels repeat across files and
# label-based operations (e.g. DataFrame.drop(index=...)) would act on rows
# from several files at once.
df_rides_all = pd.concat(df_rides, ignore_index=True)

#### Clean up and engineer some logical and datetime features

In [4]:
# Parse the ride start/end timestamp columns into pandas datetimes.
time_cols = ['started_at', 'ended_at']
for time_col in time_cols:
    df_rides_all[time_col] = pd.to_datetime(df_rides_all[time_col])

In [10]:
# Remove rides whose end location is missing: a NaN coordinate, or a 0 in
# either coordinate.
# A boolean mask is used rather than DataFrame.drop(<index labels>): drop()
# removes *every* row carrying a matching label, so if df_rides_all's index
# holds repeated labels (which happens when frames are concatenated without
# ignore_index=True) rides with valid end locations would be discarded too.
end_location_missing = (
    df_rides_all['end_lat'].isna()
    | df_rides_all['end_lng'].isna()
    | (df_rides_all['end_lat'] == 0)
    | (df_rides_all['end_lng'] == 0)
)
# The removed rows, kept under the original name for inspection.
missing_end_location = df_rides_all.loc[end_location_missing]

# Keep the remaining rides and drop the ride ID column; drop() returns a new
# frame, so the column assignments below do not write into df_rides_all.
rides_clean = df_rides_all.loc[~end_location_missing].drop(columns='ride_id')

# Boolean flags derived from the categorical / station-id columns.
rides_clean['is_ebike'] = rides_clean['rideable_type'] == 'electric_bike'
rides_clean['is_member'] = rides_clean['member_casual'] == 'member'
rides_clean['start_docked'] = rides_clean['start_station_id'].notna()
rides_clean['end_docked'] = rides_clean['end_station_id'].notna()

# Start/end timestamps floored to the hour. Lowercase "h" is the current
# frequency alias; uppercase "H" is deprecated as of pandas 2.2.
rides_clean['start_hour'] = rides_clean['started_at'].dt.floor("h")
rides_clean['end_hour'] = rides_clean['ended_at'].dt.floor("h")

# Ride duration in minutes, rounded to 2 decimals. Computed in one step so the
# column is created as float directly instead of being created as a timedelta
# and then overwritten in place with values of a different dtype.
ride_duration = rides_clean['ended_at'] - rides_clean['started_at']
rides_clean['duration_mins'] = (ride_duration.dt.total_seconds() / 60).round(2)

In [12]:
# Write the cleaned rides table to CSV, leaving out the row index.
cleaned_rides_path = "data/df_rides.csv"
rides_clean.to_csv(cleaned_rides_path, index=False)

In [13]:
# Keep only rides that have both a start and an end station id.
no_stations = rides_clean[['start_station_id', 'end_station_id']].isna().any(axis=1)
rides_clean = rides_clean[~no_stations]

# Time-based split around the cutoff:
#   train -> rides that both started and ended at or before the cutoff
#   test  -> rides that started strictly after the cutoff
# A ride that starts at/before the cutoff but ends after it lands in neither set.
train_cutoff = pd.Timestamp(year=2023, month=6, day=1)
started_by_cutoff = rides_clean['started_at'] <= train_cutoff
ended_by_cutoff = rides_clean['ended_at'] <= train_cutoff
started_after_cutoff = rides_clean['started_at'] > train_cutoff

rides_train = rides_clean[started_by_cutoff & ended_by_cutoff]
rides_test = rides_clean[started_after_cutoff]

# Write both splits to CSV (row index omitted).
for split_frame, split_path in (
    (rides_train, "data/rides_train_clean.csv"),
    (rides_test, "data/rides_test_clean.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [8]:
def haversine_distance(X, Y, radius=3958.8):
    """
    Great-circle (haversine) distance between paired points on a sphere.

    Parameters
    ----------
    X, Y : array-like
        Coordinates in degrees with the last axis ordered (lon, lat).
        Typically two n x 2 arrays whose rows are paired; a single point of
        shape (2,) is also accepted.
    radius : float, default 3958.8
        Sphere radius. The result is expressed in the same unit; the default
        is the Earth's mean radius in miles (use 6371 for kilometres).

    Returns
    -------
    numpy.ndarray
        Distance for each pair of rows (a scalar for single points).
    """
    X = np.radians(np.asarray(X, dtype=float))
    Y = np.radians(np.asarray(Y, dtype=float))

    lon1, lat1 = X[..., 0], X[..., 1]
    lon2, lat2 = Y[..., 0], Y[..., 1]
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make the
    # square roots below produce NaN.
    a = np.clip(a, 0.0, 1.0)
    # Central angle between the two points, in radians.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Arc length = central angle * radius (the angle must be multiplied by the
    # radius, not divided by it).
    return c * radius

# Great-circle distance of each ride, from its start point to its end point.
# Coordinates are passed as (lng, lat) columns, the order haversine_distance
# documents for its inputs.
start_locs = rides_clean.loc[:, ['start_lng', 'start_lat']].to_numpy()
end_locs = rides_clean.loc[:, ['end_lng', 'end_lat']].to_numpy()
# No extra unit conversion happens here; the value is stored exactly as
# returned by haversine_distance.
rides_clean['ride_hav_dist_mi'] = haversine_distance(start_locs, end_locs)

In [2]:
import boto3

# S3 client used for the file uploads below.
s3 = boto3.client('s3')

# One (local file, destination bucket, object key) entry per upload, in order.
# NOTE(review): the two files go to different buckets, and these local paths
# differ from the "*_clean.csv" files written earlier in this notebook --
# confirm both are intended.
upload_jobs = [
    ('data/rides_train.csv', "sagemaker-cabi-data", 'rides_train.csv'),
    ('data/rides_test.csv', "cabi-train-test", 'rides_test.csv'),
]

for local_file_path, s3_bucket, s3_key in upload_jobs:
    # Upload the file to S3
    s3.upload_file(Filename=local_file_path, Bucket=s3_bucket, Key=s3_key)