In [1]:
from shapely.geometry import Point, LineString, Polygon
from shapely import wkt
import pandas as pd
import geopandas as gpd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
from sqlalchemy import create_engine, text
import time

In [2]:
# create postgresql connection
database_name = 'scooters'
connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

engine = create_engine(connection_string)

In [3]:
# start timestamp
start = time.time()

# get data
query = '''
    SELECT	pubtimestamp
            ,companyname AS company
            ,sumdid
            ,tripduration
            ,tripdistance
            ,startdate
            ,enddate
            ,startlatitude
            ,startlongitude
            ,endlatitude
            ,endlongitude
            ,triproute
    FROM	trips
'''

with engine.connect() as connection:    
    trips_sql = pd.read_sql(text(query), con = connection)

# end timestamp
end = time.time()
# execution time
total_time = end - start
print("\n"+ str(total_time))


34.11723875999451


In [4]:
# create df
trips = trips_sql

In [5]:
# start timestamp
start = time.time()

# create empty dataframe
start_to_end = pd.DataFrame()

# chage the format to convert df to gdf
start_to_end['start_geometry'] = trips.loc[:, ['startlatitude', 'startlongitude']]\
    .apply(lambda x: Point((float(x.startlongitude), float(x.startlatitude))), axis=1)
start_to_end['end_geometry'] = trips.loc[:, ['endlatitude', 'endlongitude']]\
    .apply(lambda x: Point((float(x.endlongitude), float(x.endlatitude))), axis=1)

# end timestamp
end = time.time()
# execution time check
total_time = end - start
print("\n"+ str(total_time))


148.88873672485352


In [6]:
# add index number to df as identifier
start_to_end.reset_index(inplace = True)

In [7]:
# convert trip route(start-end only) to geospatial data
start_to_end = gpd.GeoDataFrame(start_to_end, crs = 4326, geometry = start_to_end['start_geometry'])

In [8]:
# get bus stops data
bus_stops = pd.read_csv('../data/Regional_Transportation_Authority_Bus_Stops.csv')

In [9]:
# df cleanup
bus_stops[['lat', 'lng']] = bus_stops['Mapped Location']\
    .map(lambda x : re.sub('\(', '', x))\
    .map(lambda x : re.sub('\,', '', x))\
    .map(lambda x : re.sub('\)', '', x))\
    .str.split(' ', expand = True)

In [10]:
# chage the format to convert to gdf
bus_stops['geometry'] = bus_stops.apply(lambda x: Point((float(x.lng), 
                                                         float(x.lat))), 
                                        axis=1)
bus_stops.drop(columns = ['Mapped Location', 'lat', 'lng'], inplace = True)

In [11]:
# convert trip route(start-end only) to geospatial data
bus_stops = gpd.GeoDataFrame(bus_stops, crs = 4326, geometry = bus_stops['geometry'])

In [12]:
# 1 mile = 1 lat/long
buffer_in_meters = 1*(1/60)

# add buffer to bus stop == change POINT geometry to POLYGON
bus_stops['geometry'] = bus_stops['geometry'].buffer(buffer_in_meters)
bus_stops = bus_stops[['Stop ID Number', 'geometry']]


  bus_stops['geometry'] = bus_stops['geometry'].buffer(buffer_in_meters)


In [13]:
# start timestamp
start = time.time()

# geospatial join 
last_mile = gpd.sjoin(start_to_end, bus_stops, op = 'within', how = 'left')
last_mile

end = time.time()

# end timestamp
total_time = end - start
print("\n"+ str(total_time))


333.6524748802185


In [53]:
# start timestamp
start = time.time()

# count of last mile trip = numbers of trip start within 1 mile from bus stop
yes_last_miles = last_mile.loc[last_mile['index_right'].notnull()].groupby(['index']).count()

# count of NOT_last_mile trip = numbers of trip start within 1 mile from bus stop
not_last_mile_list = last_mile.loc[last_mile['index_right'].isna()]

# end timestamp
end = time.time()
# execution time check
total_time = end - start
print("\n"+ str(total_time))


12.346878290176392


In [49]:
# numbers of trip that has no bus stop within 1 miles
not_last_mile_list.drop(columns = ['start_geometry', 'end_geometry', 'geometry', 'index_right', 'Stop ID Number'], inplace = True)
not_last_mile_list.rename(columns = {'index' : 'trip_no'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_last_mile_list.drop(columns = ['start_geometry', 'end_geometry', 'geometry', 'index_right', 'Stop ID Number'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_last_mile_list.rename(columns = {'index' : 'trip_no'}, inplace = True)


In [50]:
not_last_mile_list

Unnamed: 0,trip_no
15,15
43,43
61,61
75,75
91,91
...,...
565505,565505
565506,565506
565507,565507
565508,565508


In [55]:
# trips start within 1 mile from bus stop and numbers of bus stop within a miles
yes_last_miles.reset_index(inplace = True, drop = True)
yes_last_miles.drop(columns = ['start_geometry', 'end_geometry', 'geometry'], inplace = True)
yes_last_miles.rename(columns = {'index_right' : 'trip_no', 'Stop ID Number' : 'numbers_of_bus_stops_within_1miles'}, inplace = True)

yes_last_miles

Unnamed: 0,trip_no,numbers_of_bus_stops_within_1miles
0,16,16
1,26,26
2,26,26
3,8,8
4,18,18
...,...,...
509648,32,32
509649,27,27
509650,36,36
509651,22,22
