Bachelor thesis - Richard Borschke - 7337876 - University of Cologne - A spatio-temporal analysis of usage patterns in free-floating shared mobility


# Data Supplementation

This notebook supplements the prepared trip data with additional features needed for the analysis. The following steps are performed:
* Data loading of prepared trip files
* Assignment of time bucket of trip
* Calculating distance to city center
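* POI type supplementation (aeroway; arts, culture and entertainment; education; finance; food and drink; healthcare; history; leisure; office; sport; shop; tourism; transportation)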


Time span of the data: 18.11. to 29.02. (the data also contains 05.11., which is excluded).

The data set is missing 2 days (15.01. and 16.01.).

The car data is missing 4 days (06.12. to 09.12.).

### Imports

In [1]:
import pandas as pd
import glob
from haversine import haversine 

import numpy as np
from sklearn.neighbors import BallTree

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import osmnx as ox
ox.config(log_console=True, use_cache=True)
ox.__version__

#import seaborn as sns

'0.16.2'

### Load Data

In [2]:
# load monthly data of mode based on selected months
def load_data(mode, months):
    """Load and stack the prepared monthly trip files of one mode.

    Parameters
    ----------
    mode : str
        Mode name as used in the file names (e.g. 'car', 'bicycle', 'scooter').
    months : iterable of str
        Months to load, formatted as 'YYYYMM'.

    Returns
    -------
    pandas.DataFrame
        All rows of the matching files, in the order of `months`, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If `months` is empty or no file matches one of the requested months.
    """
    frames = []
    for month in months:
        pattern = 'Data/Modes/data_prepared_{}_{}.csv'.format(mode, month)
        # sorted() makes the row order independent of the file system listing order
        paths = sorted(glob.glob(pattern))
        if not paths:
            raise ValueError('No prepared data file found for pattern: {}'.format(pattern))
        frames.extend(pd.read_csv(path) for path in paths)
    if not frames:
        raise ValueError('No months given, nothing to load for mode: {}'.format(mode))
    # concatenate once at the end instead of growing the frame month by month
    return pd.concat(frames, ignore_index=True)

# load full data of mode
def load_data_mode_full(mode):
    """Read the prepared full-period trip file of one mode and return it as a DataFrame."""
    return pd.read_csv('Data/Modes/data_prepared_full_{}.csv'.format(mode))

In [3]:
# months covered by the data set, formatted as YYYYMM
# (also used further below when saving the supplemented data per month)
months = ['201911', '201912', '202001', '202002']
# optional: load the data from the monthly files instead of the full per-mode files
# set True if loading by months is desired
load_monthly_data = False
if (load_monthly_data):
    data_car = load_data('car', months)
    data_bicycle = load_data('bicycle', months)
    data_scooter = load_data('scooter', months)
    # stack all modes into one trip table with a fresh index
    data_full = pd.concat([data_car, data_bicycle, data_scooter], ignore_index=True)

In [4]:
# read in desired data: one prepared file per mode covering the whole period
# set True if full data set load is desired
load_full_data = True
if (load_full_data):
    data_car = load_data_mode_full('car')
    data_bicycle = load_data_mode_full('bicycle')
    data_scooter = load_data_mode_full('scooter')
    # stack all modes into one trip table with a fresh index
    data_full = pd.concat([data_car, data_bicycle, data_scooter], ignore_index=True)

### Time Bucket Assignment

### Calculating Distance to City Center

* Distance to city center from trip start
* Distance to city center from trip end

In [5]:
# calculate haversine distance to city center in m
def calculate_distance_to_city_center(data_full, center=(50.941724380890186, 6.958446824087053)):
    """Add the distance between trip origin/destination and the city center.

    Parameters
    ----------
    data_full : pandas.DataFrame
        Trip data with the columns 'latitude_start', 'longitude_start',
        'latitude_end' and 'longitude_end' (decimal degrees).
    center : tuple of float, optional
        (latitude, longitude) of the reference point in decimal degrees.
        The default is the coordinate pair this notebook uses as city center.

    Returns
    -------
    pandas.DataFrame
        A copy of `data_full` with the additional integer columns
        'dist_center_start' and 'dist_center_end' (meters, truncated).
        The input frame is not modified.
    """
    data_full_d = data_full.copy()
    for suffix in ('start', 'end'):
        # iterate the two coordinate columns directly; this avoids building a
        # Series per row as DataFrame.apply(axis=1) does and also works for empty frames
        coordinates = zip(data_full_d['latitude_' + suffix], data_full_d['longitude_' + suffix])
        # haversine() returns kilometers by default -> convert Km to meter
        meters = [int(haversine((lat, lon), center) * 1000) for lat, lon in coordinates]
        data_full_d['dist_center_' + suffix] = pd.Series(meters, index=data_full_d.index, dtype='int64')
    return data_full_d

In [6]:
# calculate distance to city center (Dom) for both origin and destination
# adds the columns 'dist_center_start' and 'dist_center_end' (meters) to the trip data
data_full = calculate_distance_to_city_center(data_full)

In [7]:
# show columns, non-null counts and dtypes of the trip data
data_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410295 entries, 0 to 410294
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 410295 non-null  object 
 1   provider           410295 non-null  object 
 2   vehicleType        410295 non-null  object 
 3   date_start         410295 non-null  object 
 4   time_start         410295 non-null  int64  
 5   date_end           410295 non-null  object 
 6   time_end           410295 non-null  int64  
 7   year               410295 non-null  int64  
 8   month              410295 non-null  int64  
 9   weekday            410295 non-null  int64  
 10  hour               410295 non-null  int64  
 11  weekend            410295 non-null  int64  
 12  longitude_start    410295 non-null  float64
 13  latitude_start     410295 non-null  float64
 14  longitude_end      410295 non-null  float64
 15  latitude_end       410295 non-null  float64
 16  co

### POI Type Supplementation

* Aeroway
* Arts, culture and entertainment
* Education
* Finance
* Food and drink
* Healthcare
* History
* Leisure
* Office
* Sport
* Shop
* Tourism
* Transportation

In [19]:
# prepare POI data for assignment to trip data
def prepare_POI_data(POI_data):
    """Reduce a downloaded POI table to point features and add helper columns.

    The table is modified in place and also returned.

    Parameters
    ----------
    POI_data : geopandas.GeoDataFrame
        POI table with an 'element_type' column and a point geometry for
        every row whose element type is 'node'.

    Returns
    -------
    geopandas.GeoDataFrame
        The same object, restricted to node rows and to the columns
        'unique_id', 'osmid', 'geometry', 'name', 'amenity' (where present),
        renumbered 0..n-1, with the added columns 'longitude', 'latitude'
        and 'usage_count' (initialised with 0).
    """
    # get names of indexes for which column element_type does not equal node
    non_node_rows = POI_data[POI_data['element_type'] != 'node'].index

    # delete these row indexes from dataFrame
    POI_data.drop(index=non_node_rows, inplace=True)

    # drop all columns except certain ones
    # (keyword form: passing the axis positionally is rejected by pandas >= 2.0)
    columns_to_keep = ['unique_id', 'osmid', 'geometry', 'name', 'amenity']
    POI_data.drop(columns=POI_data.columns.difference(columns_to_keep), inplace=True)

    # renumber the remaining rows so that row labels equal row positions;
    # the neighbour search on this table reports row positions
    POI_data.reset_index(drop=True, inplace=True)

    # create longitude and latitude columns
    POI_data['longitude'] = POI_data.geometry.x
    POI_data['latitude'] = POI_data.geometry.y

    # create usage count column (check how often and if POI is used as neighbor of trip origin or destination)
    POI_data['usage_count'] = 0

    return POI_data

# supplement data with POI types
def supplement_trip_data_with_POI_data(data_trips, trips_start_radians, trips_end_radians, POI_data, POI_name, distance_in_meters=300):
    """Count the POIs of one category around every trip origin and destination.

    Both `data_trips` and `POI_data` are modified in place and also returned.

    Parameters
    ----------
    data_trips : pandas.DataFrame
        Trip data; receives the columns '<POI_name>_start' and '<POI_name>_end'.
    trips_start_radians, trips_end_radians : numpy.ndarray
        (n_trips, 2) arrays of (latitude, longitude) in radians, in the same
        row order as `data_trips`.
    POI_data : pandas.DataFrame
        POI table with the columns 'latitude', 'longitude' (decimal degrees)
        and 'usage_count'.
    POI_name : str
        Category name; used as column prefix and written to 'POI_type'.
    distance_in_meters : float, optional
        Search radius around each trip origin/destination (default 300).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `data_trips` with the two count columns and `POI_data` with the
        increased 'usage_count' and the added 'POI_type' column.
    """
    start_column = '{}_start'.format(POI_name)
    end_column = '{}_end'.format(POI_name)

    number_of_POIs = len(POI_data)
    if number_of_POIs == 0:
        # a tree cannot be built without points; no POI means no neighbour anywhere
        data_trips[start_column] = 0
        data_trips[end_column] = 0
        POI_data['POI_type'] = POI_name
        return data_trips, POI_data

    # extract lat/long pairs as numpy array for POI data and
    # create the ball tree with haversine metric
    POI_radians = np.radians(POI_data[["latitude", "longitude"]].values)
    tree = BallTree(POI_radians, metric='haversine')

    # the haversine metric measures on the unit sphere, so the search radius
    # is expressed as an angle (meters / approx. mean earth radius)
    earth_radius_in_meters = 6371000
    radius = distance_in_meters / earth_radius_in_meters

    # one array of POI row positions per trip; only the number of neighbours is
    # needed, so the distances are not requested
    neighbors_start = tree.query_radius(trips_start_radians, r=radius)
    neighbors_end = tree.query_radius(trips_end_radians, r=radius)

    # create count column for POI for trip data start and end
    data_trips[start_column] = np.array([len(hits) for hits in neighbors_start], dtype=np.int64)
    data_trips[end_column] = np.array([len(hits) for hits in neighbors_end], dtype=np.int64)

    # calculate usage count for POI data: the tree reports row positions, so the
    # hits are tallied by position (independent of the index labels of POI_data)
    all_hits = list(neighbors_start) + list(neighbors_end)
    if all_hits:
        hit_positions = np.concatenate(all_hits).astype(np.intp)
        usage = np.bincount(hit_positions, minlength=number_of_POIs)
    else:
        usage = np.zeros(number_of_POIs, dtype=np.int64)
    POI_data['usage_count'] = POI_data['usage_count'].to_numpy() + usage

    # add identifier column
    POI_data['POI_type'] = POI_name

    return data_trips, POI_data

In [20]:
# extract lat/long pairs as numpy array for trip data start and end
# (column order latitude, longitude; rows in the same order as data_full)
trips_start_gps = data_full[["latitude_start", "longitude_start"]].values
trips_end_gps = data_full[["latitude_end", "longitude_end"]].values

# transform lat/long pairs to radians, as needed for the haversine neighbour search
trips_start_radians = np.radians(trips_start_gps)
trips_end_radians = np.radians(trips_end_gps)

In [21]:
# define place for data mining (passed as `place` to the OpenStreetMap POI queries below)
place = 'Köln, Germany'

In [24]:
import time

# measure the wall-clock run time of retrieval and assignment for one POI category
start = time.time()

# aeroway POI (airport terminals), kept as its own category next to the transportation POI
# NOTE(review): an earlier note said this category uses a different assignment radius and is
# handled separately, but the call below passes no radius of its own - confirm the intended radius
tags={'aeroway': 'terminal'}
# retrieving openstreetmap data and preparing data
POI_aero = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_aero = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_aero, 'POI_aero')  

end = time.time()
# elapsed time in seconds
print(end - start)

POI_aero

  and should_run_async(code)


4.255924940109253


Unnamed: 0,unique_id,osmid,name,geometry,longitude,latitude,usage_count,POI_type
0,node/27296045,27296045,Terminal 2 Fluggastbereich D,POINT (7.11970 50.88054),7.119703,50.880543,3477,POI_aero


In [12]:
# arts, culture and entertainment POI
tags = {'amenity': ['arts_centre','cinema', 'brothel', 'casino', 'community_centre', 'gambling', 'love_hotel', 'nightclub', 'planetarium', 'public_bookcase', 'social_centre', 'stripclub', 'studio', 'swingerclub', 'theatre']}
# retrieving openstreetmap data and preparing data
POI_art_culture_entertainment = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_art_culture_entertainment = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_art_culture_entertainment, 'POI_art_culture_entertainment')  
# display the prepared POI table of this category
POI_art_culture_entertainment

  and should_run_async(code)


Unnamed: 0,unique_id,osmid,amenity,name,geometry,longitude,latitude,usage_count
0,node/54020150,54020150,theatre,Atelier Theater,POINT (6.93548 50.93259),6.935482,50.932594,11208
1,node/247382429,247382429,planetarium,Planetarium und Sternwarte Köln,POINT (6.95649 50.96660),6.956487,50.966600,4982
2,node/256221801,256221801,cinema,Metropolis,POINT (6.95821 50.95108),6.958213,50.951083,9204
3,node/257905592,257905592,theatre,Gloria Theater,POINT (6.94495 50.93749),6.944948,50.937488,15291
4,node/258183467,258183467,community_centre,Don-Bosco-Club Köln-Mülheim,POINT (7.01081 50.97438),7.010811,50.974384,1811
...,...,...,...,...,...,...,...,...
200,node/7704871144,7704871144,public_bookcase,,POINT (6.90917 50.94801),6.909171,50.948012,4856
201,node/7812771513,7812771513,gambling,Lido Spielhalle,POINT (7.06945 50.97841),7.069447,50.978415,0
202,node/7861149238,7861149238,studio,Filmproduktion Peter Schüttemeyer,POINT (6.93241 50.94004),6.932407,50.940044,7911
203,node/8097066616,8097066616,community_centre,Bürgerzentrum Nippes - Turmstraße,POINT (6.95050 50.96453),6.950498,50.964528,6059


In [13]:
# education POI
tags = {'amenity': ['college','driving_school', 'kindergarten', 'language_school', 'library', 'music_school', 'school', 'university']}
# retrieving openstreetmap data and preparing data
POI_education = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_education = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_education, 'POI_education')  
# display the prepared POI table of this category
POI_education

  and should_run_async(code)


Unnamed: 0,unique_id,osmid,geometry,amenity,name,longitude,latitude,usage_count
0,node/75874741,75874741,POINT (6.92635 50.95670),school,Lauder-Morijah-Grundschule,6.926346,50.956699,3002
1,node/160695366,160695366,POINT (6.95199 50.94475),library,Erzbischöfliche Diözesan- und Dombibliothek,6.951991,50.944755,6296
2,node/215423659,215423659,POINT (6.90924 50.96505),school,Montessori-Grundschule Ossendorf,6.909241,50.965055,1703
3,node/221175135,221175135,POINT (6.89374 50.97834),school,Bildungszentrum Butzweiler Hof,6.893744,50.978343,419
4,node/223292416,223292416,POINT (6.92708 50.97106),kindergarten,,6.927081,50.971057,1292
...,...,...,...,...,...,...,...,...
303,node/8126692679,8126692679,POINT (6.87622 50.94512),school,Anna-Freud-Förderschule,6.876224,50.945117,505
304,node/8156001437,8156001437,POINT (6.94488 50.93418),music_school,drummer's focus,6.944881,50.934182,10695
305,node/8188016679,8188016679,POINT (6.88193 50.95509),kindergarten,Kindergruppe Sonnenstrahlen e.V.,6.881928,50.955092,232
306,node/8190632569,8190632569,POINT (6.94920 50.93439),library,Stadtbibliothek Köln,6.949196,50.934391,11935


In [14]:
# show columns, non-null counts and dtypes of the trip data including the POI count columns added so far
data_full.info()
# number of trips whose 'POI_education_start' count is zero
len(data_full[data_full['POI_education_start']==0])

  and should_run_async(code)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410295 entries, 0 to 410294
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   410295 non-null  object 
 1   provider             410295 non-null  object 
 2   vehicleType          410295 non-null  object 
 3   date_start           410295 non-null  object 
 4   time_start           410295 non-null  int64  
 5   date_end             410295 non-null  object 
 6   time_end             410295 non-null  int64  
 7   year                 410295 non-null  int64  
 8   month                410295 non-null  int64  
 9   weekday              410295 non-null  int64  
 10  hour                 410295 non-null  int64  
 11  weekend              410295 non-null  int64  
 12  longitude_start      410295 non-null  float64
 13  latitude_start       410295 non-null  float64
 14  longitude_end        410295 non-null  float64
 15  latitude_end     

144294

In [15]:
# finance POI
tags = {'amenity': ['atm','bank', 'bureau_de_change']}
# retrieving openstreetmap data and preparing data
POI_finance = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_finance = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_finance, 'POI_finance')  
# display the prepared POI table of this category
POI_finance



Unnamed: 0,unique_id,osmid,amenity,name,geometry,longitude,latitude,usage_count
0,node/54749669,54749669,bank,Sparkasse KölnBonn,POINT (6.94095 50.93541),6.940948,50.935411,17851
1,node/196170706,196170706,bank,Sparkasse,POINT (6.91989 50.95370),6.919886,50.953704,5935
2,node/224445220,224445220,bank,Sparkasse,POINT (6.90095 50.95752),6.900947,50.957519,3505
3,node/230226555,230226555,bank,Sparkasse,POINT (6.89975 50.96789),6.899752,50.967888,1025
4,node/232284761,232284761,atm,Sparkasse KölnBonn,POINT (6.95012 50.96001),6.950118,50.960011,4396
...,...,...,...,...,...,...,...,...
358,node/8131880605,8131880605,atm,,POINT (7.00520 50.96093),7.005200,50.960926,2267
359,node/8132374205,8132374205,atm,,POINT (6.93897 50.93838),6.938973,50.938383,20195
360,node/8132374206,8132374206,atm,,POINT (6.93907 50.93876),6.939065,50.938760,20152
361,node/8132435066,8132435066,atm,,POINT (6.94059 50.93533),6.940585,50.935334,18215


In [16]:
# food and drink POI
tags = {'amenity': ['bar','biergarten','cafe','drinking_water','fast_food','food_court','ice_cream','pub','restaurant', 'internet_cafe']}
# retrieving openstreetmap data and preparing data
POI_food_drink = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_food_drink = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_food_drink, 'POI_food_drink')  
# display the prepared POI table of this category
POI_food_drink

  and should_run_async(code)


KeyboardInterrupt: 

In [None]:
# healthcare POI (selected amenity values plus emergency ward entrances)
tags = {'amenity': ['clinic','dentist', 'doctors', 'hospital', 'nursing_home', 'pharmacy', 'social_facility', 'veterinary'],
       'emergency': ['emergency_ward_entrance']}
# retrieving openstreetmap data and preparing data
POI_healthcare = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_healthcare = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_healthcare, 'POI_healthcare')  
# display the prepared POI table of this category
POI_healthcare

In [None]:
# history POI (every value of the 'historic' key)
tags = {'historic': True}
# retrieving openstreetmap data and preparing data
POI_history = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_history = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_history, 'POI_history')  
# display the prepared POI table of this category
POI_history

In [None]:
# leisure POI (every value of the 'leisure' key)
tags = {'leisure': True}
# retrieving openstreetmap data and preparing data
POI_leisure = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_leisure = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_leisure, 'POI_leisure')  
# display the prepared POI table of this category
POI_leisure

In [None]:
# office POI (every value of the 'office' key)
tags = {'office': True}
# retrieving openstreetmap data and preparing data
POI_office = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_office = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_office, 'POI_office')  
# display the prepared POI table of this category
POI_office

In [None]:
# sport POI (every value of the 'sport' key)
tags = {'sport': True}
# retrieving openstreetmap data and preparing data
POI_sport = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_sport = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_sport, 'POI_sport')  
# display the prepared POI table of this category
POI_sport

In [None]:
# shop POI (every value of the 'shop' key)
tags = {'shop': True}
# retrieving openstreetmap data and preparing data
POI_shop = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_shop = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_shop, 'POI_shop')  
# display the prepared POI table of this category
POI_shop

In [None]:
# tourism POI (every value of the 'tourism' key)
tags = {'tourism': True}
# retrieving openstreetmap data and preparing data
POI_tourism = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_tourism = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_tourism, 'POI_tourism')  
# display the prepared POI table of this category
POI_tourism

In [None]:
# transportation POI (selected amenity, public_transport and railway values)
tags = {'amenity': ['bicycle_rental','boat_rental', 'boat_sharing', 'bus_station', 'car_rental', 'car_sharing', 'ferry_terminal', 'taxi'],
       'public_transport':['station'],
       'railway':['platform', 'station', 'tram_stop', 'subway_entrance']}
# retrieving openstreetmap data and preparing data
POI_transport = prepare_POI_data(ox.pois.pois_from_place(place=place, tags=tags))
# supplement trip data with POI data selection
data_full, POI_transport = supplement_trip_data_with_POI_data(data_full, trips_start_radians, trips_end_radians, POI_transport, 'POI_transport')  
# display the prepared POI table of this category
POI_transport

In [None]:
# show supplemented trip data
# lift the column display limit so that all columns are rendered
# (this changes the pandas display option for the rest of the session)
pd.options.display.max_columns = None
display(data_full)

### Save data sets

In [27]:
# save data by mode and months
def save_supplemented_data(months, data_supplemented):
    """Write the supplemented trips to one CSV file per month.

    `months` holds 'YYYYMM' strings. The mode used in the file name is read
    from the first row's 'vehicleType' (the callers in this notebook pass
    single-mode tables). Rows are selected via the 'month' and 'year' columns
    and written to 'Data/Modes/data_supplemented_<mode>_<YYYYMM>.csv'.
    """
    for month in months:
        # split 'YYYYMM' into its numeric month and year parts
        month_number = int(month[4:])
        year_number = int(month[:4])
        mode_name_file = data_supplemented['vehicleType'].iloc[0]
        # select the rows of this month and write them without the index column
        in_month = (data_supplemented['month'] == month_number) & (data_supplemented['year'] == year_number)
        monthly_rows = data_supplemented[in_month]
        target_path = 'Data/Modes/data_supplemented_{}_{}.csv'.format(mode_name_file, month)
        monthly_rows.to_csv(target_path, index = False)

# save data by mode only
def save_supplemented_data_full(data_supplemented):
    """Write the supplemented trips to one CSV file per mode (car, bicycle, scooter)."""
    targets = (
        ('car', 'Data/Modes/data_supplemented_full_car.csv'),
        ('bicycle', 'Data/Modes/data_supplemented_full_bicycle.csv'),
        ('scooter', 'Data/Modes/data_supplemented_full_scooter.csv'),
    )
    for mode, path in targets:
        # select the rows of this mode and write them without the index column
        rows_of_mode = data_supplemented[data_supplemented['vehicleType']==mode]
        rows_of_mode.to_csv(path, index = False)

In [28]:
# save POI datasets
# combine the POI tables of all categories into one table; this is done regardless of the
# save flag because the descriptive statistics further below also read POI_data
# (fixed: the transportation table is named POI_transport, not POI_transporation)
POI_data = pd.concat([POI_aero, POI_art_culture_entertainment, POI_education, POI_finance, 
                      POI_food_drink, POI_healthcare, POI_history, POI_leisure, POI_office,
                      POI_sport, POI_shop, POI_tourism, POI_transport], ignore_index=True)
# set True if saving is desired
save_data_POI = False
if (save_data_POI):
    POI_data.to_csv('Data/Modes/POI_data.csv', index = False)

In [29]:
# save supplemented data sets based on selected months
# split data by month to avoid file size > 100 MB due to GitHub rules
# each call receives the trips of one mode only, so every file contains a single mode
# set True if saving is desired
save_data = False
if (save_data):
    save_supplemented_data(months, data_full[data_full['vehicleType']=='car'])
    save_supplemented_data(months, data_full[data_full['vehicleType']=='bicycle'])
    save_supplemented_data(months, data_full[data_full['vehicleType']=='scooter'])

In [30]:
# save supplemented data sets in one file per mode (car, bicycle, scooter)
# set True if saving is desired
save_data_full = False
if (save_data_full):
    save_supplemented_data_full(data_full)

### Descriptive statistics

In [None]:
# True if the combined POI table contains at least one missing value
POI_data.isnull().values.any()

In [None]:
# True if the supplemented trip data contains at least one missing value
data_full.isnull().values.any()

In [None]:
# number of car trips with weekday == 0 and hour == 2
len(data_car[(data_car['weekday'] == 0) & (data_car['hour'] == 2)])