Bachelor thesis - Richard Borschke - 7337876 - University of Cologne - A spatio-temporal analysis of usage patterns in free-floating shared mobility


# Data Supplementation

This notebook contains the supplementation of data, needed for analysis. Following steps are performed:
* Data loading of prepared trip files
* Assignment of time bucket of trip
* Calculating distance to city center
* POI type supplementation (WHICH TYPES??)


time (18.11 to 29.2  +  05.11.) --> 5.11. excluded

data set misses 2 days (16. and 15. 01.)

car misses 4 days (06. to 09.12.)

### Imports

In [1]:
import pandas as pd
import glob
from haversine import haversine 

import osmnx as ox

import numpy as np
#import seaborn as sns
ox.config(log_console=True, use_cache=True)
ox.__version__

'0.16.2'

### Methods

In [2]:
# load monthly data of mode based on selected months
def load_data(mode, months):
    first_read = True
    for month in months:
        data_month = pd.concat([pd.read_csv(file) for file in glob.glob('Data/Modes/data_prepared_{}_{}.csv'.format(mode, month))], ignore_index = True)
        if (first_read):
            data_full = data_month
            first_read = False
        else:
            data_full = pd.concat([data_full, data_month], ignore_index=True)
    return data_full

# load full data of mode
def load_data_mode_full(mode):
    data_mode_full = pd.read_csv('Data/Modes/data_prepared_full_{}.csv'.format(mode))
    return data_mode_full

# calculate haversine distance to city center in m
def calculate_distance_to_city_center(data_full):
    data_full_d = data_full.copy()
    data_full_d['dist_center_start'] = data_full_d.apply(lambda r: int(haversine((r['latitude_start'], r['longitude_start']), (50.941724380890186, 6.958446824087053))*1000), axis=1) #convert Km to meter
    data_full_d['dist_center_end'] = data_full_d.apply(lambda r: int(haversine((r['latitude_end'], r['longitude_end']), (50.941724380890186, 6.958446824087053))*1000), axis=1) #convert Km to meter
    return data_full_d   

# supplement data with POI types
def supplement_data(data_mode):
    
    return data_mode

# save data by mode and months
def save_supplemented_data(months, data_supplemented):
    for month in months:
        # extract month, year and mode
        only_month = int(month[4:])
        only_year = int(month[:4])
        mode_name_file = data_supplemented['vehicleType'].iloc[0]
        # create dataframe and save it as csv file
        split_cond = [(data_supplemented['month'] == only_month) & (data_supplemented['year'] == only_year)]
        data_supplemented[split_cond[0]].to_csv('Data/Modes/data_supplemented_{}_{}.csv'.format(mode_name_file, month), index = False)

# save data by mode only
def save_supplemented_data_full(data_supplemented):
    mode_name_file = data_supplemented['vehicleType'].iloc[0]
    # create dataframe and save it as csv file
    data_supplemented.to_csv('Data/Modes/data_supplemented_full_{}.csv'.format(mode_name_file), index = False)

### Load Data

In [3]:
# define months for data loading (YYYYMM)
months = ['201911', '201912', '202001', '202002']
#load data by months
load_monthly_data = False
if (load_monthly_data):
    data_car = load_data('car', months)
    data_bicycle = load_data('bicycle', months)
    data_scooter = load_data('scooter', months)
    data_full = pd.concat([data_car, data_bicycle, data_scooter])

In [4]:
# read in desired data
# set True if full data set load is desired
load_full_data = True
if (load_full_data):
    data_car = load_data_mode_full('car')
    data_bicycle = load_data_mode_full('bicycle')
    data_scooter = load_data_mode_full('scooter')
    data_full = pd.concat([data_car, data_bicycle, data_scooter])

### Time Bucket Assignment

### Calculating Distance to City Center

In [5]:
# calculate distance to city center (Dom) for both origin and destination
data_full_dist = calculate_distance_to_city_center(data_full)

In [6]:
data_full_dist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 410295 entries, 0 to 134387
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 410295 non-null  object 
 1   provider           410295 non-null  object 
 2   vehicleType        410295 non-null  object 
 3   date_start         410295 non-null  object 
 4   time_start         410295 non-null  int64  
 5   date_end           410295 non-null  object 
 6   time_end           410295 non-null  int64  
 7   year               410295 non-null  int64  
 8   month              410295 non-null  int64  
 9   weekday            410295 non-null  int64  
 10  hour               410295 non-null  int64  
 11  weekend            410295 non-null  int64  
 12  longitude_start    410295 non-null  float64
 13  latitude_start     410295 non-null  float64
 14  longitude_end      410295 non-null  float64
 15  latitude_end       410295 non-null  float64
 16  co

### POI Type Supplementation

In [76]:
place = 'Köln, Germany'
#help(ox.pois_from_place)

In [77]:
tagss = {'amenity': ['restaurant', 'pub', 'hotel'],
        'building': 'hotel',
        'tourism': 'hotel'}
#tags={'amenity':True}
tags={'aeroway': 'aerodrome'}
#tags={'emergency': 'emergency_ward_entrance'}

all_pois = ox.pois.pois_from_place(place=place, tags=tags)
len(all_pois)



2

In [63]:
all_pois.head()

Unnamed: 0,unique_id,osmid,element_type,source,geometry,nodes,aeroway,icao,name,operator,...,name:en,name:fr,name:pl,name:ru,name:tr,passengers,ref,short_name,type,wheelchair
0,way/30394413,30394413,way,yahoo,"POLYGON ((7.00148 51.01885, 7.00344 51.01675, ...","[335502616, 335717193, 335502618, 335502622, 3...",aerodrome,EDKL,Flugplatz Kurtekotten,Luftsportklub Bayer Leverkusen,...,,,,,,,,,,
1,relation/2269304,2269304,relation,,"POLYGON ((7.10959 50.87253, 7.10943 50.87562, ...","[[[1611111803, 2149996723, 2149996746, 2149996...",aerodrome,EDDK,Flughafen Köln/Bonn,,...,Cologne Bonn Airport,Aéroport Konrad-Adenauer de Cologne/Bonn,Port lotniczy Kolonia/Bonn,Аэропорт Кёльн/Бонн,Köln Bonn Havalimanı,9850000.0,CGN,FKB,multipolygon,yes


In [74]:
# aeroway POI for transportation but different radius for assignment (handeled seperately)
tags={'aeroway': 'terminal'}

POI_aero = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_aero)
#POI_aero[POI_aero['element_type']=='node'].info()

  and should_run_async(code)


11

In [83]:
# food and drink POI
tags = {'amenity': ['bar','biergarten','cafe','drinking_water','fast_food','food_court','ice_cream','pub','restaurant', 'internet_cafe']}
POI_food_drink = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_food_drink)

  and should_run_async(code)


3095

In [67]:
# education POI
tags = {'amenity': ['college','driving_school', 'kindergarten', 'language_school', 'library', 'music_school', 'school', 'university']}
POI_education = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_education[POI_education['element_type']=='node'])



308

In [79]:
# transportation POI
tags = {'amenity': ['bicycle_rental','boat_rental', 'boat_sharing', 'bus_station', 'car_rental', 'car_sharing', 'ferry_terminal', 'taxi'],
       'public_transport':['station'],
       'railway':['platform', 'station', 'tram_stop', 'subway_entrance']}
POI_transport = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_transport[POI_transport['element_type']=='node'])

  and should_run_async(code)


838

In [86]:
# healthcare POI
tags = {'amenity': ['clinic','dentist', 'doctors', 'hospital', 'nursing_home', 'pharmacy', 'social_facility', 'veterinary'],
       'emergency': ['emergency_ward_entrance']}
POI_healthcare = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_healthcare[POI_healthcare['element_type']=='node'])

  and should_run_async(code)


703

In [80]:
# finance POI
tags = {'amenity': ['atm','bank', 'bureau_de_change']}
POI_finance = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_finance[POI_finance['element_type']=='node'])

  and should_run_async(code)


363

In [92]:
# arts, culture and entertainment POI
tags = {'amenity': ['arts_centre','cinema', 'brothel', 'casino', 'community_centre', 'gambling', 'love_hotel', 'nightclub', 'planetarium', 'public_bookcase', 'social_centre', 'stripclub', 'studio', 'swingerclub', 'theatre']}
POI_art_culture_entertainment = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_art_culture_entertainment[POI_art_culture_entertainment['element_type']=='node'])



205

In [97]:
# tourism POI
tags = {'tourism': True}
POI_tourism = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_tourism[POI_tourism['element_type']=='node'])



712

In [95]:
# sport POI
tags = {'sport': True}
POI_sport = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_sport[POI_sport['element_type']=='node'])

  and should_run_async(code)


279

In [94]:
# shop POI
tags = {'shop': True}
POI_shop = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_shop[POI_shop['element_type']=='node'])

  and should_run_async(code)


5246

In [93]:
# office POI
tags = {'office': True}
POI_office = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_office[POI_office['element_type']=='node'])

  and should_run_async(code)


776

In [88]:
# history POI
tags = {'historic': True}
POI_history = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_history[POI_history['element_type']=='node'])

  and should_run_async(code)


2057

In [89]:
# leisure POI
tags = {'leisure': True}
POI_leisure = ox.pois.pois_from_place(place=place, tags=tags)
len(POI_leisure[POI_leisure['element_type']=='node'])

  and should_run_async(code)


727

In [99]:
# load POI type data sets

### Save data sets

In [None]:
# save supplemented data sets based on selected months
# split data by month to avoid file size > 100 MB due to GitHub rules
# set True if saving is desired
save_data = False
if (save_data):
    save_supplemented_data(months, data_car)
    save_supplemented_data(months, data_bicycle)
    save_supplemented_data(months, data_scooter)

In [None]:
# save supplemented data sets in one file per mode
# set True if saving is desired
save_data_full = False
if (save_data_full):
    save_supplemented_data_full(data_car)
    save_supplemented_data_full(data_bicycle)
    save_supplemented_data_full(data_scooter)

### Descriptive statistics

### Test

In [None]:
data_bicycle.isnull().values.any()

In [None]:
len(data_car[(data_car['weekday'] == 0) & (data_car['hour'] == 2)])

In [None]:
type(data_full["time_start"])

In [None]:
data_car['provider'].unique()

In [None]:
data_car['duration'].min()

In [None]:
data_car['coordinates_start'][0]