Bachelor thesis - Richard Borschke - 7337876 - University of Cologne - A spatio-temporal analysis of usage patterns in free-floating shared mobility


# Data Supplementation

This notebook contains the supplementation of data, needed for analysis. Following steps are performed:
* Data loading of prepared trip files
* Assignment of time bucket of trip
* Calculating distance to city center
* POI type supplementation (WHICH TYPES??)


time (18.11 to 29.2  +  05.11.) --> 5.11. excluded

data set misses 2 days (16. and 15. 01.)

car misses 4 days (06. to 09.12.)

### Imports

In [9]:
import pandas as pd
import glob
from haversine import haversine 

import numpy as np
import seaborn as sns

### Methods

In [17]:
# load monthly data of mode based on selected months
def load_data(mode, months):
    first_read = True
    for month in months:
        data_month = pd.concat([pd.read_csv(file) for file in glob.glob('Data/Modes/data_prepared_{}_{}.csv'.format(mode, month))], ignore_index = True)
        if (first_read):
            data_full = data_month
            first_read = False
        else:
            data_full = pd.concat([data_full, data_month], ignore_index=True)
    return data_full

# load full data of mode
def load_data_mode_full(mode):
    data_mode_full = pd.read_csv('Data/Modes/data_prepared_full_{}.csv'.format(mode))
    return data_mode_full

# calculate haversine distance to city center in m
def calculate_distance_to_city_center(data_full):
    data_full_d = data_full.copy()
    data_full_d['dist_center_start'] = data_full_d.apply(lambda r: int(haversine((r['latitude_start'], r['longitude_start']), (50.941724380890186, 6.958446824087053))*1000), axis=1) #convert Km to meter
    data_full_d['dist_center_end'] = data_full_d.apply(lambda r: int(haversine((r['latitude_end'], r['longitude_end']), (50.941724380890186, 6.958446824087053))*1000), axis=1) #convert Km to meter
    return data_full_d   

# supplement data with POI types
def supplement_data(data_mode):
    
    return data_mode

# save data by mode and months
def save_supplemented_data(months, data_supplemented):
    for month in months:
        # extract month, year and mode
        only_month = int(month[4:])
        only_year = int(month[:4])
        mode_name_file = data_supplemented['vehicleType'].iloc[0]
        # create dataframe and save it as csv file
        split_cond = [(data_supplemented['month'] == only_month) & (data_supplemented['year'] == only_year)]
        data_supplemented[split_cond[0]].to_csv('Data/Modes/data_supplemented_{}_{}.csv'.format(mode_name_file, month), index = False)

# save data by mode only
def save_supplemented_data_full(data_supplemented):
    mode_name_file = data_supplemented['vehicleType'].iloc[0]
    # create dataframe and save it as csv file
    data_supplemented.to_csv('Data/Modes/data_supplemented_full_{}.csv'.format(mode_name_file), index = False)

### Load Data

In [18]:
# define months for data loading (YYYYMM)
months = ['201911', '201912', '202001', '202002']
#load data by months
load_monthly_data = False
if (load_monthly_data):
    data_car = load_data('car', months)
    data_bicycle = load_data('bicycle', months)
    data_scooter = load_data('scooter', months)
    data_full = pd.concat([data_car, data_bicycle, data_scooter])

In [19]:
# read in desired data
# set True if full data set load is desired
load_full_data = True
if (load_full_data):
    data_car = load_data_mode_full('car')
    data_bicycle = load_data_mode_full('bicycle')
    data_scooter = load_data_mode_full('scooter')
    data_full = pd.concat([data_car, data_bicycle, data_scooter])

### Time Bucket Assignment

### Calculating Distance to City Center

In [20]:
# calculate distance to city center (Dom) for both origin and destination
data_full_dist = calculate_distance_to_city_center(data_full)

In [25]:
data_full_dist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 410295 entries, 0 to 134387
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 410295 non-null  object 
 1   provider           410295 non-null  object 
 2   vehicleType        410295 non-null  object 
 3   date_start         410295 non-null  object 
 4   time_start         410295 non-null  int64  
 5   date_end           410295 non-null  object 
 6   time_end           410295 non-null  int64  
 7   year               410295 non-null  int64  
 8   month              410295 non-null  int64  
 9   weekday            410295 non-null  int64  
 10  hour               410295 non-null  int64  
 11  weekend            410295 non-null  int64  
 12  longitude_start    410295 non-null  float64
 13  latitude_start     410295 non-null  float64
 14  longitude_end      410295 non-null  float64
 15  latitude_end       410295 non-null  float64
 16  co

### POI Type Supplementation

In [None]:
# load POI type data sets

### Save data sets

In [7]:
# save supplemented data sets based on selected months
# split data by month to avoid file size > 100 MB due to GitHub rules
# set True if saving is desired
save_data = False
if (save_data):
    save_supplemented_data(months, data_car)
    save_supplemented_data(months, data_bicycle)
    save_supplemented_data(months, data_scooter)

In [8]:
# save supplemented data sets in one file per mode
# set True if saving is desired
save_data_full = False
if (save_data_full):
    save_supplemented_data_full(data_car)
    save_supplemented_data_full(data_bicycle)
    save_supplemented_data_full(data_scooter)

### Descriptive statistics

### Test

In [17]:
data_bicycle.isnull().values.any()

False

In [18]:
len(data_car[(data_car['weekday'] == 0) & (data_car['hour'] == 2)])

229

In [19]:
type(data_full["time_start"])

pandas.core.series.Series

In [20]:
data_car['provider'].unique()

array(['car2go'], dtype=object)

In [24]:
data_car['duration'].min()

234.0

In [25]:
data_car['coordinates_start'][0]

(50.9479, 6.887)