In [1]:
import json
import ast
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile

In [2]:
# Load the trip table and keep only the two columns needed to build the
# synthetic paths: the taxi identifier and the serialized polyline.
data = pd.read_csv('./data/test.csv').loc[:, ['TAXI_ID', 'POLYLINE']]

In [3]:
# Each row of `trips` is (TAXI_ID, POLYLINE); the polyline is stored in the CSV
# as the text of a Python list literal, so parse it back into nested lists.
trips = np.array(data)
coordinates = [
    [taxi_id, ast.literal_eval(raw_polyline)]
    for taxi_id, raw_polyline in trips
]

In [4]:
def intermediates(p1, p2, nb_points=8):
    """Return `nb_points` evenly spaced points strictly between p1 and p2.

    The segment p1 -> p2 is cut into `nb_points + 1` equal steps; the
    endpoints themselves are not included in the result.

    Args:
        p1: start point, indexable as p1[0], p1[1].
        p2: end point, indexable as p2[0], p2[1].
        nb_points: number of interior points to generate.

    Returns:
        A list of `nb_points` two-element lists, ordered from p1 towards p2.
    """
    segments = nb_points + 1
    step_x = (p2[0] - p1[0]) / segments
    step_y = (p2[1] - p1[1]) / segments

    points = []
    for k in range(1, segments):
        points.append([p1[0] + k * step_x, p1[1] + k * step_y])
    return points

In [5]:
# Sanity check: five evenly spaced interior points between (1, 2) and (10, 6.5).
demo_points = intermediates([1, 2], [10, 6.5], nb_points=5)
print(demo_points)

[[2.5, 2.75], [4.0, 3.5], [5.5, 4.25], [7.0, 5.0], [8.5, 5.75]]


In [6]:
from math import sin, cos, sqrt, atan2, radians

# Approximate Earth radius in kilometres; distances below are in this unit.
R = 6373.0

def calculate_distance(lat2, lon2, lat1, lon1):
    """Return the great-circle (haversine) distance between two points.

    Args:
        lat2, lon2: latitude and longitude of one point, in degrees.
        lat1, lon1: latitude and longitude of the other point, in degrees.

    Returns:
        The non-negative distance between the points, in the unit of `R`
        (kilometres).
    """
    phi1 = radians(lat1)
    phi2 = radians(lat2)
    dlat = phi2 - phi1
    dlon = radians(lon2) - radians(lon1)

    a = sin(dlat / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [7]:
# Target spacing between generated points, in the unit returned by
# calculate_distance (kilometres): 0.1 km = 100 metres.
# NOTE(review): the original comment said "10 meters"; that would require
# 0.01 here. The value is left unchanged; confirm the intended spacing.
precision = 0.1

def generate_points(coordinate_one, coordinate_two):
    """Generate interior points between two consecutive polyline points.

    The number of points is the distance between the two coordinates divided
    by `precision` (floored). Segments shorter than `precision` still get a
    single midpoint.

    Args:
        coordinate_one: first point as [longitude, latitude] in degrees.
        coordinate_two: second point as [longitude, latitude] in degrees.

    Returns:
        A list of [longitude, latitude] points strictly between the inputs.
    """
    # Polyline points are stored longitude-first, while calculate_distance
    # takes latitude first, so the indices are swapped when calling it.
    distance = calculate_distance(coordinate_one[1], coordinate_one[0],
                                  coordinate_two[1], coordinate_two[0])
    if distance >= precision:
        total_generated_points = int(abs(distance // precision))
    else:
        # Too short to subdivide at `precision`: emit just the midpoint.
        total_generated_points = 1
    return intermediates(coordinate_one, coordinate_two, nb_points=total_generated_points)

In [8]:
# Sanity check on two nearby points; the recorded output is a single midpoint.
generate_points(coordinate_one=[1.002, 2.010], coordinate_two=[1.001, 2.009])

[[1.0015, 2.0095]]

In [9]:
# Build one synthetic path per trip by concatenating the interior points
# generated for every pair of consecutive points of the original path.
# The original points themselves are not copied into the synthetic path.
synthetic_data = []
for taxi_id, original_path in coordinates:
    generated_path = []
    for start, end in zip(original_path, original_path[1:]):
        generated_path.extend(generate_points(start, end))
    synthetic_data.append([taxi_id, generated_path])

In [10]:
# b: the full, unfiltered trip table as stored on disk.
b = pd.read_csv(filepath_or_buffer='./data/test.csv')
# a: the synthetic paths, one row per trip, in the same row order as `coordinates`.
a = pd.DataFrame(data=synthetic_data, columns=['TAXI_ID', 'POLYLINE'])

In [13]:
# Put the original table and the synthetic table side by side. With
# ignore_index=True on axis=1 the columns are relabelled 0..N-1, so the drop
# below is positional.
combined = pd.concat([b, a], axis=1, ignore_index=True, sort=False)

# Columns 9 and 10 are removed; presumably these are the original POLYLINE and
# the duplicated TAXI_ID coming from `a` (verify against the CSV layout).
combined = combined.drop(columns=[9, 10])

output_columns = [
    '', 'TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND',
    'TAXI_ID', 'TIMESTAMP', 'DAY_TYPE', 'MISSING DATA', 'POLYLINE',
]
c = pd.DataFrame(combined.values, columns=output_columns)
c.to_csv('./data/synthetic_data.csv', index=False)

In [14]:
# Preview the first five rows of the table that was written to disk.
c.head(n=5)

Unnamed: 0,Unnamed: 1,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING DATA,POLYLINE
0,0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618571, 41.141394000000005], [-8.619108, ..."
1,1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.640099, 41.1598485], [-8.640966, 41.15995..."
2,2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.613171000000001, 41.1403545], [-8.6137965..."
3,3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.5746915, 41.151946499999994], [-8.5747004..."
4,4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.6459715, 41.1805035], [-8.645998500000001..."
