In [20]:
import json
import ast
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile

In [21]:
data = pd.read_csv('./data/synthetic_data.csv').head(1000)

In [22]:
data = data[['TAXI_ID', 'POLYLINE']]

In [23]:
print('Total Path Trail Records used', len(data))

Total Path Trail Records used 1000


In [24]:
taxi_id = np.unique(data['TAXI_ID'])

In [25]:
print('Total People', len(taxi_id))

Total People 309


In [26]:
coordinates = {}
trips = np.array(data)
for i in range(len(trips)):
    point = trips[i]
    keys = list(coordinates.keys())
    if point[0] in keys:
        appended_coordinate = ast.literal_eval(point[1]) + coordinates[point[0]]
        coordinates[point[0]] = appended_coordinate
    else:
        coordinates[point[0]] = ast.literal_eval(point[1])

In [27]:
taxi_records = [[key, value, 0] for key, value in coordinates.items()]

In [28]:
coronavirus_cases_count = 5
coronavirus_cases = random.sample(range(1, len(taxi_records)), coronavirus_cases_count)
coronavirus_cases = [137, 22, 284, 178, 183] # comment the statement for random selection
random_records = [taxi_records[i][0] for i in coronavirus_cases]

In [29]:
random_records

[20000304, 20000009, 20000156, 20000455, 20000281]

In [30]:
from math import sin, cos, sqrt, atan2, radians
R = 6373.0

def calculate_distance(lat2, lon2, lat1, lon1):
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (sin(dlat/2))**2 + cos(lat1) * cos(lat2) * (sin(dlon/2))**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = R * c
    return(abs(distance))

In [31]:
threshold = 0.010

def check_if_contact(lat2, lon2, lat1, lon1):
    calculated_value = calculate_distance(lat1, lon1, lat2, lon2)
    if(calculated_value<=threshold):
        return True
    else:
        return False

In [32]:
lat1 = 52.2296756
lon1 = 21.0122287
lat2 = 52.2307756
lon2 = 21.0122287
check_if_contact(lat1, lon1, lat2, lon2)

False

In [33]:
distances = []
for i in taxi_records:
    person_travel_history = i[1]
    max_distance = 0
    for j in range(len(person_travel_history)-1):
        one = person_travel_history[j]
        two = person_travel_history[j+1]
        flag = calculate_distance(one[0], one[1], two[0], two[1])
        if(flag > max_distance):
            max_distance = flag
    distances.append(max_distance)
print('Max distance between any two coordinates', max(distances))

Max distance between any two coordinates 17.403047835066882


In [34]:
def calculate_total_encounters(history_one, history_two):
    count = 0
    for i in history_one:
        for j in history_two:
            if(check_if_contact(i[0], i[1], j[0], j[1])):
                count +=1 
    return count

In [35]:
chances = []
for i in taxi_records:
    chance = 0
    if i[0] not in random_records:
        person_travel_history = i[1]
        for coronavirus_case in coronavirus_cases:
            coronavirus_patient_travel_history = taxi_records[coronavirus_case][1]
            chance += calculate_total_encounters(person_travel_history, coronavirus_patient_travel_history)
    chances.append(chance)
    print(i[0], chance)

20000589 1
20000596 55
20000320 79
20000520 34
20000337 5
20000231 39
20000456 7
20000011 41
20000403 70
20000233 24
20000571 24
20000497 8
20000570 22
20000005 23
20000089 27
20000423 10
20000657 9
20000309 1
20000161 2
20000178 30
20000235 54
20000653 20
20000009 0
20000648 25
20000424 1
20000010 23
20000372 41
20000686 23
20000435 28
20000154 18
20000060 29
20000167 25
20000503 12
20000621 68
20000463 14
20000612 20
20000360 16
20000574 88
20000173 9
20000560 10
20000492 27
20000112 21
20000305 33
20000004 40
20000620 24
20000671 36
20000341 29
20000015 26
20000307 44
20000171 3
20000201 10
20000007 28
20000199 34
20000611 9
20000398 33
20000067 18
20000569 63
20000473 11
20000367 74
20000672 59
20000190 37
20000600 12
20000496 58
20000013 12
20000454 103
20000534 49
20000195 27
20000406 9
20000325 28
20000101 36
20000632 38
20000308 33
20000333 28
20000481 7
20000486 34
20000450 53
20000545 3
20000665 23
20000012 46
20000688 66
20000540 38
20000436 32
20000517 112
20000153 31
20000

In [36]:
total_chance_record = []
for i in range(len(taxi_records)):
    taxi_record = taxi_records[i]
    record = [
        taxi_record[0], # taxi_id
        taxi_record[1], # path or trail
        chances[i] # total chances
    ]
    total_chance_record.append(record)

In [37]:
pd.DataFrame(total_chance_record)

Unnamed: 0,0,1,2
0,20000589,"[[-8.649639, 41.1541515], [-8.6502105, 41.1539...",1
1,20000596,"[[-8.64405, 41.1588855], [-8.6440365, 41.15893...",55
2,20000320,"[[-8.606529, 41.1479325], [-8.606403, 41.14800...",79
3,20000520,"[[-8.642146499999999, 41.1650145], [-8.6421555...",34
4,20000337,"[[-8.6603985, 41.15871], [-8.659701, 41.159758...",5
...,...,...,...
304,20000535,"[[-8.604045, 41.1612705], [-8.604675, 41.16172...",0
305,20000054,"[[-8.606569499999999, 41.144742], [-8.6069385,...",11
306,20000108,"[[-8.653203, 41.168565], [-8.653239, 41.168898...",0
307,20000118,"[[-8.574754500000001, 41.142703499999996], [-8...",20


In [38]:
total_encounter = pd.DataFrame(total_chance_record)
total_encounter = total_encounter.sort_values(by=2, ascending=False)
total_encounter.to_csv('./data/synthetic_data_chances.csv', index = False)