# Exploratory Data Analysis on Intermediate Data

In [75]:
#general libs
import json
import numpy as np
import pandas as pd
from time import sleep
from tqdm import tqdm

#geodata libs
import folium
import openrouteservice

Global variables (api keys and other stuff)

In [76]:
secrets = json.load(open('..\secrets.json'))

KEY_OPENROUTESERVICE = secrets['OPENROUTESERVICE']['API_KEY']


In [77]:
df = pd.read_csv('../data/intermediate/data_intermediate.csv', sep=';')

#correcting datatype
df['date'] = pd.to_datetime(df['date'])

df.head()

Unnamed: 0,date,year,latitude,longitude
0,2016-06-21 21:02:54,2016,-23.582274,-46.685666
1,2016-06-21 21:05:08,2016,-23.585315,-46.688474
2,2016-06-21 21:07:17,2016,-23.585046,-46.691041
3,2016-06-21 21:07:44,2016,-23.58458,-46.691775
4,2016-06-21 21:08:59,2016,-23.586483,-46.695554


In [78]:
df['date'].dt.year.value_counts()

2017    193002
2016    175833
2018     34742
2019     14534
2021      6526
2020      4089
2022      1772
Name: date, dtype: int64

### Hotwheels protocol

Limitting the volume of data during tests

In [79]:
df = df.head(10)

Defining default coordinates when plotting map

In [80]:
lat_default = df['latitude'].mean()
long_default = df['longitude'].mean()

print(lat_default)
print(long_default)

#folium.Map(location=[lat_default, long_default], zoom_start=2)

-23.57891611
-46.697245179999996


### Step 1: create the map with coordinates

In [81]:
#generate a new map
map = folium.Map(location=[lat_default, long_default], zoom_start=15, tiles='cartodbpositron')

#for each row in the data, add a cicle marker
for index, row in tqdm(df.iterrows(), total=df.shape[0]):

    #add starting location markers to the map
    folium.CircleMarker(location=[row['latitude'], row['longitude']],
                        color='red',
                        radius=5,
                        weight=1,
                        fill=True).add_to(map)

map

100%|██████████| 10/10 [00:00<00:00, 9995.96it/s]


### Step 2: connecting the dots

In [82]:
df.sort_values('date', ascending=True, inplace=True)

Match coordinates

In [83]:
#matching current coordinates with next coordinates
df_shifted_coordinates = df[['latitude','longitude']].shift(periods=-1, axis=0)
df_shifted_coordinates.set_axis(['latitude_next','longitude_next'], axis=1, inplace=True)

df = pd.concat([df, df_shifted_coordinates], axis=1)
df.head()

Unnamed: 0,date,year,latitude,longitude,latitude_next,longitude_next
0,2016-06-21 21:02:54,2016,-23.582274,-46.685666,-23.585315,-46.688474
1,2016-06-21 21:05:08,2016,-23.585315,-46.688474,-23.585046,-46.691041
2,2016-06-21 21:07:17,2016,-23.585046,-46.691041,-23.58458,-46.691775
3,2016-06-21 21:07:44,2016,-23.58458,-46.691775,-23.586483,-46.695554
4,2016-06-21 21:08:59,2016,-23.586483,-46.695554,-23.58412,-46.698981


Use OpenRouteService API

In [101]:
path_list = []
reserve = []

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    # I included try / except as a precaution in case any paths are extremely long, which we'll skip. 
    # I noticed this exception error when I accidentally generated a lat / long for no address. Be aware of this and remove prior to using this function. 
    try:
        # coordinates of the trips living within specific table columns.
        coords = ((row['longitude'],row['latitude']),(row['longitude_next'],row['latitude_next']))
        
        # Specify your personal API key
        client = openrouteservice.Client(key=KEY_OPENROUTESERVICE) 
        geometry = client.directions(coords)['routes'][0]['geometry']
        decoded = openrouteservice.convert.decode_polyline(geometry)
        
        # We need to reverse the long / lat output from results so that we can graph lat / long
        reverse = [(y, x) for x, y in decoded['coordinates']]        
        
        # Append each route to the path_list list
        path_list.append(reverse)
        
        # confirmation of each route being processed. Feel free to comment out.
        sleep(1)


    except:
        pass

100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


In [85]:
coords

((-46.7102337, -23.5587264), (nan, nan))

In [86]:
'''
i = 0
lat0 = df.loc[i, 'latitude']
lon0 = df.loc[i, 'longitude']
lat1 = df.loc[i, 'latitude_next']
lon1 = df.loc[i, 'longitude_next']

coords = ((lon0, lat0),(lon1, lat1))

# Specify your personal API key
client = openrouteservice.Client(key=KEY_OPENROUTESERVICE) 
geometry = client.directions(coords)['routes'][0]['geometry']
decoded = openrouteservice.convert.decode_polyline(geometry)

# We need to reverse the long / lat output from results so that we can graph lat / long
reverse = [(y, x) for x, y in decoded['coordinates']]   
'''

In [87]:
pd.DataFrame(reverse, columns=['latitude','longitude']).to_csv('teste.csv', index=False)

In [88]:
teste = pd.read_csv('teste.csv')
teste

Unnamed: 0,latitude,longitude
0,-23.58226,-46.68562
1,-23.58233,-46.68560
2,-23.58258,-46.68585
3,-23.58298,-46.68626
4,-23.58306,-46.68635
...,...,...
62,-23.58548,-46.68866
63,-23.58547,-46.68864
64,-23.58540,-46.68857
65,-23.58536,-46.68853


In [93]:
reverse

[(-23.57132, -46.70523),
 (-23.57103, -46.70543),
 (-23.57065, -46.70566),
 (-23.57052, -46.70572),
 (-23.57049, -46.70574),
 (-23.57035, -46.70583),
 (-23.5702, -46.70594),
 (-23.57014, -46.70598),
 (-23.56938, -46.70649),
 (-23.56947, -46.70664),
 (-23.57028, -46.70609),
 (-23.57037, -46.70603),
 (-23.57044, -46.70597),
 (-23.57049, -46.70606),
 (-23.57086, -46.70671),
 (-23.5709, -46.70679),
 (-23.57137, -46.70759),
 (-23.57139, -46.70764),
 (-23.57144, -46.70773),
 (-23.57149, -46.70778),
 (-23.57154, -46.70782),
 (-23.57158, -46.70767),
 (-23.57225, -46.70557),
 (-23.57228, -46.70546),
 (-23.57246, -46.70489),
 (-23.57247, -46.70484),
 (-23.57253, -46.70467),
 (-23.57257, -46.70449),
 (-23.57259, -46.70437),
 (-23.57264, -46.7042),
 (-23.57261, -46.70411),
 (-23.57257, -46.70395),
 (-23.5725, -46.70354),
 (-23.57246, -46.70328),
 (-23.57244, -46.70306),
 (-23.57242, -46.70282),
 (-23.57241, -46.70256),
 (-23.57241, -46.70225),
 (-23.57242, -46.70208),
 (-23.5724, -46.70185),
 (-23

In [90]:
line = folium.PolyLine(
    reverse,
    weight=1,
    color='green'
).add_to(map)


map

### Step 3: plot

In [104]:
path_list

[[(-23.58226, -46.68562),
  (-23.58233, -46.6856),
  (-23.58258, -46.68585),
  (-23.58298, -46.68626),
  (-23.58306, -46.68635),
  (-23.5831, -46.68639),
  (-23.58463, -46.68799),
  (-23.58542, -46.68878),
  (-23.58559, -46.68898),
  (-23.58587, -46.68927),
  (-23.58601, -46.68942),
  (-23.58638, -46.68983),
  (-23.58648, -46.68996),
  (-23.58655, -46.69008),
  (-23.58659, -46.69019),
  (-23.58661, -46.6903),
  (-23.58662, -46.69043),
  (-23.5867, -46.69262),
  (-23.5867, -46.69267),
  (-23.58661, -46.69284),
  (-23.58651, -46.69297),
  (-23.58631, -46.69309),
  (-23.58627, -46.69311),
  (-23.58587, -46.69321),
  (-23.58568, -46.6932),
  (-23.58559, -46.69317),
  (-23.58553, -46.69311),
  (-23.58545, -46.69301),
  (-23.58544, -46.69293),
  (-23.58545, -46.69284),
  (-23.58547, -46.69275),
  (-23.58553, -46.69265),
  (-23.58561, -46.69256),
  (-23.5857, -46.69249),
  (-23.58587, -46.69238),
  (-23.58611, -46.69233),
  (-23.58633, -46.6923),
  (-23.58651, -46.69229),
  (-23.58701, -46.69

In [105]:
for path in path_list:
    
    line = folium.PolyLine(
        path,
        weight=1,
        color='#0A8A9F'
    ).add_to(map)

map