# Introduction

A CSV file containing coordinates of traffic sections of Barcelona (available on Open Data BCN) is loaded.

In [1]:
# Import libraries
import pandas as pd
import geopandas
from shapely.geometry import Point, LineString

In [2]:
# Column labels translated to English (the source file's header is in Catalan)
columns_eng = ['SectionID', 'Description', 'Coordinates']

# Load the sections file: skip the original header row, apply the English
# labels instead, and use the first column (SectionID) as the index
transits = pd.read_csv(
    './data/transit_relacio_trams.csv',
    skiprows=1,
    names=columns_eng,
    index_col=0,
)

transits.head()

Unnamed: 0_level_0,Description,Coordinates
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),"2.11203535639414,41.3841912394771,2.1015028628..."
2,Diagonal (Doctor Marañón a Ronda de Dalt),"2.111944376806616,41.38446666680338,2.10159408..."
3,Diagonal (Doctor Marañón a Pl. Pius XII),"2.112093343037027,41.38422850920645,2.12264979..."
4,Diagonal (Pl. Pius XII a Doctor Marañón),"2.122592049318304,41.38719094189204,2.11196902..."
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"2.122657659295115,41.38694195794678,2.12755961..."


## Sanity check
1. missing values
2. duplicated values
3. data type assignments

In [3]:
# Are any rows missing a value in the first two comma-separated fields
# (the fields later used as longitude and latitude)?
(
    transits['Coordinates']
    .str.split(',', expand=True)
    .iloc[:, :2]
    .isna()
    .sum()
)

0    0
1    0
dtype: int64

In [4]:
# How many rows repeat a 'Coordinates' value already seen in an earlier row?
transits.duplicated(subset=['Coordinates']).sum()

0

In [5]:
# Are the data types correctly assigned?
# 'Coordinates' should hold numeric values, but it is loaded as text (dtype: object)
transits.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 1 to 534
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  492 non-null    object
 1   Coordinates  492 non-null    object
dtypes: object(2)
memory usage: 11.5+ KB


Since this was a 2-day sprint, I used only the first coordinate pair (longitude, latitude) as a representative point for each street section. However, for a better interpretation of the data, this part should be revisited in the near future:

To do: Convert each section's list of coordinates into a shapely `LineString` geometry so it can be stored in a GeoPandas `GeoDataFrame`?

In [6]:
# Select the first pair of coordinates as the representative geolocation for each section.
# The values are split on commas and/or whitespace because not every row is purely
# comma-separated: at least one row separates longitude and latitude with a space,
# so splitting on ',' alone would return a whole "lon lat" pair as the longitude.
# The two values are converted to float so they are exported as numbers, not strings.
first_pair = (
    transits['Coordinates']
    .str.strip()                        # avoid an empty leading field
    .str.split(r'[,\s]+', expand=True)  # multi-character pattern is treated as a regex
    .loc[:, :1]                         # columns 0 and 1: longitude, latitude
    .astype(float)
)

transits_to_export = transits[['Description']].assign(
    Longitude=first_pair[0],
    Latitude=first_pair[1],
)
transits_to_export.head()

Unnamed: 0_level_0,Description,Longitude,Latitude
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),2.11203535639414,41.3841912394771
2,Diagonal (Doctor Marañón a Ronda de Dalt),2.111944376806616,41.38446666680338
3,Diagonal (Doctor Marañón a Pl. Pius XII),2.112093343037027,41.38422850920645
4,Diagonal (Pl. Pius XII a Doctor Marañón),2.122592049318304,41.38719094189204
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),2.122657659295115,41.38694195794678


# END (6 Sep 2020)
--------------------------

# 

# On going (Nov 2020)

In [22]:
# Display the sections table as loaded (Description and raw Coordinates strings)
transits

Unnamed: 0_level_0,Description,Coordinates
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),"2.11203535639414,41.3841912394771,2.1015028628..."
2,Diagonal (Doctor Marañón a Ronda de Dalt),"2.111944376806616,41.38446666680338,2.10159408..."
3,Diagonal (Doctor Marañón a Pl. Pius XII),"2.112093343037027,41.38422850920645,2.12264979..."
4,Diagonal (Pl. Pius XII a Doctor Marañón),"2.122592049318304,41.38719094189204,2.11196902..."
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"2.122657659295115,41.38694195794678,2.12755961..."
...,...,...
530,Ronda Litoral (Passeig de la Zona Franca a Est...,"2.14689, 41.34946, 2.14681, 41.34943, 2.14622,..."
531,Ronda Litoral (Gran Via a Potosí),"2.206259340928256,41.44361502134825,2.20757431..."
532,Ronda Litoral (Potosí a Gran Via),"2.206062333803802,41.44356616311283,2.20747237..."
533,Ronda Litoral (Potosí a Nus de la Trinitat),"2.206243003622748,41.44362108033077,2.20220784..."


In [31]:
# Select the first pair of coordinates as the representative geolocation for each section.
# Split on commas and/or whitespace (some rows separate longitude and latitude with a
# space instead of a comma) and convert the two values to float.
first_pair = (
    transits['Coordinates']
    .str.strip()                        # avoid an empty leading field
    .str.split(r'[,\s]+', expand=True)  # multi-character pattern is treated as a regex
    .loc[:, :1]                         # columns 0 and 1: longitude, latitude
    .astype(float)
)

# Store the result under a new name instead of overwriting `transits`:
# later cells still read the raw `transits['Coordinates']` column.
transits_points = transits[['Description']].assign(
    Latitude=first_pair[1],
    Longitude=first_pair[0],
)
transits_points.head()

Unnamed: 0_level_0,Description,Latitude,Longitude
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),41.3841912394771,2.11203535639414
2,Diagonal (Doctor Marañón a Ronda de Dalt),41.38446666680338,2.111944376806616
3,Diagonal (Doctor Marañón a Pl. Pius XII),41.38422850920645,2.112093343037027
4,Diagonal (Pl. Pius XII a Doctor Marañón),41.38719094189204,2.122592049318304
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),41.38694195794678,2.122657659295115


In [7]:
# Export the resulting dataframe as a CSV file
# to be imported in the second notebook for analysis & forecast.
# `transits_to_export` (Description, Longitude, Latitude) is the frame built for this
# purpose; the previously used name `transits_final` is not defined by any preceding
# cell, so the export raised a NameError on a fresh kernel.
transits_to_export.to_csv("./data/transit_trams.csv")

In [8]:
# Break every 'Coordinates' string into its comma-separated pieces,
# giving one list of string tokens per section
elements = transits['Coordinates'].map(lambda raw: raw.split(",")).tolist()

len(elements)

492

In [9]:
# Check whether there is corrupted data that is not separated by ','
# by examining the size of each coordinate list (is it an even number?)
def get_index_not_separated(list):
    """Report every entry of `list` that holds an odd number of items.

    For each such entry, prints its position followed by the entry itself.
    Entries with an even number of items are skipped. Returns None.
    """
    for position, tokens in enumerate(list):
        if len(tokens) % 2 == 0:
            # even item count: nothing to report
            continue
        print(f'Check index {position} again')
        print(tokens)

# Print the position and content of every entry in `elements` with an odd number of items
get_index_not_separated(elements)

Check index 241 again
['2.146454467868266 41.393027770024275', '2.1516317023376 41.389144874554006', '2.1539248285035 41.38737437733698', '2.1580025178065 41.38423619248701', '2.1635456700879 41.380259228103']


In [20]:
# Work on a copy so that `transits` itself is left unchanged
transits_clean = transits.copy()

# A stray `transits_clean.apply(lambda x: elements)` was removed here:
# its result was discarded, so it had no effect on `transits_clean`.
transits_clean.dtypes

Description    object
Coordinates    object
dtype: object

In [49]:
# Change the list items to numeric (float) values
# and replace the 'Coordinates' column.
# Each item is split on whitespace before conversion, so an item that still holds
# two space-separated values (e.g. '2.14 41.39') becomes two floats instead of
# raising a ValueError. This keeps the cell runnable in notebook order, where the
# manual repair of that entry only happens in a later cell.
transits_clean['Coordinates'] = [
    [float(value) for item in element for value in item.split()]
    for element in elements
]
transits_clean.head()

Unnamed: 0_level_0,Description,Coordinates
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),"[2.11203535639414, 41.3841912394771, 2.1015028..."
2,Diagonal (Doctor Marañón a Ronda de Dalt),"[2.111944376806616, 41.38446666680338, 2.10159..."
3,Diagonal (Doctor Marañón a Pl. Pius XII),"[2.112093343037027, 41.38422850920645, 2.12264..."
4,Diagonal (Pl. Pius XII a Doctor Marañón),"[2.122592049318304, 41.38719094189204, 2.11196..."
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"[2.122657659295115, 41.38694195794678, 2.12755..."


In [10]:
# Let's fix this 241st element using split method
element_241_split = [e.split() for e in elements[241]]

element_241 = []
for el in element_241_split:
     element_241 += el

element_241

['2.146454467868266',
 '41.393027770024275',
 '2.1516317023376',
 '41.389144874554006',
 '2.1539248285035',
 '41.38737437733698',
 '2.1580025178065',
 '41.38423619248701',
 '2.1635456700879',
 '41.380259228103']

In [11]:
# Replace the malformed entry in `elements` with its repaired, flattened version
elements[241] = element_241

In [None]:
# Points to string
