In [1]:
# Import libraries
import pandas as pd
import geopandas
from shapely.geometry import Point, LineString

In [40]:
# The file's own headers are in Catalan; these English names replace them.
columns_eng = ['SectionID', 'Description', 'Coordinates']

# skiprows=1 drops the original header line so that `names` takes its place,
# and index_col=0 turns the first column (SectionID) into the index.
transits = pd.read_csv(
    './data/transit_relacio_trams.csv',
    names=columns_eng,
    skiprows=1,
    index_col=0,
)

transits.head()

Unnamed: 0_level_0,Description,Coordinates
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),"2.11203535639414,41.3841912394771,2.1015028628..."
2,Diagonal (Doctor Marañón a Ronda de Dalt),"2.111944376806616,41.38446666680338,2.10159408..."
3,Diagonal (Doctor Marañón a Pl. Pius XII),"2.112093343037027,41.38422850920645,2.12264979..."
4,Diagonal (Pl. Pius XII a Doctor Marañón),"2.122592049318304,41.38719094189204,2.11196902..."
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"2.122657659295115,41.38694195794678,2.12755961..."


In [42]:
# Are there rows missing either of their first two coordinate values?
# After the split, columns 0 and 1 hold the first two comma-separated pieces.
(
    transits['Coordinates']
    .str.split(',', expand=True)
    .loc[:, :1]
    .isna()
    .sum()
)

0    0
1    0
dtype: int64

In [45]:
# Count the rows whose coordinate string repeats that of an earlier row.
(
    transits['Coordinates']
    .duplicated(keep='first')
    .sum()
)

0

In [47]:
# Are the dtypes correctly assigned?
# 'Coordinates' should hold numeric values rather than strings.
transits.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 492 entries, 1 to 534
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  492 non-null    object
 1   Coordinates  492 non-null    object
dtypes: object(2)
memory usage: 11.5+ KB


In [48]:
# Break every row's coordinate string into its comma-separated pieces.
elements = []
for coordinate_text in transits.Coordinates:
    elements.append(coordinate_text.split(","))

# Look for corrupted rows whose values were not separated by ','.
# A well-formed row splits into an even number of values, so an odd
# count marks a row that needs another look.
def get_index_not_separated(list):
    """Print the position and contents of every entry whose length is odd.

    Args:
        list: an indexable sequence of sized items (here, the per-row lists
            of coordinate strings). The parameter keeps its original name,
            which hides the built-in ``list`` inside this function only.

    Returns:
        None. Findings are reported with ``print``.
    """
    for position, tokens in enumerate(list):
        if len(tokens) % 2 == 0:
            # Even count: nothing to report for this entry.
            continue
        print(f'Check index {position} again')    # print the index number
        print(list[position])

# Run the check on the split rows; every odd-length row is printed.
get_index_not_separated(elements)


Check index 241 again
[&#39;2.146454467868266 41.393027770024275&#39;, &#39;2.1516317023376 41.389144874554006&#39;, &#39;2.1539248285035 41.38737437733698&#39;, &#39;2.1580025178065 41.38423619248701&#39;, &#39;2.1635456700879 41.380259228103&#39;]


In [49]:
# Repair the entry at index 241: each of its items holds two values joined
# by whitespace, so split every item on whitespace first.
element_241_split = []
for pair_text in elements[241]:
    element_241_split.append(pair_text.split())

# Flatten the per-item lists into a single flat list of values.
element_241 = [value for pair in element_241_split for value in pair]

element_241

[&#39;2.146454467868266&#39;,
 &#39;41.393027770024275&#39;,
 &#39;2.1516317023376&#39;,
 &#39;41.389144874554006&#39;,
 &#39;2.1539248285035&#39;,
 &#39;41.38737437733698&#39;,
 &#39;2.1580025178065&#39;,
 &#39;41.38423619248701&#39;,
 &#39;2.1635456700879&#39;,
 &#39;41.380259228103&#39;]

In [50]:
# Store the repaired, flattened values back at index 241 of `elements`.
elements[241] = element_241

In [54]:
# Convert every coordinate token from string to float, one list per section.
# The loop variables are deliberately not called `list`: at notebook (module)
# scope that name stays bound after the loop and would hide the built-in
# `list` type in every later cell.
coordinates_as_floats = [
    [float(token) for token in section_tokens] for section_tokens in elements
]

# Replace the whole 'Coordinates' column in one assignment instead of writing
# a list into one cell at a time. Pairing the values with `transits.index`
# keeps entry i of `elements` on row i of the frame.
transits['Coordinates'] = pd.Series(
    coordinates_as_floats, index=transits.index, dtype=object
)

transits.head()

Unnamed: 0_level_0,Description,Coordinates
SectionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Diagonal (Ronda de Dalt a Doctor Marañón),"[2.11203535639414, 41.3841912394771, 2.1015028..."
2,Diagonal (Doctor Marañón a Ronda de Dalt),"[2.111944376806616, 41.38446666680338, 2.10159..."
3,Diagonal (Doctor Marañón a Pl. Pius XII),"[2.112093343037027, 41.38422850920645, 2.12264..."
4,Diagonal (Pl. Pius XII a Doctor Marañón),"[2.122592049318304, 41.38719094189204, 2.11196..."
5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"[2.122657659295115, 41.38694195794678, 2.12755..."


The 'Coordinates' column is now in numeric form.

How can we convert each list of coordinates into a `LineString` geometry that GeoPandas can use?

In [None]:
# Pseudocode

In [55]:
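# TODO: draft only. This sketch is not runnable as written: `geo_df` is never
# created, and `n` is used outside the comprehension that defines it.
# for i, e in enumerate(elements):
#     # Check if the coordinates are all in pairs
#     if len(e)%2 == 0:
#         # Pair consecutive values into (x, y) tuples, build Point objects from them, and convert to a GeoDataFrame
#         geo_df.iloc[i] = [(e[n], e[n+1]) for n in range(0,len(e),2)]
#         geometry = [Point(xy) for xy in zip(e[n], e[n+1])]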


In [47]:
# Rebuild the export table from `transits`. The names `transit_trams` and
# `coordinates` that this cell used to concatenate are never defined in this
# notebook, so the cell raised NameError on a fresh kernel.
#
# Assumes 'Coordinates' already holds lists of floats (the conversion cell has
# run). Judging by the displayed rows, each list is ordered longitude,
# latitude, ...; only the first point of every section is kept.
coordinates = pd.DataFrame(
    [(values[1], values[0]) for values in transits['Coordinates']],
    columns=['Latitude', 'Longitude'],
)

# reset_index() moves SectionID back into a regular column and gives both
# frames the same 0..n-1 index, so the column-wise concat lines up row by row.
# The original Catalan column names are restored for the exported file.
transit_trams = transits.reset_index().rename(
    columns={'SectionID': 'Tram', 'Description': 'Descripció'}
)

transit_trams_clean = pd.concat([transit_trams, coordinates], axis=1)
transit_trams_clean = transit_trams_clean[['Tram', 'Descripció', 'Latitude', 'Longitude']]
transit_trams_clean.head()

Unnamed: 0,Tram,Descripció,Latitude,Longitude
0,1,Diagonal (Ronda de Dalt a Doctor Marañón),41.3841912394771,2.11203535639414
1,2,Diagonal (Doctor Marañón a Ronda de Dalt),41.38446666680338,2.111944376806616
2,3,Diagonal (Doctor Marañón a Pl. Pius XII),41.38422850920645,2.112093343037027
3,4,Diagonal (Pl. Pius XII a Doctor Marañón),41.38719094189204,2.122592049318304
4,5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),41.38694195794678,2.122657659295115


In [48]:
# Persist the cleaned table. `index` is left at its default, so the row index
# is written as the first (unnamed) CSV column.
transit_trams_clean.to_csv("./data/transit_trams.csv")