In [88]:
# Import libraries
import pandas as pd
import geopandas
from shapely.geometry import Point, LineString

In [119]:
# Load the tram-section table from the local data folder.
transit_trams = pd.read_csv(filepath_or_buffer='./data/transit_relacio_trams.csv')

In [120]:
# Preview the first five rows of the loaded table.
transit_trams.head(n=5)

Unnamed: 0,Tram,Descripció,Coordenades
0,1,Diagonal (Ronda de Dalt a Doctor Marañón),"2.11203535639414,41.3841912394771,2.1015028628..."
1,2,Diagonal (Doctor Marañón a Ronda de Dalt),"2.111944376806616,41.38446666680338,2.10159408..."
2,3,Diagonal (Doctor Marañón a Pl. Pius XII),"2.112093343037027,41.38422850920645,2.12264979..."
3,4,Diagonal (Pl. Pius XII a Doctor Marañón),"2.122592049318304,41.38719094189204,2.11196902..."
4,5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),"2.122657659295115,41.38694195794678,2.12755961..."


In [91]:
# Does every row hold at least two comma-separated coordinate values?
# Splitting only once (n=1) is enough for this check: column 0 is the first
# value and column 1 is null exactly when the row contains no comma at all,
# so there is no need to expand every value into its own column.
# NOTE: this cannot detect rows whose values use a separator other than ','.
transit_trams['Coordenades'].str.split(',', n=1, expand=True).iloc[:, :2].isnull().sum()

0    0
1    0
dtype: int64

In [121]:
# Summarise column dtypes and non-null counts of the loaded table.
transit_trams.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 492 entries, 0 to 491
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Tram         492 non-null    int64 
 1   Descripció   492 non-null    object
 2   Coordenades  492 non-null    object
dtypes: int64(1), object(2)
memory usage: 11.7+ KB


In [139]:
# Start a GeoDataFrame that holds only the raw 'Coordenades' strings.
# Nothing has been cleaned at this point; parsing and repairs happen in later cells.
geo_df = geopandas.GeoDataFrame(data=transit_trams['Coordenades'])

geo_df.head(n=5)

Unnamed: 0,Coordenades
0,"2.11203535639414,41.3841912394771,2.1015028628..."
1,"2.111944376806616,41.38446666680338,2.10159408..."
2,"2.112093343037027,41.38422850920645,2.12264979..."
3,"2.122592049318304,41.38719094189204,2.11196902..."
4,"2.122657659295115,41.38694195794678,2.12755961..."


In [140]:
# Break every raw coordinate string into its comma-separated pieces
elements = [raw_row.split(",") for raw_row in geo_df['Coordenades']]

# Sanity check for corrupted rows whose values were not separated by ','.
# Coordinates come in pairs, so every split row should contain an even
# number of items; an odd count marks a row that needs a closer look.
def get_index_not_separated(list):
    """Print the position and content of every row with an odd item count.

    Parameters
    ----------
    list : sequence of sequences
        The split coordinate rows to inspect (the name shadows the built-in
        only inside this function).

    Returns
    -------
    None
        Findings are reported with print(); nothing is returned.
    """
    odd_rows = ((pos, row) for pos, row in enumerate(list) if len(row) % 2)
    for pos, row in odd_rows:
        print(f'Check index {pos} again')    # position of the suspicious row
        print(row)

# Report every split row whose item count is odd (prints nothing when all rows are even)
get_index_not_separated(elements)


Check index 241 again
[&#39;2.146454467868266 41.393027770024275&#39;, &#39;2.1516317023376 41.389144874554006&#39;, &#39;2.1539248285035 41.38737437733698&#39;, &#39;2.1580025178065 41.38423619248701&#39;, &#39;2.1635456700879 41.380259228103&#39;]


In [154]:
# Repair row 241: inside each comma-separated chunk the two values are divided
# by whitespace, so split every chunk again and flatten the pieces into one list.

element_241_split = [chunk.split() for chunk in elements[241]]

element_241 = [value for pieces in element_241_split for value in pieces]

element_241

[&#39;2.146454467868266&#39;,
 &#39;41.393027770024275&#39;,
 &#39;2.1516317023376&#39;,
 &#39;41.389144874554006&#39;,
 &#39;2.1539248285035&#39;,
 &#39;41.38737437733698&#39;,
 &#39;2.1580025178065&#39;,
 &#39;41.38423619248701&#39;,
 &#39;2.1635456700879&#39;,
 &#39;41.380259228103&#39;]

In [158]:
# Put the repaired coordinate list back in place of the malformed row
elements[241] = element_241

In [169]:
# Convert every coordinate string to a float and store the numeric lists in geo_df.
# The iteration variable is deliberately not called `list`: rebinding that name at
# notebook level would break every later call to the built-in list() in this kernel.
# The whole column is replaced in a single assignment instead of writing a Python
# list into one cell at a time through .iloc.
coordinates_as_float = [[float(item) for item in row] for row in elements]
geo_df['Coordenades'] = pd.Series(coordinates_as_float, index=geo_df.index, dtype=object)

geo_df.head()

Unnamed: 0,Coordenades
0,"[2.11203535639414, 41.3841912394771, 2.1015028..."
1,"[2.111944376806616, 41.38446666680338, 2.10159..."
2,"[2.112093343037027, 41.38422850920645, 2.12264..."
3,"[2.122592049318304, 41.38719094189204, 2.11196..."
4,"[2.122657659295115, 41.38694195794678, 2.12755..."


In [None]:
# Pair the flat "x, y, x, y, ..." values of every tram into (x, y) tuples.
# The values are read from `elements`, so re-running this cell gives the same
# result, and no name is used that an earlier cell has not defined.
coordinate_pairs = []
for e in elements:
    if len(e) % 2 == 0:
        # The coordinates are all in pairs
        coordinate_pairs.append(
            [(float(e[n]), float(e[n + 1])) for n in range(0, len(e), 2)]
        )
    else:
        # Unpaired values: keep an empty slot so rows stay aligned with transit_trams
        coordinate_pairs.append([])

# One row per tram, built from the tram's first coordinate pair.
# NOTE(review): later cells select 'Latitude' and 'Longitude' from `coordinates`,
# and the values they display match the first pair of each row (first value as
# Longitude, second as Latitude) - confirm this is the intended content.
missing_pair = (float('nan'), float('nan'))
coordinates = pd.DataFrame(
    [pairs[0] if pairs else missing_pair for pairs in coordinate_pairs],
    columns=['Longitude', 'Latitude'],
    index=transit_trams.index,
)

coordinates.head()

In [84]:
# Inspect the coordinate column currently stored in geo_df
geo_df['Coordenades']

0      [(2.11203535639414, 41.3841912394771), (2.1015...
1      [(2.111944376806616, 41.38446666680338), (2.10...
2      [(2.112093343037027, 41.38422850920645), (2.12...
3      [(2.122592049318304, 41.38719094189204), (2.11...
4      [(2.122657659295115, 41.38694195794678), (2.12...
                             ...                        
487    [(2.14689,  41.34946), ( 2.14681,  41.34943), ...
488    [(2.206259340928256, 41.44361502134825), (2.20...
489    [(2.206062333803802, 41.44356616311283), (2.20...
490    [(2.206243003622748, 41.44362108033077), (2.20...
491    [(2.206062333803802, 41.44356616311283), (2.20...
Name: Coordenades, Length: 492, dtype: object

In [46]:
# Show both shapes (one per line) to compare row counts before combining the frames
print(transit_trams.shape, coordinates.shape, sep='\n')

(492, 3)
(492, 2)


In [47]:
# Place the coordinate columns next to the tram table, then keep only the
# identifier, the description and the latitude/longitude columns.
transit_trams_clean = pd.concat(
    objs=[transit_trams, coordinates],
    axis='columns',
).loc[:, ['Tram', 'Descripció', 'Latitude', 'Longitude']]
transit_trams_clean.head(n=5)

Unnamed: 0,Tram,Descripció,Latitude,Longitude
0,1,Diagonal (Ronda de Dalt a Doctor Marañón),41.3841912394771,2.11203535639414
1,2,Diagonal (Doctor Marañón a Ronda de Dalt),41.38446666680338,2.111944376806616
2,3,Diagonal (Doctor Marañón a Pl. Pius XII),41.38422850920645,2.112093343037027
3,4,Diagonal (Pl. Pius XII a Doctor Marañón),41.38719094189204,2.122592049318304
4,5,Diagonal (Pl. Pius XII a Pl. Maria Cristina),41.38694195794678,2.122657659295115


In [48]:
# Persist the cleaned table. NOTE: the DataFrame index is written as the first column.
transit_trams_clean.to_csv(path_or_buf="./data/transit_trams.csv")