## How To Use This Notebook

1. Copy osm_pbf_data_extractor.py into the first cell

# Main Script

In [1]:
# This script does the following
# 1. Downloads OSM files for specified countries from Geofabrik
# 2. Filters files for substations and lines
# 3. Process and clean data
# 4. Exports to CSV
# 5. Exports to GeoJson

"""
OSM extraction scrpt
"""

import os
import sys

# IMPORTANT: RUN SCRIPT FROM THIS SCRIPTS DIRECTORY i.e data_exploration/ TODO: make more robust
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("../../scripts")

import logging
import shutil

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from esy.osmfilter import run_filter
from esy.osmfilter import Node, Relation, Way
from esy.osmfilter import osm_info as osm_info
from esy.osmfilter import osm_pickle as osm_pickle
from iso_country_codes import AFRICA_CC
from shapely.geometry import LineString, Point

logger = logging.getLogger(__name__)

# https://gitlab.com/dlr-ve-esy/esy-osmfilter/-/tree/master/


# import logging
# logging.basicConfig()
# logger=logging.getLogger(__name__)
# logger.setLevel(logging.INFO)
# logger.setLevel(logging.WARNING)

# Downloads PBF File for given Country Code


def download_pbf(country_code, update):
    """
    Downloads the pbf file from geofabrik for a given country code (see scripts/iso_country_codes.py).

    Parameters
    ----------
    country_code : str
    update : bool
        name of the network component
        update = true forces re-download of files
    """
    country_name = AFRICA_CC[country_code]
    # Filename for geofabrik
    geofabrik_filename = f"{country_name}-latest.osm.pbf"
    # https://download.geofabrik.de/africa/nigeria-latest.osm.pbf
    geofabrik_url = f"https://download.geofabrik.de/africa/{geofabrik_filename}"
    PBF_inputfile = os.path.join(
        os.getcwd(), "data", "osm", "pbf", geofabrik_filename
    )  # Input filepath

    if not os.path.exists(PBF_inputfile) or update is True:
        print(f"{geofabrik_filename} does not exist, downloading to {PBF_inputfile}")
        #  create data/osm directory
        os.makedirs(os.path.dirname(PBF_inputfile), exist_ok=True)
        with requests.get(geofabrik_url, stream=True) as r:
            with open(PBF_inputfile, "wb") as f:
                shutil.copyfileobj(r.raw, f)

    return PBF_inputfile


def download_and_filter(country_code, update=False):
    PBF_inputfile = download_pbf(country_code, update)

    filter_file_exists = False
    # json file for the Data dictionary
    JSON_outputfile = os.path.join(
        os.getcwd(), "data", "osm", country_code + "_power.json"
    )  # json file for the Elements dictionary is automatically written to "data/osm/Elements"+filename)

    if os.path.exists(JSON_outputfile):
        filter_file_exists = True

    # Load Previously Pre-Filtered Files
    if update is False and filter_file_exists is True:
        create_elements = False  # Do not create elements again
        new_prefilter_data = False  # Do not pre-filter data again
        # HACKY: esy.osmfilter code to re-create Data.pickle
        Data = osm_info.ReadJason(JSON_outputfile, verbose="no")
        DataDict = {"Data": Data}
        osm_pickle.picklesave(
            DataDict,
            os.path.realpath(
                os.path.join(os.getcwd(), os.path.dirname(JSON_outputfile))
            ),
        )
        print(f"Loading Pickle for {AFRICA_CC[country_code]}")  # TODO: Change to Logger
    else:
        create_elements = True
        new_prefilter_data = True
        print(
            f"Creating  New Elements for {AFRICA_CC[country_code]}"
        )  # TODO: Change to Logger

    prefilter = {
        Node: {"power": ["substation", "line", "generator"]},
        Way: {"power": ["substation", "line", "generator"]},
        Relation: {"power": ["substation", "line", "generator"]},
    }  # see https://dlr-ve-esy.gitlab.io/esy-osmfilter/filter.html for filter structures
    # HACKY: due to esy.osmfilter validation

    blackfilter = [
        ("", ""),
    ]

    for feature in ["substation", "line", "generator"]:
        whitefilter = [
            [
                ("power", feature),
            ],
        ]
        elementname = f"{country_code}_{feature}s"

        feature_data = run_filter(
            elementname,
            PBF_inputfile,
            JSON_outputfile,
            prefilter,
            whitefilter,
            blackfilter,
            NewPreFilterData=new_prefilter_data,
            CreateElements=create_elements,
            LoadElements=True,
            verbose=False,
            multiprocess=True,
        )

        if feature == "substation":
            substation_data = feature_data
        if feature == "line":
            line_data = feature_data
        if feature == "generator":
            generator_data = feature_data

    return (substation_data, line_data, generator_data)


# Convert Ways to Point Coordinates


# TODO: Use shapely and merge with convert_ways_lines
def convert_ways_nodes(df_way, Data):
    lonlat_column = []
    col = "refs"
    df_way[col] = (
        pd.Series().astype(float) if col not in df_way.columns else df_way[col]
    )  # create empty "refs" if not in dataframe
    for ref in df_way["refs"]:
        lonlats = []
        for r in ref:
            lonlat = Data["Node"][str(r)]["lonlat"]
            lonlats.append(lonlat)
        lonlats = np.array(lonlats)
        lonlat = np.mean(lonlats, axis=0)  # Hacky Apporx Centroid
        lonlat_column.append(lonlat)
    df_way.drop("refs", axis=1, inplace=True, errors="ignore")
    df_way.insert(0, "lonlat", lonlat_column)


# Convert Ways to Line Coordinates


def convert_ways_lines(df_way, Data):
    lonlat_column = []
    for ref in df_way["refs"]:  # goes through each row in df_way["refs"]
        lonlats = []
        # picks each element in ref & replaces ID by coordinate tuple (A multiline consist of several points)
        for r in ref:
            # "r" is the ID in Data["Node"], ["lonlat"] a list of [x1,y1] (coordinates)
            lonlat = Data["Node"][str(r)]["lonlat"]
            lonlat = tuple(lonlat)
            lonlats.append(lonlat)  # a list with tuples
        lonlat_column.append(lonlats)  # adding a new list of tuples every row
    df_way.drop("refs", axis=1, inplace=True)
    df_way.insert(1, "lonlat", lonlat_column)


# Convert Points Pandas Dataframe to GeoPandas Dataframe


def convert_pd_to_gdf(df_way):
    gdf = gpd.GeoDataFrame(df_way, geometry=[Point(x, y) for x, y in df_way.lonlat], crs="EPSG:4326")
    gdf.drop(columns=["lonlat"], inplace=True)
    return gdf


# Convert Lines Pandas Dataframe to GeoPandas Dataframe


def convert_pd_to_gdf_lines(df_way, simplified=False):
    df_way["geometry"] = df_way["lonlat"].apply(lambda x: LineString(x))
    if simplified is True:
        df_way["geometry"] = df_way["geometry"].apply(
            lambda x: x.simplify(0.005, preserve_topology=False)
        )
    gdf = gpd.GeoDataFrame(df_way, geometry="geometry", crs="EPSG:4326")
    gdf.drop(columns=["lonlat"], inplace=True)

    return gdf


# Convert Filtered Data, Elements to Pandas Dataframes


def convert_filtered_data_to_dfs(country_code, feature_data, feature):
    [Data, Elements] = feature_data
    elementname = f"{country_code}_{feature}s"
    df_way = pd.json_normalize(Elements[elementname]["Way"].values())
    df_node = pd.json_normalize(Elements[elementname]["Node"].values())
    return (df_node, df_way, Data)


def process_substation_data(country_code, substation_data):
    df_node, df_way, Data = convert_filtered_data_to_dfs(
        country_code, substation_data, "substation"
    )
    convert_ways_nodes(df_way, Data)
    # Add Type Column
    df_node["Type"] = "Node"
    df_way["Type"] = "Way"

    df_combined = pd.concat([df_node, df_way], axis=0)
    # Add Country Column
    df_combined["Country"] = AFRICA_CC[country_code]

    return df_combined


def process_line_data(country_code, line_data):
    df_node, df_way, Data = convert_filtered_data_to_dfs(
        country_code, line_data, "line"
    )
    convert_ways_lines(df_way, Data)
    # Add Type Column
    df_way["Type"] = "Way"

    # Add Country Column
    df_way["Country"] = AFRICA_CC[country_code]
    return df_way


def process_generator_data(country_code, generator_data):
    df_node, df_way, Data = convert_filtered_data_to_dfs(
        country_code, generator_data, "generator"
    )
    convert_ways_nodes(df_way, Data)
    # Add Type Column
    df_node["Type"] = "Node"
    df_way["Type"] = "Way"

    df_combined = pd.concat([df_node, df_way], axis=0)
    # Add Country Column
    df_combined["Country"] = AFRICA_CC[country_code]

    return df_combined


def process_data():
    df_all_substations = pd.DataFrame()
    df_all_lines = pd.DataFrame()
    df_all_generators = pd.DataFrame()
    test_CC = {"NG": "nigeria"}
    for country_code in test_CC.keys():
        substation_data, line_data, generator_data = download_and_filter(country_code)
        for feature in ["substation", "line", "generator"]:
            if feature == "substation":
                df_substation = process_substation_data(country_code, substation_data)
                df_all_substations = pd.concat([df_all_substations, df_substation])
            if feature == "line":
                df_line = process_line_data(country_code, line_data)
                df_all_lines = pd.concat([df_all_lines, df_line])
            if feature == "generator":
                df_generator = process_generator_data(country_code, generator_data)
                df_all_generators = pd.concat([df_all_generators, df_generator])

    # ----------- SUBSTATIONS -----------

    # Columns of interest
    df_all_substations = df_all_substations[
        df_all_substations.columns &
        [
            "id",
            "lonlat",
            "tags.power",
            "tags.substation",
            "tags.voltage",
            "tags.frequency",
            "Type",
            "Country",
        ]
    ]
    df_all_substations.drop(df_all_substations.loc[df_all_substations['tags.substation']=='industrial'].index, inplace=True) # Drop industrial substations
    df_all_substations.drop(df_all_substations.loc[df_all_substations['tags.substation']=='distribution'].index, inplace=True) # Drop distribution substations

    # Generate Files
    outputfile_partial = os.path.join(
        os.getcwd(), "data", "africa_all" + "_substations."
    )
    df_all_substations.to_csv(outputfile_partial + "csv")  # Generate CSV
    gdf_substations = convert_pd_to_gdf(df_all_substations)
    gdf_substations.to_file(
        outputfile_partial + "geojson", driver="GeoJSON"
    )  # Generate GeoJson

    # ----------- LINES -----------

    # Columns of interest
    df_all_lines = df_all_lines[
        df_all_lines.columns &
        [
            "id",
            "lonlat",
            "tags.power",
            "tags.cables",
            "tags.voltage",
            "tags.circuits",
            "tags.frequency",
            "Type",
            "Country",
        ]
    ]
    # Generate Files
    outputfile_partial = os.path.join(os.getcwd(), "data", "africa_all" + "_lines.")
    df_all_lines.to_csv(outputfile_partial + "csv")  # Generate CSV
    gdf_lines = convert_pd_to_gdf_lines(df_all_lines, simplified=True)
    gdf_lines.to_file(
        outputfile_partial + "geojson", driver="GeoJSON"
    )  # Generate GeoJson

    # ----------- Generator -----------

    # Columns of interest
    df_all_generators = df_all_generators[
        df_all_generators.columns &
        [
            "id",
            "lonlat",
            "tags.power",
            "tags.generator:type",
            "tags.generator:method",
            "tags.generator:source",
            "tags.generator:output:electricity",
            "Type",
            "Country",
        ]
    ]
    # Generate Files
    outputfile_partial = os.path.join(
        os.getcwd(), "data", "africa_all" + "_generators."
    )
    df_all_generators.to_csv(outputfile_partial + "csv")  # Generate CSV
    gdf_generators = convert_pd_to_gdf(df_all_generators)
    gdf_generators.to_file(
        outputfile_partial + "geojson", driver="GeoJSON"
    )  # Generate GeoJson


# Overwrite Functions

In [127]:
# New function

def lonlat_lookup(df_way, Data):
    lonlat_list = []

    col = "refs"
    if col not in df_way.columns:
        print ("refs column not found")
        df_way[col] = pd.Series().astype(float) # create empty "refs" if not in dataframe
      
    for ref in df_way["refs"]:
        lonlat_row = []
        for r in ref:
            lonlat = tuple(Data["Node"][str(r)]["lonlat"])
            lonlat_row.append(lonlat)
        lonlat_list.append(lonlat_row)
    return lonlat_list

from shapely.geometry import Polygon

def convert_ways_point(df_way, Data):
    lonlat_list = lonlat_lookup(df_way, Data)
    lonlat_column = []
    area_column = []
    for lonlat in lonlat_list:
        way_polygon = Polygon(lonlat)
        polygon_area = int(round(gpd.GeoSeries(way_polygon).set_crs("EPSG:4326").to_crs("EPSG:3857").area, -1)) # nearest tens
        # print('{:g}'.format(float('{:.3g}'.format(float(polygon_area))))) # For significant numbers
        area_column.append(polygon_area)
        center_point = way_polygon.centroid
        lonlat_column.append(list((center_point.x, center_point.y)))

    # df_way.drop("refs", axis=1, inplace=True, errors="ignore")
    df_way.insert(0, "lonlat", lonlat_column)
    

In [44]:


# TODO: Use shapely and merge with convert_ways_lines
def convert_ways_nodes(df_way, Data):
    convert_ways_point(df_way, Data)
    # lonlat_column = []
    # col = "refs"
    # df_way[col] = (
    #     pd.Series().astype(float) if col not in df_way.columns else df_way[col]
    # )  # create empty "refs" if not in dataframe
    # for ref in df_way["refs"]:
    #     lonlats = []
    #     for r in ref:
    #         lonlat = Data["Node"][str(r)]["lonlat"]
    #         lonlats.append(lonlat)
    #     lonlats = np.array(lonlats)
    #     lonlat = np.mean(lonlats, axis=0)  # Hacky Apporx Centroid
    #     lonlat_column.append(lonlat)
    # # df_way.drop("refs", axis=1, inplace=True, errors="ignore")
    # df_way.insert(0, "lonlat", lonlat_column)

In [43]:
# Convert Ways to Line Coordinates


# def convert_ways_lines(df_way, Data):
#     lonlat_column = []
#     for ref in df_way["refs"]:  # goes through each row in df_way["refs"]
#         lonlats = []
#         # picks each element in ref & replaces ID by coordinate tuple (A multiline consist of several points)
#         for r in ref:
#             # "r" is the ID in Data["Node"], ["lonlat"] a list of [x1,y1] (coordinates)
#             lonlat = Data["Node"][str(r)]["lonlat"]
#             lonlat = tuple(lonlat)
#             lonlats.append(lonlat)  # a list with tuples
#         lonlat_column.append(lonlats)  # adding a new list of tuples every row
#     df_way.drop("refs", axis=1, inplace=True)
#     df_way.insert(1, "lonlat", lonlat_column)


def convert_ways_lines(df_way, Data):
    lonlat_column = lonlat_lookup(df_way, Data)
    df_way.insert(1, "lonlat", lonlat_column)

In [60]:
def process_substation_data(country_code, substation_data):
    df_node, df_way, Data = convert_filtered_data_to_dfs(
        country_code, substation_data, "substation"
    )
    convert_ways_nodes(df_way, Data)
    # Add Type Column
    df_node["Type"] = "Node"
    df_way["Type"] = "Way"

    df_combined = pd.concat([df_node, df_way], axis=0)
    # Add Country Column
    df_combined["Country"] = AFRICA_CC[country_code]

    return df_combined

In [126]:
df_all_substations = pd.DataFrame()
df_all_lines = pd.DataFrame()
df_all_generators = pd.DataFrame()
test_CC = {"NG": "nigeria"}
for country_code in test_CC.keys():
    substation_data, line_data, generator_data = download_and_filter(country_code)
    for feature in ["substation", "line","generator"]:
        if feature == 'substation':
            df_substation = process_substation_data(country_code, substation_data)
            df_all_substations = pd.concat(
                [df_all_substations, df_substation])
        if feature == 'line':
            df_line = process_line_data(country_code, line_data)
            df_all_lines = pd.concat([df_all_lines, df_line])
        if feature == 'generator':
            df_generator = process_generator_data(country_code, generator_data)
            df_all_generators = pd.concat(
                [df_all_generators, df_generator])

Loading Pickle for nigeria
23020
22970
4260
5990
77640
3730
93820
2710
15000
63770
68100
5800
13100
42720
32260
150
1590
16310
254280
77260
7560
13050
46840
5570
12910
69450
2320
34810
43040
3540
9590
29330
6310
72420
17110
57530
12060
4300
12190
35480
44280
7900
960
83800
14190
13240
4820
77480
450
51110
7130
2140
3000
42300
85620
10170
22300
12220
12130
20620
22850
42820
2630
13680
4900
1760
9130
123440
53840
7680
11920
2460
47600
32850
62140
195350
37590
23990
3520
5270
9430
8020
1620
210
4090
840
26320
58130
41150
243330
45410
8640
17470
17320
12320
3580
13550
2750
22470
8490
96060
1560
2620
118790
2560
34430
16460
2310
6040
11680
15960
10600
11520
3770
14550
88970
1360
10630
16760
12440
44920
6490
6040
102780
19800
76830
214840
13590
15210
14440
16310
70340
57550
3550
56980
7260
13060
54300
9140
5880
10290
72890
44380
20060
7090
25650
6930
13260
2210
13000
46320
7890
12540
28300
410
32570
51750
25270
2920
10040
61520
15710
19930
3200
16010
146670
10620
7860
150
30
450
170
170
240


# Substations

In [129]:
#----------- SUBSTATIONS -----------

# Clean
df_all_substations.reset_index(drop=True, inplace=True)
df_all_substations.dropna(thresh=len(df_all_substations)*0.25, axis=1, how='all', inplace = True) #Drop Columns with 75% values as N/A
df_all_substations.dropna(subset=['tags.voltage'], inplace = True) # Drop any substations with Voltage = N/A
df_all_substations.drop(df_all_substations.loc[df_all_substations['tags.substation']=='industrial'].index, inplace=True)
df_all_substations.drop(df_all_substations.loc[df_all_substations['tags.substation']=='distribution'].index, inplace=True)

# Generate Files
outputfile_partial = os.path.join(os.getcwd(),'data','africa_all'+'_substations.')
df_all_substations.to_csv(outputfile_partial + 'csv') # Generate CSV
gdf_substations = convert_pd_to_gdf(df_all_substations.drop('refs', 1))
gdf_substations.to_file(outputfile_partial+'geojson', driver="GeoJSON")  # Generate GeoJson

In [94]:
df_all_substations

Unnamed: 0,id,lonlat,tags.power,tags.substation,Type,refs,tags.voltage,tags.location,Country
0,2000011529,"[5.580595199999979, 6.357239900000047]",substation,,Node,,,,nigeria
1,2161137620,"[5.612709500000012, 6.298418899999993]",substation,,Node,,,,nigeria
2,2161121796,"[5.605644800000013, 6.292686199999994]",substation,,Node,,,,nigeria
3,5604720218,"[13.140950299999984, 11.811399999999967]",substation,,Node,,,,nigeria
4,3605463946,"[7.054147699999979, 4.792176100000002]",substation,distribution,Node,,,,nigeria
...,...,...,...,...,...,...,...,...,...
177,553840946,"[7.407979286971628, 6.430890898645915]",substation,,Way,"[5345637324, 5345637323, 5345637322, 534563622...",,,nigeria
178,745794408,"[3.5060829546752417, 6.625048358398509]",substation,,Way,"[6980276932, 6980276933, 6980276934, 698027693...",,,nigeria
179,108020974,"[3.247914379008802, 6.603102220631313]",substation,transmission,Way,"[1240204635, 1240204684, 1240204688, 306400478...",330000,outdoor,nigeria
180,564766375,"[6.328447635850983, 4.990186628300639]",substation,,Way,"[5440610911, 5440610910, 5440610909, 544061090...",,,nigeria


# Generators

In [35]:
# ----------- Generator -----------

df_all_generators.reset_index(drop=True, inplace=True)
df_all_generators.drop(columns = ["tags.fixme","tags.frequency","tags.name:ar","tags.building","tags.barrier"], inplace = True, errors='ignore')
df_all_generators = df_all_generators[df_all_generators['tags.generator:output:electricity'].astype(str).str.contains('MW')] #removes boolean 
df_all_generators['tags.generator:output:electricity'] = df_all_generators['tags.generator:output:electricity'].str.extract('(\d+)').astype(float)
df_all_generators.rename(columns = {'tags.generator:output:electricity':"power_output_MW"}, inplace = True)
# df_all_generators.dropna(thresh=len(df_all_generators)*0.25, axis=1, how='all', inplace=True) # Drop Columns with 75% values as N/A

# # Generate Files
# outputfile_partial = os.path.join(os.getcwd(),'data','africa_all'+'_generators.')
# df_all_generators.to_csv(outputfile_partial + 'csv') # Generate CSV
# gdf_generators = convert_pd_to_gdf(df_all_generators)
# gdf_generators.to_file(outputfile_partial+'geojson', driver="GeoJSON")  # Generate GeoJson

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [36]:
df_all_generators

Unnamed: 0,id,lonlat,tags.name,tags.power,tags.source,tags.generator:type,tags.generator:method,tags.generator:source,power_output_MW,tags.operator,tags.description,tags.manufacturer,tags.start_date,Type,refs,Country
5,7805160134,"[6.66302290000007, 5.39017029999977]",,generator,,gas_turbine,combustion,gas,25.0,,,,,Node,,nigeria
11,7808382697,"[5.645223299999995, 5.924997999999995]",,generator,http://nerp.abv.ng/index.php/power-plants/sape...,steam_turbine,combustion,gas,120.0,Sapele Power,,,,Node,,nigeria
12,7804096584,"[4.78933260000001, 9.139945099999972]",,generator,,kaplan_turbine,run-of-the-river,hydro,96.0,,,,,Node,,nigeria
14,7804096571,"[5.91642560000001, 5.54205519999997]",,generator,,gas_turbine,combustion,gas,25.0,,,,,Node,,nigeria
16,7805160138,"[6.66355510000007, 5.390166099999769]",,generator,,gas_turbine,combustion,gas,25.0,,,,,Node,,nigeria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,7808382705,"[5.6440999999999955, 5.927484399999995]",,generator,http://nerp.abv.ng/index.php/power-plants/sape...,gas_turbine,combustion,gas,75.0,Sapele Power,,,,Node,,nigeria
302,7808377978,"[8.272184699999995, 5.190881999999995]",,generator,,gas_turbine,combustion,gas,112.0,,,,,Node,,nigeria
306,215459541,"[6.658849819999984, 7.469320879999995]",GT13,generator,,,combustion,gas,138.0,,,,,Way,"[2249105432, 2249105417, 2249105438, 224910544...",nigeria
340,215459539,"[6.659025679999987, 7.469613599999995]",GT12,generator,,,combustion,gas,138.0,,,,,Way,"[2249105467, 2249105457, 2249105469, 224910547...",nigeria


# Lines

In [9]:
# ----------- LINES -----------

# Clean
# TODO: FIX Voltage Filter
# Some transmission lines carry multiple voltages, having voltage_V = 10000;20000  (two lines)
# The following code keeps only the first information before the semicolon..
# Needs to be corrected in future, creating two lines with the same bus ID.


df_all_lines = df_all_lines[
        {
            "id",
            "refs",
            "lonlat",
            "tags.power",
            "tags.cables",
            "tags.voltage",
            "tags.frequency",
            "Type",
            "Country",
        }
    ]

# Clean data    
df_all_lines = df_all_lines.reset_index(drop=True)
df_all_lines = df_all_lines.dropna(subset=['tags.voltage']) # Drop any lines with Voltage = N/A
df_all_lines = df_all_lines.rename(columns = {'tags.voltage':"voltage_V"}) 
df_all_lines['voltage_V'] = df_all_lines['voltage_V'].str.split('*').str[0] #just keeps the 
df_all_lines['voltage_V'] = df_all_lines['voltage_V'].str.split(';').str[0]
df_all_lines['voltage_V'] = df_all_lines['voltage_V'].apply(lambda x: pd.to_numeric(x, errors='coerce')).dropna() ## if cell can't converted to float -> drop
df_all_lines = df_all_lines[df_all_lines.voltage_V > 10000]
# df_all_lines['end_refs'] = 

# Generate Files
outputfile_partial = os.path.join(os.getcwd(), 'data', 'africa_all'+'_lines.')  
df_all_lines.to_csv(outputfile_partial + 'csv')  # Generate CSV
gdf_lines = convert_pd_to_gdf_lines(df_all_lines.drop('refs', 1), simplified=True)
gdf_lines.to_file(outputfile_partial+'geojson',
            driver="GeoJSON")  # Generate GeoJson

In [10]:
df_line_lookup = df_all_lines[['id','refs']]
display(df_line_lookup)

Unnamed: 0,id,refs
2,556530547,"[2725719269, 2725912800, 2725719240, 272571923..."
3,671706473,"[6290115060, 6290115059, 6290115058, 629011505..."
6,108020973,"[5198712394, 3064004782, 3064004786, 306400479..."
7,671606207,"[5282666532, 6289294708, 5282666526]"
8,575700599,"[5448403687, 5448403686]"
...,...,...
647,563719558,"[5432136688, 5432136687, 5432136686, 543213668..."
649,565151711,"[5443838099, 5443838095, 5443838091, 544383809..."
650,669957218,"[6274221446, 6274221432, 6274221440, 627422144..."
651,564385545,"[5437840807, 6291806172, 6291806173, 629180617..."


In [11]:
df_node_lookup = df_all_substations[['id','refs']]
df_node_lookup = df_node_lookup.dropna(subset=['refs']) # Drop any nodes with refs = N/A
display(df_node_lookup)

Unnamed: 0,id,refs
14,564755236,"[5440518752, 5440518751, 5440518750, 544051874..."
15,567535340,"[5462602236, 5462602234, 5462602233, 546260223..."
16,592191538,"[5652509019, 5652509020, 5652509121, 565250912..."
17,564458980,"[5438478937, 5438478936, 5438478935, 543847893..."
18,655273140,"[6139631838, 6139631837, 6139631836, 613963183..."
...,...,...
177,553840946,"[5345637324, 5345637323, 5345637322, 534563622..."
178,745794408,"[6980276932, 6980276933, 6980276934, 698027693..."
179,108020974,"[1240204635, 1240204684, 1240204688, 306400478..."
180,564766375,"[5440610911, 5440610910, 5440610909, 544061090..."


In [12]:
def flatten_df(df):
    out = []
    for n, row in df.iterrows():
        for item in row['refs']:
            row['flat_ref'] = item
            out += [row.copy()]

    flattened_df = pd.DataFrame(out)
    flattened_df.reset_index(drop=True, inplace=True)

    return flattened_df

In [13]:
way_f = flatten_df(df_line_lookup)
display(way_f)

Unnamed: 0,id,refs,flat_ref
0,556530547,"[2725719269, 2725912800, 2725719240, 272571923...",2725719269
1,556530547,"[2725719269, 2725912800, 2725719240, 272571923...",2725912800
2,556530547,"[2725719269, 2725912800, 2725719240, 272571923...",2725719240
3,556530547,"[2725719269, 2725912800, 2725719240, 272571923...",2725719238
4,556530547,"[2725719269, 2725912800, 2725719240, 272571923...",5368625277
...,...,...,...
32133,695800745,"[6533692645, 6533692646, 6533692647, 653369264...",6533703128
32134,695800745,"[6533692645, 6533692646, 6533692647, 653369264...",6533703129
32135,695800745,"[6533692645, 6533692646, 6533692647, 653369264...",6533703130
32136,695800745,"[6533692645, 6533692646, 6533692647, 653369264...",6533703131


In [14]:
node_f = flatten_df(df_node_lookup)
display(node_f)

Unnamed: 0,id,refs,flat_ref
0,564755236,"[5440518752, 5440518751, 5440518750, 544051874...",5440518752
1,564755236,"[5440518752, 5440518751, 5440518750, 544051874...",5440518751
2,564755236,"[5440518752, 5440518751, 5440518750, 544051874...",5440518750
3,564755236,"[5440518752, 5440518751, 5440518750, 544051874...",5440518747
4,564755236,"[5440518752, 5440518751, 5440518750, 544051874...",5440518748
...,...,...,...
1231,567184002,"[5460459805, 5460459804, 5460459803, 546045980...",5460459805
1232,567184002,"[5460459805, 5460459804, 5460459803, 546045980...",5460459804
1233,567184002,"[5460459805, 5460459804, 5460459803, 546045980...",5460459803
1234,567184002,"[5460459805, 5460459804, 5460459803, 546045980...",5460459802


In [15]:
df3 = way_f.merge(node_f, on='flat_ref', how='inner')
display(df3)

Unnamed: 0,id_x,refs_x,flat_ref,id_y,refs_y
0,553772749,"[5345033041, 5432224011, 5345036235, 534503304...",5345033046,553772750,"[5345033057, 5345033048, 5345033049, 534503305..."


In [16]:
all_line_refs = set()
for ref in df_all_lines["refs"]:  # goes through each row in df_way['refs']
    for r in ref:
        all_line_refs.add(r)



In [17]:
all_st_refs = set()
for ref in df_all_substations.dropna(subset=['refs'])["refs"]:  # goes through each row in df_way['refs']
    for r in ref:
        all_st_refs.add(r)

In [18]:
# all_line_refs

In [19]:
all_line_refs.intersection(all_st_refs)

{5345033046}