In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json

In [3]:
# Absolute location of the SDG-16 source table (raw string keeps the backslashes literal).
dataset_file_path = r"C:\Users\TARUN\Desktop\SDG16_Dataset_With_Roles_Updated.csv"

# Parse the CSV using its first row as the header, then renumber the rows from 0.
df = pd.read_csv(dataset_file_path, header=0, low_memory=False, encoding='utf-8')
df = df.reset_index(drop=True)

# Folder that holds the dataset file.
directory_path = os.path.dirname(dataset_file_path)
print("Directory path:", directory_path)

# Number of missing cells in each column.
missing_per_column = df.isnull().sum()
print("\nMissing values per column:\n", missing_per_column)


Directory path: C:\Users\TARUN\Desktop

Missing values per column:
 Country           0
Indicator         0
Indicator_Type    0
SeriesCode        0
Units             0
2000              0
2001              0
2002              0
2003              0
2004              0
2005              0
2006              0
2007              0
2008              0
2009              0
2010              0
2011              0
2012              0
2013              0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
Role              0
dtype: int64


In [4]:
# Number every distinct country and indicator (0-based, in order of first appearance).
country_names = df['Country'].unique()
indicator_names = df['Indicator'].unique()
country_code_map = dict(zip(country_names, range(len(country_names))))
indicator_code_map = dict(zip(indicator_names, range(len(indicator_names))))

# Attach the numeric codes to the dataframe as two new columns.
df['country_code'] = df['Country'].map(country_code_map)
df['indicator_code'] = df['Indicator'].map(indicator_code_map)

In [5]:
# Print original shape
print("Original shape:")
print(df.shape)

# Drop duplicate rows (across all columns); `df` is rebound to the de-duplicated frame
df = df.drop_duplicates()

# Print new shape after dropping duplicates
print("New shape after dropping duplicates:")
print(df.shape)


Original shape:
(756, 32)
New shape after dropping duplicates:
(756, 32)


In [6]:
import pycountry
from geopy.geocoders import ArcGIS
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
import os

# Step 1: Load Dataset
# Re-reads the source CSV and rebinds `df`, so the rest of this cell works from the
# file contents rather than from the `df` that earlier cells modified.
dataset_file_path = r"C:\Users\TARUN\Desktop\SDG16_Dataset_With_Roles_Updated.csv"
df = pd.read_csv(dataset_file_path, header=0, low_memory=False, encoding='utf-8').reset_index(drop=True)

# --------------------------------------------
# Step 2: Define Helper Functions
# --------------------------------------------
def remove_third_element(point):
    """Return the first two items of a tuple; any non-tuple value is returned unchanged."""
    return point[:2] if isinstance(point, tuple) else point

def get_code(name):
    """Look `name` up in pycountry and return its alpha-2 code, or None when the lookup fails."""
    try:
        match = pycountry.countries.lookup(name)
    except LookupError:
        # pycountry signals "no such country" with LookupError.
        return None
    return match.alpha_2

# --------------------------------------------
# Step 3: Setup Geocoder
# --------------------------------------------
# ArcGIS geocoder client from geopy.
geolocator = ArcGIS(user_agent="SDG16-Encoder")
# Wrap the geocode call in geopy's RateLimiter, configured with min_delay_seconds=1.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_location(row, country_col, code_col):
    """Geocode one row.

    The query is "<country name>, <country code>" when the code is truthy,
    otherwise the bare country name. Returns whatever `geocode` returns.
    """
    name = row[country_col]
    code = row[code_col]
    query = f"{name}, {code}" if code else name
    return geocode(query)

# --------------------------------------------
# Step 4: Geocode Unique Countries
# --------------------------------------------
# One row per distinct country, so each country is geocoded only once.
unique_countries = pd.DataFrame(df['Country'].unique(), columns=['Country'])
# alpha-2 code from pycountry, or None when the name is not recognised.
unique_countries['country_code'] = unique_countries['Country'].apply(get_code)
# Geocoder result per country (falsy when nothing was found; handled below).
unique_countries['location'] = unique_countries.apply(lambda row: get_location(row, 'Country', 'country_code'), axis=1)
# (latitude, longitude) pair, or None when there is no location.
unique_countries['point'] = unique_countries['location'].apply(lambda loc: (loc.latitude, loc.longitude) if loc else None)
# The pairs built above already have two items, so this pass leaves them unchanged.
unique_countries['point'] = unique_countries['point'].apply(remove_third_element)
unique_countries['latitude'] = unique_countries['point'].apply(lambda x: x[0] if x else None)
unique_countries['longitude'] = unique_countries['point'].apply(lambda x: x[1] if x else None)

# --------------------------------------------
# Step 5: Drop Existing Geo Columns if Present
# --------------------------------------------
# errors='ignore' makes this a no-op for columns that are absent.
# NOTE(review): 'location' and 'point' are also added by the merge below but are not
# dropped here - confirm this is fine given `df` is reloaded at the top of the cell.
df = df.drop(columns=['country_code', 'latitude', 'longitude'], errors='ignore')

# --------------------------------------------
# Step 6: Merge Geocoded Data Back to Main DataFrame
# --------------------------------------------
# Left join on 'Country': every row of `df` is kept and receives its country's geo columns.
df = df.merge(unique_countries[['Country', 'location', 'country_code', 'latitude', 'longitude', 'point']], on='Country', how='left')

# --------------------------------------------
# Step 7: Preview
# --------------------------------------------
print(df[['Country', 'country_code', 'latitude', 'longitude']].drop_duplicates().head())
df.head()


          Country country_code   latitude   longitude
0     Afghanistan           AF  33.831137   66.024712
1         Albania           AL  41.134553   20.064206
2         Algeria           DZ  28.144114    2.679966
3  American Samoa           AS -14.300688 -170.718116
4         Andorra           AD  42.545303    1.576286


Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,2000,2001,2002,2003,2004,...,2020,2021,2022,2023,Role,location,country_code,latitude,longitude,point
0,Afghanistan,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,0.0,...,6.59,4.02,0.0,0.0,Not Assigned,"(Afghanistan, (33.831137065, 66.024711797))",AF,33.831137,66.024712,"(33.831137065, 66.024711797)"
1,Albania,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,12.41,21.01,20.83,16.17,12.87,...,6.39,6.94,4.96,0.0,Not Assigned,"(Albania, (41.134553284, 20.064206431))",AL,41.134553,20.064206,"(41.134553284, 20.064206431)"
2,Algeria,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,0.0,...,4.48,4.69,5.27,0.0,Not Assigned,"(Algeria, (28.144113769, 2.679965933))",DZ,28.144114,2.679966,"(28.144113769, 2.679965933)"
3,American Samoa,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,1.71,12.03,5.18,6.94,...,0.0,0.0,0.0,0.0,Not Assigned,"(American Samoa, (-14.30068806, -170.718116122))",AS,-14.300688,-170.718116,"(-14.30068806, -170.718116122)"
4,Andorra,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,1.3,...,7.62,0.0,0.0,0.0,Not Assigned,"(Andorra, (42.545303201, 1.576286302))",AD,42.545303,1.576286,"(42.545303201, 1.576286302)"


In [7]:
import os

# Step 1: Choose directory to save in
directory_path = r"C:\Users\TARUN\Desktop"

# Step 2: Define the output path
geo_path = os.path.join(directory_path, 'GeoLocationInfo_SDG16.csv')

# Step 3: Save the updated geo-encoded DataFrame
# (index=False leaves the row index out of the file)
df.to_csv(geo_path, index=False, encoding='utf-8')

print(f"GeoLocationInfo_SDG16.csv saved for the NEW dataset at:\n{geo_path}")

GeoLocationInfo_SDG16.csv saved for the NEW dataset at:
C:\Users\TARUN\Desktop\GeoLocationInfo_SDG16.csv


In [8]:
import pandas as pd

# Step 1: Load the previously saved full GeoLocationInfo file.
# Only empty cells are parsed as missing. With pandas' default NA tokens the literal
# text "NA" is read as NaN, which would blank out the ISO alpha-2 code "NA" (Namibia)
# in the country_code column.
geo_df = pd.read_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_SDG16.csv",
    keep_default_na=False,
    na_values=[""],
)

# Step 1.5: Re-create indicator_code (because it might be missing earlier).
# Each distinct indicator gets a 0-based number in order of first appearance.
indicator_code_map = {indicator: idx for idx, indicator in enumerate(geo_df['Indicator'].unique())}
geo_df['indicator_code'] = geo_df['Indicator'].map(indicator_code_map)

# Step 2: Specify the columns to retain (this list also fixes the output column order)
columns_to_keep = [
    'Country', 'Indicator', 'Indicator_Type', 'SeriesCode', 'Units',
    '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
    '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
    '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
    'country_code', 'indicator_code', 'location', 'point',
    'latitude', 'longitude', 'Role'
]

# Step 3: Filter and select only these columns (raises KeyError if any is absent)
selected_geo_df = geo_df[columns_to_keep]

# Step 4: Save the final cleaned file
final_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_SDG16.csv"
selected_geo_df.to_csv(final_path, index=False, encoding='utf-8')

# Step 5: Display the result
print(selected_geo_df.head())
print(f"\nFinal cleaned file saved at: {final_path}")


          Country Indicator Indicator_Type   SeriesCode           Units  \
0     Afghanistan    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
1         Albania    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
2         Algeria    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
3  American Samoa    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
4         Andorra    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   

    2000   2001   2002   2003   2004  ...  2021  2022  2023  country_code  \
0   0.00   0.00   0.00   0.00   0.00  ...  4.02  0.00   0.0            AF   
1  12.41  21.01  20.83  16.17  12.87  ...  6.94  4.96   0.0            AL   
2   0.00   0.00   0.00   0.00   0.00  ...  4.69  5.27   0.0            DZ   
3   0.00   1.71  12.03   5.18   6.94  ...  0.00  0.00   0.0            AS   
4   0.00   0.00   0.00   0.00   1.30  ...  0.00  0.00   0.0            AD   

   indicator_code        location                           point   latitude  \
0     

In [9]:
import pandas as pd
import pycountry

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_SDG16.csv") 

# Step 2: Create a correction mapping for problematic country names
# Each value is either a name to look up in pycountry or a two-character code that
# safe_country_code returns directly (it tests len(value) == 2).
# NOTE(review): the Hong Kong and Macao entries resolve through "China", so they
# receive China's code rather than a code of their own - confirm this is intended.
# NOTE(review): "OTH" has three characters, so it is sent to the pycountry lookup
# instead of being returned as-is; the preview printed by this cell shows None for it.
country_name_fixes = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "China, Hong Kong Special Administrative Region": "China",
    "China, Macao Special Administrative Region": "China",
    "Iran (Islamic Republic of)": "Iran",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "Republic of Korea": "South Korea",
    "Republic of Moldova": "Moldova",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "United Republic of Tanzania": "Tanzania",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam",
    "Kosovo": "XK",  
    "Holy See": "VA",  
    "Iraq (Central Iraq)": "Iraq",
    "Iraq (Kurdistan Region)": "Iraq",
    "Netherlands (Kingdom of the)": "Netherlands",
    "State of Palestine": "PS",
    "Saint Helena": "SH",
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Channel Islands": "Jersey",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "Other non-specified areas in Eastern Asia": "OTH"  # Special case
}

# Step 3: Define a safe lookup function
def safe_country_code(name):
    """Resolve a country name to an alpha-2 code, or None when pycountry cannot.

    Names listed in `country_name_fixes` are replaced first; a two-character
    replacement is returned directly as the code. Everything else goes through
    `pycountry.countries.lookup`.
    """
    if name in country_name_fixes:
        lookup_key = country_name_fixes[name]
        if len(lookup_key) == 2:  # replacement is already a 2-letter code
            return lookup_key
    else:
        lookup_key = name

    try:
        country = pycountry.countries.lookup(lookup_key)
    except LookupError:
        return None
    return country.alpha_2

# Step 4: Fill missing country_code values
# Existing codes are kept; only missing ones are resolved from the country name.
df['country_code'] = [
    existing if pd.notna(existing) else safe_country_code(country)
    for country, existing in zip(df['Country'], df['country_code'])
]

# Step 5: Save updated DataFrame
fixed_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Fixed_SDG16.csv"
df.to_csv(fixed_path, index=False, encoding='utf-8')

# Optional: Preview rows where country_code is still missing (to double check)
still_missing = df['country_code'].isna()
print(df.loc[still_missing, ['Country', 'country_code']])

                                       Country country_code
133  Other non-specified areas in Eastern Asia         None
335  Other non-specified areas in Eastern Asia         None
523  Other non-specified areas in Eastern Asia         None


In [10]:
import pandas as pd
import pycountry

# Step 1: Load the updated file
df = pd.read_csv(r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Fixed_SDG16.csv")

# Step 2: Manual overrides for known missing ISO entries
# Name replacements: each value is looked up in pycountry, unless it is two characters
# long, in which case safe_country_code returns it directly.
country_name_fixes = {
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "Channel Islands": "Jersey",
    "Netherlands (Kingdom of the)": "Netherlands",
    "Kosovo": "XK",
    "Holy See": "VA",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "State of Palestine": "Palestine",
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom"
}

# Direct name -> code assignments; safe_country_code consults this table first,
# so these entries take precedence over country_name_fixes.
manual_code_override = {
    "State of Palestine": "PS",
    "Channel Islands": "JE",
    "Wallis and Futuna Islands": "WF",
    "United States Virgin Islands": "VI",
    "Kosovo": "XK",
    "Holy See": "VA"
}

# Step 3: Safe lookup function
def safe_country_code(name):
    """Resolve a country name to an alpha-2 code, or None when pycountry cannot.

    Resolution order: `manual_code_override` (returned as-is), then
    `country_name_fixes` (a two-character replacement is returned directly,
    otherwise the replacement is looked up), then a plain pycountry lookup.
    """
    if name in manual_code_override:
        return manual_code_override[name]

    if name in country_name_fixes:
        lookup_key = country_name_fixes[name]
        if len(lookup_key) == 2:
            return lookup_key
    else:
        lookup_key = name

    try:
        country = pycountry.countries.lookup(lookup_key)
    except LookupError:
        return None
    return country.alpha_2

# Step 4: Apply corrections where needed
# Existing codes are kept; only missing ones are resolved from the country name.
df['country_code'] = [
    existing if pd.notna(existing) else safe_country_code(country)
    for country, existing in zip(df['Country'], df['country_code'])
]

# Step 5: Tag any remaining null country codes as 'REGION'
resolved_codes = df['country_code'].fillna('REGION')
df['country_code'] = resolved_codes

# Step 6: Save the final corrected version
final_tagged_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Tagged_SDG16.csv"
df.to_csv(final_tagged_path, index=False, encoding='utf-8')

In [11]:
import pandas as pd
import os

# STEP 1: Load the already geoencoded dataset.
# Only empty cells are parsed as missing. With pandas' default NA tokens the literal
# text "NA" in country_code (the ISO alpha-2 code for Namibia) is read as NaN, and the
# blanket fillna(0) in STEP 7 would then overwrite that code with 0.
geo_df = pd.read_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Tagged_SDG16.csv",
    keep_default_na=False,
    na_values=[""],
)

# STEP 2: Ensure consistent column naming
# If not already present, create a simplified "country_code" column for joining
# (1-based integer per distinct country).
if 'country_code' not in geo_df.columns:
    geo_df['country_code'] = pd.factorize(geo_df['Country'])[0] + 1

# STEP 3: Separate torchbearers and beneficiaries
# One row per country that has at least one row with the given Role.
torchbearers_df = geo_df[geo_df['Role'] == 'Torchbearer'].drop_duplicates(subset='Country')[['Country', 'latitude', 'longitude']].copy()
beneficiaries_df = geo_df[geo_df['Role'] == 'Beneficiary'].drop_duplicates(subset='Country')[['Country', 'latitude', 'longitude']].copy()

# STEP 4: Rename columns to make them distinct
torchbearers_df = torchbearers_df.rename(columns={
    'latitude': 'torchbearer_lat',
    'longitude': 'torchbearer_long'
})

beneficiaries_df = beneficiaries_df.rename(columns={
    'latitude': 'beneficiary_lat',
    'longitude': 'beneficiary_long'
})

# STEP 5: Merge coordinates back into the main dataframe
# Left joins on 'Country': rows of countries without that role get NaN in the new columns.
merged_df = geo_df.copy()
merged_df = merged_df.merge(torchbearers_df, on='Country', how='left')
merged_df = merged_df.merge(beneficiaries_df, on='Country', how='left')

# STEP 6: Drop duplicate rows if any were introduced
merged_df = merged_df.drop_duplicates()

# STEP 7: Assign null values with 0
# NOTE(review): this fills every column, so a missing role coordinate becomes 0.0,
# which is also a valid latitude/longitude - confirm downstream code expects that.
merged_df = merged_df.fillna(0)

# STEP 8: Save the result
save_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Merged_With_Roles_SDG16.csv"
merged_df.to_csv(save_path, index=False)

# Optional: Quick check
print(merged_df.head())

          Country Indicator Indicator_Type   SeriesCode           Units  \
0     Afghanistan    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
1         Albania    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
2         Algeria    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
3  American Samoa    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
4         Andorra    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   

    2000   2001   2002   2003   2004  ...  indicator_code        location  \
0   0.00   0.00   0.00   0.00   0.00  ...               0     Afghanistan   
1  12.41  21.01  20.83  16.17  12.87  ...               0         Albania   
2   0.00   0.00   0.00   0.00   0.00  ...               0         Algeria   
3   0.00   1.71  12.03   5.18   6.94  ...               0  American Samoa   
4   0.00   0.00   0.00   0.00   1.30  ...               0         Andorra   

                            point   latitude   longitude          Role  \
0    (33.831

In [12]:
import pandas as pd
import os

# STEP 1: Load the SDG-16 geolocation-enhanced dataset
geo_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Merged_With_Roles_SDG16.csv"
temp_df = pd.read_csv(geo_path, encoding="utf-8")

# STEP 2: Sort for consistency (both keys ascending, which is the default order)
temp_df = temp_df.sort_values(by=['Country', 'Role'])

# STEP 3: Extract torchbearer and beneficiary country lists
role_column = temp_df['Role']
torchbearer_countries_list = temp_df.loc[role_column == 'Torchbearer', 'Country'].unique()
beneficiary_countries_list = temp_df.loc[role_column == 'Beneficiary', 'Country'].unique()

# STEP 4: Print useful context
print("Number of torchbearer countries:", len(torchbearer_countries_list))
print("Number of beneficiary countries:", len(beneficiary_countries_list))


Number of torchbearer countries: 16
Number of beneficiary countries: 25


In [13]:
import pandas as pd
import os

# STEP 1: Load the geoencoded SDG-16 dataset.
# Only empty cells are parsed as missing. With pandas' default NA tokens the literal
# text "NA" in country_code (the ISO alpha-2 code for Namibia) is read as NaN, and the
# fillna(0) in STEP 3 would then overwrite that code with 0.
geo_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Merged_With_Roles_SDG16.csv"
df = pd.read_csv(geo_path, encoding="utf-8", keep_default_na=False, na_values=[""])

# STEP 2: Melt the year columns (2000–2023) to long format:
# one output row per (original row, year), with the year in "year" and the cell in "Value".
year_columns = [str(year) for year in range(2000, 2024)]
df_melted = df.melt(
    id_vars=[
        "Country", "Indicator", "Indicator_Type", "SeriesCode", "Units",
        "Role", "country_code", "latitude", "longitude",
        "torchbearer_lat", "torchbearer_long", "beneficiary_lat", "beneficiary_long"
    ],
    value_vars=year_columns,
    var_name="year",
    value_name="Value"
)

# STEP 3: Fill NaN with 0
df_melted.fillna(0, inplace=True)

# STEP 4: Convert year to int and create 3-year bins
df_melted["year"] = df_melted["year"].astype(int)
min_year = df_melted["year"].min()
max_year = df_melted["year"].max()

# Generate 3-year bin intervals as inclusive (start, end) pairs, starting at min_year.
# The last bin may extend past max_year when the span is not a multiple of 3.
bin_ranges = [(start, start + 2) for start in range(min_year, max_year + 1, 3)]

def map_to_3yr_bin(year, bins=None):
    """Return the "<start>-<end>" label of the bin that contains `year`.

    Parameters
    ----------
    year : int
        Year to classify.
    bins : iterable of (start, end) pairs, optional
        Inclusive bin boundaries. When omitted, the module-level `bin_ranges`
        is used, which is what the single-argument call in this notebook relies on.

    Returns
    -------
    str
        Label of the first matching bin, or "Other" when no bin contains `year`.
    """
    candidates = bin_ranges if bins is None else bins
    for start, end in candidates:
        if start <= year <= end:
            return f"{start}-{end}"
    return "Other"

# Label every row with the 3-year bin its year falls into.
df_melted["year_interval"] = df_melted["year"].apply(map_to_3yr_bin)

# STEP 5: Normalize 'Value' column per Indicator_Type
# Min-max scaling within each indicator type onto [0.01, 2]; a type whose values are
# all equal gets 1.0 everywhere.
df_melted["normalized_value"] = 0.0
for indicator in df_melted["Indicator_Type"].unique():
    mask = df_melted["Indicator_Type"] == indicator
    group_values = df_melted.loc[mask, "Value"]
    min_val = group_values.min()
    max_val = group_values.max()
    if max_val != min_val:
        scaled = 0.01 + ((group_values - min_val) / (max_val - min_val) * (2 - 0.01))
    else:
        scaled = 1.0
    df_melted.loc[mask, "normalized_value"] = scaled

# STEP 6: Assign ring_location based on year_interval
def assign_ring_locations(df_input):
    """Add a 'ring_location' column derived from 'year_interval' and return the frame.

    The distinct interval labels are sorted; the first gets -135 and each following
    one is 35 higher. The column is written onto `df_input` itself.
    """
    ordered_labels = list(df_input["year_interval"].unique())
    ordered_labels.sort()

    position_by_label = {}
    for rank, label in enumerate(ordered_labels):
        position_by_label[label] = 35 * rank - 135

    df_input["ring_location"] = df_input["year_interval"].map(position_by_label)
    return df_input

# Adds the 'ring_location' column (the helper writes it onto the frame it is given
# and returns that same frame).
df_melted = assign_ring_locations(df_melted)

# STEP 7: Save the final normalized file
norm_path = r"C:\Users\TARUN\Desktop\Normalized_SDG16_Data_Geo_Final.csv"
df_melted.to_csv(norm_path, index=False, encoding='utf-8')

In [16]:
import pandas as pd
import time
import os

# Load normalized SDG-16 dataset.
# Only empty cells are parsed as missing, so a literal "NA" in country_code (the ISO
# alpha-2 code for Namibia) stays text instead of being read as NaN.
normalized_df = pd.read_csv(
    r"C:\Users\TARUN\Desktop\Normalized_SDG16_Data_Geo_Final.csv",
    keep_default_na=False,
    na_values=[""],
)

# Load node and tag templates (their rows seed every generated node / tag record)
node_template = pd.read_csv(r"C:\Users\TARUN\Desktop\np_node-template.csv")
tag_template = pd.read_csv(r"C:\Users\TARUN\Desktop\Tag_Template_SDG-16.csv")
Colors = pd.read_csv(r"C:\Users\TARUN\Desktop\colors.csv")


In [17]:
# Initialize node and tag DataFrames
# Empty frames that carry only the templates' column layout; the node-building
# functions append rows to them.
node_df = pd.DataFrame(columns=node_template.columns)
tag_df = pd.DataFrame(columns=tag_template.columns)

In [18]:
# Assign node IDs to countries: 1-based, in order of first appearance.
distinct_countries = normalized_df["Country"].unique()
country_node_map = dict(zip(distinct_countries, range(1, len(distinct_countries) + 1)))
normalized_df["node_id"] = normalized_df["Country"].map(country_node_map)

# Define z_layer assignment based on Role
def z_layer(role):
    """Map a Role value to its z layer: Torchbearer -> 20, Beneficiary -> 0, anything else -> 10."""
    if role == "Torchbearer":
        return 20
    if role == "Beneficiary":
        return 0
    return 10

# Derive a per-row z_layer from that row's Role value.
normalized_df["z_layer"] = normalized_df["Role"].apply(z_layer)


In [19]:
# Show the frame as the cell output to inspect the added node_id / z_layer columns.
normalized_df

Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,Role,country_code,latitude,longitude,torchbearer_lat,torchbearer_long,beneficiary_lat,beneficiary_long,year,Value,year_interval,normalized_value,ring_location,node_id,z_layer
0,Afghanistan,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AF,33.831137,66.024712,0.0,0.0,0.0,0.0,2000,0.0000,2000-2002,0.010000,-135,1,10
1,Albania,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AL,41.134553,20.064206,0.0,0.0,0.0,0.0,2000,12.4100,2000-2002,0.085107,-135,2,10
2,Algeria,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,DZ,28.144114,2.679966,0.0,0.0,0.0,0.0,2000,0.0000,2000-2002,0.010000,-135,3,10
3,American Samoa,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AS,-14.300688,-170.718116,0.0,0.0,0.0,0.0,2000,0.0000,2000-2002,0.010000,-135,4,10
4,Andorra,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AD,42.545303,1.576286,0.0,0.0,0.0,0.0,2000,0.0000,2000-2002,0.010000,-135,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18139,Uzbekistan,16.6.1,Primary government expenditures as a proportio...,GF_XPD_GBPC,PERCENT,Not Assigned,UZ,41.581346,63.421883,0.0,0.0,0.0,0.0,2023,0.0000,2021-2023,0.140276,110,199,10
18140,Vanuatu,16.6.1,Primary government expenditures as a proportio...,GF_XPD_GBPC,PERCENT,Not Assigned,VU,-15.241356,166.872757,0.0,0.0,0.0,0.0,2023,0.0000,2021-2023,0.140276,110,200,10
18141,Viet Nam,16.6.1,Primary government expenditures as a proportio...,GF_XPD_GBPC,PERCENT,Not Assigned,VN,16.166670,107.833330,0.0,0.0,0.0,0.0,2023,98.0583,2021-2023,0.854941,110,202,10
18142,Zambia,16.6.1,Primary government expenditures as a proportio...,GF_XPD_GBPC,PERCENT,Not Assigned,ZM,-14.468804,28.767973,0.0,0.0,0.0,0.0,2023,95.4879,2021-2023,0.836208,110,204,10


In [21]:
# Define SDG-16 Specific Color Map
# Keys are values of the 'Indicator' column; each value is a three-element list that is
# passed as `colors` to the node-building functions, which write it to
# color_r / color_g / color_b.
sdg_color_map = {
    '16.1.1': [100, 0, 255],   
    '16.3.2': [0, 100, 255],   
    '16.5.1': [255, 100, 0],   
    '16.6.1': [0, 255, 200]    
}

# Initialize ring_id and tag id counter
# ring_id starts one past the largest country node_id.
ring_id = normalized_df["node_id"].max() + 1  
# The tag id counter starts 100000 above that first ring id.
np_tag_id_counter = ring_id + 100000


In [22]:
# === FUNCTIONS ===

def create_country_node_tag(node_id, lat, long, z, colors, title, description):
    """Append one country pin to `node_df` and its label row to `tag_df`.

    Both rows start as copies of the first row of `node_template` / `tag_template`.
    The global tag id counter is advanced by one and the new value is shared by the
    pin and its tag. `long`/`lat`/`z` become translate_x/translate_y/translate_z and
    `colors[0..2]` become color_r/g/b.

    NOTE(review): the overrides are applied with pandas `Series.update`; confirm the
    templates contain every overridden field and that NaN inputs behave as intended.
    """
    global node_df, tag_df, np_tag_id_counter

    # Reserve the next tag id for this pin/tag pair.
    np_tag_id_counter += 1
    tag_id = np_tag_id_counter

    node_row = node_template.iloc[0].copy()
    node_row["np_node_id"] = node_id
    node_row["np_tag_id"] = tag_id
    node_row["record_id"] = node_id
    node_overrides = {
        'np_table_id': 1, 'parent_id': 0, 'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5,
        'translate_x': long, 'translate_y': lat, 'translate_z': z, 'np_geometry_id': 19,
        'np_topo_id': 6, 'np_color_id': 1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    tag_row = tag_template.iloc[0].copy()
    tag_row["np_tag_id"] = tag_id
    tag_row["record_id"] = node_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': description}
    tag_row.update(tag_overrides)

    node_df = pd.concat([node_df, pd.DataFrame([node_row])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag_row])], ignore_index=True)

def create_year_node_tag(ring_location, parent_id, ring_id, title):
    """Append one year-ring node (child of `parent_id`) to `node_df` and its tag to `tag_df`.

    The node starts as a copy of the second row of `node_template`, the tag as a copy
    of the first row of `tag_template`. `ring_id` is used as node id and record id,
    `ring_location` as translate_x, and the tag description is always 'Year Ring'.
    The global tag id counter is advanced by one.

    NOTE(review): the overrides are applied with pandas `Series.update`; confirm the
    templates contain every overridden field.
    """
    global node_df, tag_df, np_tag_id_counter

    # Reserve the next tag id for this node/tag pair.
    np_tag_id_counter += 1
    tag_id = np_tag_id_counter

    node_row = node_template.iloc[1].copy()
    node_row["np_node_id"] = ring_id
    node_row["np_tag_id"] = tag_id
    node_row["record_id"] = ring_id
    node_overrides = {
        'parent_id': parent_id, 'branch_level': 2, 'translate_x': ring_location,
        'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5, 'np_table_id': 1, 'np_color_id': 20,
        'color_r': 55, 'color_g': 190, 'color_b': 190
    }
    node_row.update(node_overrides)

    tag_row = tag_template.iloc[0].copy()
    tag_row["np_tag_id"] = tag_id
    tag_row["record_id"] = ring_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': 'Year Ring'}
    tag_row.update(tag_overrides)

    node_df = pd.concat([node_df, pd.DataFrame([node_row])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag_row])], ignore_index=True)

def create_petal_rings(petal_id, parent_id, location, title, colors):
    """Append one indicator "petal" node under `parent_id` to `node_df`, plus its tag.

    The node starts as a copy of the second row of `node_template`, the tag as a copy
    of the first row of `tag_template`. `petal_id` is used as node id, record id and
    np_data_id; `location` becomes translate_x; `colors[0..2]` become color_r/g/b.
    The tag description is always 'SDG'. The global tag id counter is advanced by one.

    NOTE(review): the overrides are applied with pandas `Series.update`; confirm the
    templates contain every overridden field.
    """
    global node_df, tag_df, np_tag_id_counter

    # Reserve the next tag id for this node/tag pair.
    np_tag_id_counter += 1
    tag_id = np_tag_id_counter

    node_row = node_template.iloc[1].copy()
    node_row["np_node_id"] = petal_id
    node_row["np_tag_id"] = tag_id
    node_row["record_id"] = petal_id
    node_row["np_data_id"] = petal_id
    node_overrides = {
        'parent_id': parent_id, 'branch_level': 3, 'translate_x': location,
        'scale_x': 1, 'scale_y': 1, 'scale_z': 1, 'np_table_id': 1, 'np_topo_id': 3,
        'ratio': 0.1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    tag_row = tag_template.iloc[0].copy()
    tag_row["np_tag_id"] = tag_id
    tag_row["record_id"] = petal_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': 'SDG'}
    tag_row.update(tag_overrides)

    node_df = pd.concat([node_df, pd.DataFrame([node_row])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag_row])], ignore_index=True)

def link_nodes(link_id, parent_id, child_id, title, colors, ratio):
    """Append one link record joining `parent_id` to `child_id` to `node_df`, plus its tag.

    Both rows start as copies of the first row of `node_template` / `tag_template`.
    `link_id` is used as node id and record id, the record's 'type' is set to 7,
    `ratio` is stored as given, and `colors[0..2]` become color_r/g/b. `title` is used
    for both the tag title and its description. The global tag id counter is advanced
    by one.

    NOTE(review): the overrides are applied with pandas `Series.update`; confirm the
    templates contain every overridden field and that a NaN `ratio` behaves as intended.
    """
    global node_df, tag_df, np_tag_id_counter

    # Reserve the next tag id for this link/tag pair.
    np_tag_id_counter += 1
    tag_id = np_tag_id_counter

    node_row = node_template.iloc[0].copy()
    node_row["np_node_id"] = link_id
    node_row["np_tag_id"] = tag_id
    node_row["record_id"] = link_id
    node_overrides = {
        'np_table_id': 1, 'np_geometry_id': 3, 'np_topo_id': 6, 'np_color_id': 20,
        'ratio': ratio, 'parent_id': parent_id, 'child_id': child_id, 'type': 7,
        'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    tag_row = tag_template.iloc[0].copy()
    tag_row["np_tag_id"] = tag_id
    tag_row["record_id"] = link_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': title}
    tag_row.update(tag_overrides)

    node_df = pd.concat([node_df, pd.DataFrame([node_row])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag_row])], ignore_index=True)

In [30]:
# === GENERATION ===

torchbearer_year_dict = {}
beneficiary_year_dict = {}
torchbearer_countries_set = set()
beneficiary_countries_set = set()

# Per-role bookkeeping: (countries already pinned, year-ring registry, pin colour, pin description).
# Rows whose Role is not listed here produce no nodes.
role_state = {
    "Torchbearer": (torchbearer_countries_set, torchbearer_year_dict, [0, 225, 0], "Torchbearer Country"),
    "Beneficiary": (beneficiary_countries_set, beneficiary_year_dict, [225, 0, 0], "Beneficiary Country"),
}

for i, row in normalized_df.iterrows():
    role = row["Role"]
    if role not in role_state:
        continue
    pinned_countries, year_registry, pin_color, pin_description = role_state[role]

    country = row["Country"]
    interval = row["year_interval"]
    node_id = row["node_id"]

    # The country pin is created the first time the country is seen in this role.
    if country not in pinned_countries:
        create_country_node_tag(
            node_id, row["latitude"], row["longitude"], row["z_layer"],
            pin_color, country, pin_description
        )
        pinned_countries.add(country)

    # One year ring per (country, interval); later rows of the same pair are skipped.
    rings_of_country = year_registry.setdefault(country, {})
    if interval in rings_of_country:
        continue

    create_year_node_tag(row["ring_location"], node_id, ring_id, f"{interval} ({role})")
    rings_of_country[interval] = []

    # One petal per distinct indicator the country has in this interval. Petal ids
    # follow the ring id directly; positions start at -15 and advance in steps of 30.
    group_df = normalized_df[
        (normalized_df["Country"] == country) &
        (normalized_df["year_interval"] == interval)
    ]
    petal_id = ring_id
    for slot, indicator_value in enumerate(group_df["Indicator"].unique(), start=1):
        petal_id = ring_id + slot
        location = -45 + 30 * slot
        color = sdg_color_map.get(indicator_value, [0, 0, 0])
        create_petal_rings(petal_id, ring_id, location, indicator_value, color)
        rings_of_country[interval].append({indicator_value: petal_id})

    # The next ring id continues after the last petal id handed out.
    ring_id = petal_id + 1


In [None]:
# === LINK GENERATION ===

link_id = ring_id + 1

# Node ids that actually exist in node_df, collected once so membership tests are O(1)
# instead of scanning the column for every row.
valid_ids = set(node_df["np_node_id"])

# Build fresh torchbearer and beneficiary dictionaries from current node_df:
#   {country: {year_interval: [{indicator: node_id}, ...]}}
torchbearer_year_dict_link = {}
beneficiary_year_dict_link = {}
dict_for_role = {
    "Torchbearer": torchbearer_year_dict_link,
    "Beneficiary": beneficiary_year_dict_link,
}

# (role, country, interval, indicator) combinations already recorded. Every year of an
# interval has its own row, so without this each combination would be stored several
# times and every link below would be emitted once per duplicate pair.
recorded = set()

for _, row in normalized_df.iterrows():
    role = row["Role"]
    target_dict = dict_for_role.get(role)
    if target_dict is None:
        # Rows with any other Role (e.g. "Not Assigned") are neither link sources nor
        # link targets; they must not fall through into the beneficiary dictionary.
        continue

    node_id = row["node_id"]
    # Skip if node_id is not actually generated
    if node_id not in valid_ids:
        continue

    country = row["Country"]
    interval = row["year_interval"]
    indicator = row["Indicator"]

    key = (role, country, interval, indicator)
    if key in recorded:
        continue
    recorded.add(key)

    target_dict.setdefault(country, {}).setdefault(interval, []).append({indicator: node_id})

# Create links without any limit
# NOTE(review): this loop runs over every row of normalized_df and, for each row, links
# every torchbearer to every beneficiary that shares the row's interval and indicator,
# using that row's normalized_value as the ratio regardless of which country the row
# belongs to. That still yields one link per (row, torchbearer, beneficiary) triple -
# confirm the intended source of `ratio` and whether one link per pair is wanted.
for _, row in normalized_df.iterrows():
    interval = row["year_interval"]
    indicator = row["Indicator"]
    value = row["normalized_value"]

    # Safely skip unknown indicators
    if indicator not in sdg_color_map:
        continue

    color = sdg_color_map[indicator]

    for tb_country, tb_data in torchbearer_year_dict_link.items():
        if interval not in tb_data:
            continue
        for tb_entry in tb_data[interval]:
            tb_id = tb_entry.get(indicator)
            if not tb_id or tb_id not in valid_ids:
                continue
            for b_country, b_data in beneficiary_year_dict_link.items():
                if interval not in b_data:
                    continue
                for b_entry in b_data[interval]:
                    b_id = b_entry.get(indicator)
                    if not b_id or b_id not in valid_ids:
                        continue
                    link_nodes(
                        link_id,
                        parent_id=tb_id,
                        child_id=b_id,
                        title=f"{indicator} | {tb_country} → {b_country}",
                        colors=color,
                        ratio=value
                    )
                    link_id += 1


In [None]:
# === FINAL CLEANUP ===

# Remove any rows with missing node IDs or tag IDs
node_df = node_df.dropna(subset=["np_node_id"]).copy()
tag_df = tag_df.dropna(subset=["np_tag_id"]).copy()

# Define columns that should be integers
node_ints = [
    'np_node_id', 'type', 'np_data_id', 'selected', 'parent_id', 'branch_level', 'child_id', 'np_tag_id',
    'np_palette_id', 'np_ch_in_id', 'np_ch_out_id', 'ch_sync_time', 'np_palette_id_alt', 'np_color_id_alt',
    'np_material_id', 'np_geometry_id', 'np_color_id', 'color_fade', 'np_texture_id', 'hide', 'freeze',
    'np_topo_id', 'subspace', 'trigger_hi_x', 'trigger_hi_y', 'trigger_hi_z', 'trigger_lo_x', 'trigger_lo_y',
    'trigger_lo_z', 'proximity_x', 'proximity_y', 'proximity_z', 'proximity_mode_x', 'proximity_mode_y',
    'proximity_mode_z', 'segments_x', 'segments_y', 'segments_z', 'tag_mode', 'np_format_id', 'np_table_id', 'size'
]

# Keep only the integer columns that are actually present, so the cast cannot hit a
# missing column.
existing_node_ints = []
for col in node_ints:
    if col in node_df.columns:
        existing_node_ints.append(col)

# Convert specified columns to integer type
node_df[existing_node_ints] = node_df[existing_node_ints].astype(int)

# Convert important ID columns to int64
node_df['record_id'] = node_df['record_id'].astype('int64')
tag_id_columns = ['np_tag_id', 'record_id']
tag_df[tag_id_columns] = tag_df[tag_id_columns].astype('int64')


In [None]:
# Save node and tag DataFrames to CSV
# mode='w' overwrites an existing file; index=False leaves the row index out;
# lineterminator='\r\n' writes CRLF line endings.
node_df.to_csv(r"C:\Users\TARUN\Desktop\np_node_SDG-16_Global.csv", mode='w', index=False, encoding='utf-8', lineterminator='\r\n')
tag_df.to_csv(r"C:\Users\TARUN\Desktop\np_tag_SDG-16_Global.csv", mode='w', index=False, encoding='utf-8', lineterminator='\r\n')