In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json

In [2]:
# Location of the combined SDG-6 country dataset (LOCAL path)
dataset_file_path = r"C:\Users\TARUN\Desktop\Combined_SDG6_Countries_Data_Corrected.csv"

# Folder that contains the dataset file
directory_path = os.path.dirname(dataset_file_path)

# Read the CSV (first row is the header) and renumber the rows 0..n-1
df = pd.read_csv(
    dataset_file_path,
    header=0,
    low_memory=False,
    encoding='utf-8',
).reset_index(drop=True)

print("Directory path:", directory_path)

# Count of missing cells in every column
print("\nMissing values per column:\n", df.isna().sum())


Directory path: C:\Users\TARUN\Desktop

Missing values per column:
 Country           0
Indicator         0
Indicator_Type    0
SeriesCode        0
Units             0
2000              0
2001              0
2002              0
2003              0
2004              0
2005              0
2006              0
2007              0
2008              0
2009              0
2010              0
2011              0
2012              0
2013              0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
Role              0
dtype: int64


In [4]:
# Number every distinct country and indicator (0-based, in order of first appearance)
country_code_map = {
    name: position
    for position, name in enumerate(pd.unique(df['Country']))
}
indicator_code_map = {
    name: position
    for position, name in enumerate(pd.unique(df['Indicator']))
}

# Attach the numeric codes as two new columns
df['indicator_code'] = df['Indicator'].map(indicator_code_map)
df.insert(df.columns.get_loc('indicator_code'), 'country_code', df['Country'].map(country_code_map))

In [5]:
# Shape before de-duplication (label and value on separate lines)
print("Original shape:", df.shape, sep="\n")

# Remove rows that are identical across all columns, keeping the first occurrence
df = df.drop_duplicates(keep='first')

# Shape after de-duplication
print("New shape after dropping duplicates:", df.shape, sep="\n")


Original shape:
(137, 32)
New shape after dropping duplicates:
(137, 32)


In [6]:
import pycountry
from geopy.geocoders import ArcGIS
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
import os

# Step 1: Load Dataset (fresh read of the source CSV; rows renumbered from 0)
dataset_file_path = r"C:\Users\TARUN\Desktop\Combined_SDG6_Countries_Data_Corrected.csv"
df = pd.read_csv(
    filepath_or_buffer=dataset_file_path,
    header=0,
    low_memory=False,
    encoding='utf-8',
)
df = df.reset_index(drop=True)

# --------------------------------------------
# Step 2: Define Helper Functions
# --------------------------------------------
def remove_third_element(point):
    """Return the first two items of a tuple; return any non-tuple unchanged."""
    return point[:2] if isinstance(point, tuple) else point

def get_code(name):
    """Look up `name` with pycountry and return its alpha-2 code.

    Returns None when the lookup raises LookupError.
    """
    try:
        match = pycountry.countries.lookup(name)
    except LookupError:
        return None
    return match.alpha_2

# --------------------------------------------
# Step 3: Setup Geocoder
# --------------------------------------------
# ArcGIS geocoder wrapped so that consecutive calls are at least one second apart
geolocator = ArcGIS(user_agent="SDG6-Encoder")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_location(row, country_col, code_col):
    """Geocode one row.

    The query is "<country>, <code>" when the row carries a truthy code in
    `code_col`, otherwise just the country name from `country_col`.
    Returns whatever the rate-limited `geocode` callable returns.
    """
    name, code = row[country_col], row[code_col]
    query = f"{name}, {code}" if code else name
    return geocode(query)

# --------------------------------------------
# Step 4: Geocode Unique Countries
# --------------------------------------------
# One row per distinct country, so each country is geocoded only once
unique_countries = pd.DataFrame({'Country': df['Country'].unique()})
unique_countries['country_code'] = unique_countries['Country'].map(get_code)

# Row-wise geocoding; cells hold the geocoder result or None
unique_countries['location'] = unique_countries.apply(
    lambda record: get_location(record, 'Country', 'country_code'),
    axis=1,
)

# (latitude, longitude) tuple per country, None where geocoding returned nothing
unique_countries['point'] = unique_countries['location'].apply(
    lambda place: None if not place else (place.latitude, place.longitude)
)
unique_countries['point'] = unique_countries['point'].map(remove_third_element)
unique_countries['latitude'] = unique_countries['point'].map(lambda pt: pt[0] if pt else None)
unique_countries['longitude'] = unique_countries['point'].map(lambda pt: pt[1] if pt else None)

# --------------------------------------------
# Step 5: Drop Existing Geo Columns if Present
# --------------------------------------------
# errors='ignore' makes this a no-op for columns that are absent
df = df.drop(columns=['country_code', 'latitude', 'longitude'], errors='ignore')

# --------------------------------------------
# Step 6: Merge Geocoded Data Back to Main DataFrame
# --------------------------------------------
df = pd.merge(
    df,
    unique_countries[['Country', 'location', 'country_code', 'latitude', 'longitude', 'point']],
    on='Country',
    how='left',
)

# --------------------------------------------
# Step 7: Preview
# --------------------------------------------
# First distinct country/coordinate combinations, then the head of the full frame
print(df.loc[:, ['Country', 'country_code', 'latitude', 'longitude']].drop_duplicates().head())
df.head()


    Country country_code   latitude   longitude
0     China           CN  36.567348  103.930027
4   Austria           AT  46.633290   14.310900
5    France           FR  46.559417    2.550540
6   Germany           DE  51.110631   10.392278
7  Portugal           PT  39.593139   -8.519813


Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,2000,2001,2002,2003,2004,...,2020,2021,2022,2023,Role,location,country_code,latitude,longitude,point
0,China,6.3.1,Proportion of safely treated domestic wastewat...,EN_WWT_WWDS,PERCENT,0.0,0.0,0.0,0.0,0.0,...,64.77986,0.0,61.67235,0.0,Recipient,"(China, (36.567348398, 103.930027033))",CN,36.567348,103.930027,"(36.567348398, 103.930027033)"
1,China,6.3.2,Proportion of open water bodies with good ambi...,EN_H2O_OPAMBQ,PERCENT,0.0,0.0,0.0,0.0,0.0,...,76.8,0.0,0.0,73.8,Donor,"(China, (36.567348398, 103.930027033))",CN,36.567348,103.930027,"(36.567348398, 103.930027033)"
2,China,6.5.1,Water Management,ER_H2O_IWRMD_FI,PERCENT,0.0,0.0,0.0,0.0,0.0,...,82.0,0.0,0.0,78.0,Other,"(China, (36.567348398, 103.930027033))",CN,36.567348,103.930027,"(36.567348398, 103.930027033)"
3,China,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,509.09762,681.55751,234.89932,216.16684,212.64159,...,101.8576,59.25899,59.51261,0.0,Donor,"(China, (36.567348398, 103.930027033))",CN,36.567348,103.930027,"(36.567348398, 103.930027033)"
4,Austria,6.3.1,Proportion of safely treated domestic wastewat...,EN_WWT_WWDS,PERCENT,0.0,0.0,0.0,0.0,0.0,...,98.59576,0.0,98.20522,0.0,Donor,"(Austria, (46.63329, 14.3109))",AT,46.63329,14.3109,"(46.63329, 14.3109)"


In [7]:
import os

# Step 1: Target folder for the export
directory_path = r"C:\Users\TARUN\Desktop"

# Step 2: Full path of the output file
geo_path = os.path.join(directory_path, 'GeoLocationInfo.csv')

# Step 3: Write the geo-encoded DataFrame without the index column
df.to_csv(path_or_buf=geo_path, index=False, encoding='utf-8')

print("GeoLocationInfo.csv saved for the NEW dataset at:", geo_path, sep="\n")


GeoLocationInfo.csv saved for the NEW dataset at:
C:\Users\TARUN\Desktop\GeoLocationInfo.csv


In [9]:
import pandas as pd

# Step 1: Read back the full GeoLocationInfo export
geo_df = pd.read_csv(r"C:\Users\TARUN\Desktop\GeoLocationInfo.csv")

# Step 1.5: Rebuild indicator_code (0-based, in order of first appearance)
indicator_code_map = {
    name: position
    for position, name in enumerate(pd.unique(geo_df['Indicator']))
}
geo_df['indicator_code'] = geo_df['Indicator'].map(indicator_code_map)

# Step 2: Columns to retain - descriptors, the yearly columns '2000'..'2023', then geo fields and Role
columns_to_keep = (
    ['Country', 'Indicator', 'Indicator_Type', 'SeriesCode', 'Units']
    + [str(year) for year in range(2000, 2024)]
    + ['country_code', 'indicator_code', 'location', 'point',
       'latitude', 'longitude', 'Role']
)

# Step 3: Keep only those columns, in that order
selected_geo_df = geo_df.loc[:, columns_to_keep]

# Step 4: Write the trimmed frame to the desktop
selected_geo_df.to_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final.csv",
    index=False,
    encoding='utf-8',
)

# Step 5: Small preview of the result
print(selected_geo_df.head())


   Country Indicator                                     Indicator_Type  \
0    China     6.3.1  Proportion of safely treated domestic wastewat...   
1    China     6.3.2  Proportion of open water bodies with good ambi...   
2    China     6.5.1                                   Water Management   
3    China     6.a.1  Total official development assistance (gross d...   
4  Austria     6.3.1  Proportion of safely treated domestic wastewat...   

        SeriesCode      Units       2000       2001       2002       2003  \
0      EN_WWT_WWDS    PERCENT    0.00000    0.00000    0.00000    0.00000   
1    EN_H2O_OPAMBQ    PERCENT    0.00000    0.00000    0.00000    0.00000   
2  ER_H2O_IWRMD_FI    PERCENT    0.00000    0.00000    0.00000    0.00000   
3     DC_TOF_WASHL  CON_USD_M  509.09762  681.55751  234.89932  216.16684   
4      EN_WWT_WWDS    PERCENT    0.00000    0.00000    0.00000    0.00000   

        2004  ...      2021      2022  2023  country_code  indicator_code  \
0    0.00

In [12]:
import pandas as pd
import pycountry

# Step 1: Load the dataset (the trimmed geolocation export)
df = pd.read_csv(r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final.csv")  # Correct path

# Step 2: Create a correction mapping for problematic country names.
# Keys are country names as they appear in the dataset. Values are either an
# alternative name to look up, or a short upper-case code to use directly.
country_name_fixes = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "China, Hong Kong Special Administrative Region": "China",
    "China, Macao Special Administrative Region": "China",
    "Iran (Islamic Republic of)": "Iran",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "Republic of Korea": "South Korea",
    "Republic of Moldova": "Moldova",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "United Republic of Tanzania": "Tanzania",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam",
    "Kosovo": "XK",  # Kosovo isn't in pycountry; assign known ISO-like code
    "Holy See": "VA",  # Vatican
    "Iraq (Central Iraq)": "Iraq",
    "Iraq (Kurdistan Region)": "Iraq",
    "Netherlands (Kingdom of the)": "Netherlands",
    "State of Palestine": "PS",
    "Saint Helena": "SH",
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Channel Islands": "Jersey",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "Other non-specified areas in Eastern Asia": "OTH"  # Special case: three-letter placeholder, not an ISO alpha-2 code
}

# Step 3: Define a safe lookup function
def safe_country_code(name):
    """Resolve a dataset country name to a country code without raising.

    Resolution order:
      1. If `name` has an entry in `country_name_fixes` and that entry is a
         short all-caps token (at most three characters, e.g. "XK", "VA",
         "OTH"), the entry is returned as the code itself.
      2. Otherwise the corrected name (or `name` itself when no correction is
         registered) is looked up with pycountry and its alpha-2 code returned.

    Returns:
        The code as a string, or None when pycountry raises LookupError.
    """
    fixed = country_name_fixes.get(name)
    if fixed is None:
        # No manual correction registered: look the raw name up directly.
        candidate = name
    elif fixed.isupper() and len(fixed) <= 3:
        # The table stores some values as ready-made codes. Testing only for
        # length 2 would miss the three-letter placeholder "OTH", which then
        # fails the pycountry lookup and silently becomes None.
        return fixed
    else:
        candidate = fixed

    try:
        return pycountry.countries.lookup(candidate).alpha_2
    except LookupError:
        return None

# Step 4: Keep existing codes; resolve a code from the country name only where it is missing
df['country_code'] = [
    existing if pd.notna(existing) else safe_country_code(country)
    for existing, country in zip(df['country_code'], df['Country'])
]

# Step 5: Save updated DataFrame
df.to_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Fixed.csv",
    index=False,
    encoding='utf-8',
)
# Optional: list the rows whose country_code is still missing (to double check)
print(df.loc[df['country_code'].isna(), ['Country', 'country_code']])

Empty DataFrame
Columns: [Country, country_code]
Index: []


In [14]:
import pandas as pd
import pycountry

# Step 1: Load the updated file (output of the previous country-code fix)
df = pd.read_csv(r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Fixed.csv")

# Step 2: Manual overrides for known missing ISO entries.
# country_name_fixes maps a dataset country name to an alternative name to
# look up (or, for two-letter values, to a code used as-is).
country_name_fixes = {
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "Channel Islands": "Jersey",
    "Netherlands (Kingdom of the)": "Netherlands",
    "Kosovo": "XK",
    "Holy See": "VA",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "State of Palestine": "Palestine",
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom"
}

# manual_code_override maps a dataset country name straight to its code;
# the lookup function below consults it before country_name_fixes.
manual_code_override = {
    "State of Palestine": "PS",
    "Channel Islands": "JE",
    "Wallis and Futuna Islands": "WF",
    "United States Virgin Islands": "VI",
    "Kosovo": "XK",
    "Holy See": "VA"
}

# Step 3: Safe lookup function
def safe_country_code(name):
    """Resolve a dataset country name to an alpha-2 code without raising.

    Precedence: an entry in `manual_code_override` wins outright; next, a
    two-character entry in `country_name_fixes` is returned as-is; otherwise
    the corrected name (or `name` itself) is looked up with pycountry.
    Returns None when pycountry raises LookupError.
    """
    if name in manual_code_override:
        return manual_code_override[name]

    has_fix = name in country_name_fixes
    candidate = country_name_fixes[name] if has_fix else name
    if has_fix and len(candidate) == 2:
        return candidate

    try:
        match = pycountry.countries.lookup(candidate)
    except LookupError:
        return None
    return match.alpha_2

# Step 4: Keep existing codes; resolve a code from the country name only where it is missing
df['country_code'] = [
    existing if pd.notna(existing) else safe_country_code(country)
    for existing, country in zip(df['country_code'], df['Country'])
]

# Step 5: Anything still unresolved is labelled 'REGION'
df['country_code'] = df['country_code'].fillna(value='REGION')

# Step 6: Save the final corrected version
df.to_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Tagged.csv",
    index=False,
    encoding='utf-8',
)

In [15]:
import pandas as pd
import os

# STEP 1: Load the already geoencoded dataset
# Only empty cells are treated as missing. With pandas' default NA markers the
# ISO code "NA" (Namibia) would be parsed as NaN and then overwritten with 0 by
# the blanket fill in STEP 7. Missing values are written as empty cells by
# to_csv, so they are still recognised.
geo_df = pd.read_csv(
    r"C:\Users\TARUN\Desktop\GeoLocationInfo_Final_Tagged.csv",
    keep_default_na=False,
    na_values=[''],
)

# STEP 2: Ensure consistent column naming
# If not already present, create a simplified 1-based "country_code" column for joining
if 'country_code' not in geo_df.columns:
    geo_df['country_code'] = pd.factorize(geo_df['Country'])[0] + 1

# STEP 3: Separate donors and recipients (one coordinate row per country and role)
donors_df = geo_df[geo_df['Role'] == 'Donor'].drop_duplicates(subset='Country')[['Country', 'latitude', 'longitude']].copy()
recipients_df = geo_df[geo_df['Role'] == 'Recipient'].drop_duplicates(subset='Country')[['Country', 'latitude', 'longitude']].copy()

# STEP 4: Rename columns to make them distinct
donors_df = donors_df.rename(columns={
    'latitude': 'donor_lat',
    'longitude': 'donor_long'
})

recipients_df = recipients_df.rename(columns={
    'latitude': 'recipient_lat',
    'longitude': 'recipient_long'
})

# STEP 5: Merge coordinates back into the main dataframe.
# Left joins keep every row of geo_df; countries that never appear in a role
# get NaN in that role's coordinate columns. merge() returns a new frame, so
# geo_df itself is left untouched.
merged_df = geo_df.merge(donors_df, on='Country', how='left')
merged_df = merged_df.merge(recipients_df, on='Country', how='left')

# STEP 6: Drop duplicate rows if any were introduced
merged_df = merged_df.drop_duplicates()

# STEP 7: Assign null values with 0 (applies to every column, text columns included)
merged_df = merged_df.fillna(0)

# STEP 8: Save the result
save_path = r"C:\Users\TARUN\Desktop\GeoLocationInfo_Merged_With_Roles.csv"
merged_df.to_csv(save_path, index=False)

# Optional: Quick check
print(merged_df.head())


   Country Indicator                                     Indicator_Type  \
0    China     6.3.1  Proportion of safely treated domestic wastewat...   
1    China     6.3.2  Proportion of open water bodies with good ambi...   
2    China     6.5.1                                   Water Management   
3    China     6.a.1  Total official development assistance (gross d...   
4  Austria     6.3.1  Proportion of safely treated domestic wastewat...   

        SeriesCode      Units       2000       2001       2002       2003  \
0      EN_WWT_WWDS    PERCENT    0.00000    0.00000    0.00000    0.00000   
1    EN_H2O_OPAMBQ    PERCENT    0.00000    0.00000    0.00000    0.00000   
2  ER_H2O_IWRMD_FI    PERCENT    0.00000    0.00000    0.00000    0.00000   
3     DC_TOF_WASHL  CON_USD_M  509.09762  681.55751  234.89932  216.16684   
4      EN_WWT_WWDS    PERCENT    0.00000    0.00000    0.00000    0.00000   

        2004  ...  indicator_code  location                          point  \
0    0.0

In [16]:
import pandas as pd
import os

# STEP 1 + 2: Load the SDG-6 geolocation-enhanced dataset and order it by country, then role
geo_path = os.path.join(r"C:\Users\TARUN\Desktop", 'GeoLocationInfo_Merged_With_Roles.csv')
temp_df = pd.read_csv(geo_path, encoding="utf-8").sort_values(by=['Country', 'Role'], ascending=True)

# STEP 3: Distinct country names per role
donor_countries_list = temp_df.loc[temp_df['Role'] == 'Donor', 'Country'].unique()
recipient_countries_list = temp_df.loc[temp_df['Role'] == 'Recipient', 'Country'].unique()

# STEP 4: Report how many countries fall in each role
print(f"Number of donor countries: {len(donor_countries_list)}")
print(f"Number of recipient countries: {len(recipient_countries_list)}")


Number of donor countries: 18
Number of recipient countries: 5


In [18]:
import pandas as pd
import os

# STEP 1: Load the geoencoded SDG-6 dataset
# Only empty cells are treated as missing, so a literal "NA" country code
# (Namibia) is read as text instead of NaN and is not turned into 0 by STEP 3.
geo_path = os.path.join(r"C:\Users\TARUN\Desktop", 'GeoLocationInfo_Merged_With_Roles.csv')
df = pd.read_csv(geo_path, encoding="utf-8", keep_default_na=False, na_values=[''])

# STEP 2: Melt the year columns (2000–2023) to long format:
# one row per (country, indicator, year) with the measurement in "Value"
year_columns = [str(year) for year in range(2000, 2024)]
df_melted = df.melt(
    id_vars=[
        "Country", "Indicator", "Indicator_Type", "SeriesCode", "Units",
        "Role", "country_code", "latitude", "longitude",
        "donor_lat", "donor_long", "recipient_lat", "recipient_long"
    ],
    value_vars=year_columns,
    var_name="year",
    value_name="Value"
)

# STEP 3: Fill NaN with 0
df_melted = df_melted.fillna(0)

# STEP 4: Convert year to int and create 3-year bins
df_melted["year"] = df_melted["year"].astype(int)
min_year = int(df_melted["year"].min())
max_year = int(df_melted["year"].max())

# Generate 3-year bin intervals as inclusive (start, end) pairs: a new bin
# starts every third year from min_year, and the last bin may extend past max_year
bin_ranges = [(start, start + 2) for start in range(min_year, max_year + 1, 3)]

def map_to_3yr_bin(year, ranges=None):
    """Return the "start-end" label of the bin that contains `year`.

    Args:
        year: the year to classify.
        ranges: optional iterable of inclusive (start, end) pairs. When
            omitted, the module-level `bin_ranges` list is used, so existing
            single-argument calls keep working.

    Returns:
        "<start>-<end>" for the first pair with start <= year <= end,
        or "Other" when no pair matches.
    """
    if ranges is None:
        ranges = bin_ranges
    for start, end in ranges:
        if start <= year <= end:
            return f"{start}-{end}"
    return "Other"

# Label every row with the 3-year bin its year falls into
df_melted["year_interval"] = df_melted["year"].map(map_to_3yr_bin)

# STEP 5: Normalize 'Value' column per Indicator_Type
# Min-max scaling within each indicator type: the minimum maps to 0.01 and the
# maximum to 2.0; a type whose values are all equal gets 1.0 throughout.
df_melted["normalized_value"] = 0.0
for indicator in df_melted["Indicator_Type"].unique():
    mask = df_melted["Indicator_Type"] == indicator
    values = df_melted.loc[mask, "Value"]
    min_val, max_val = values.min(), values.max()
    if max_val != min_val:
        scaled = 0.01 + ((values - min_val) / (max_val - min_val) * (2 - 0.01))
    else:
        scaled = 1.0
    df_melted.loc[mask, "normalized_value"] = scaled

# STEP 6: Assign ring_location based on year_interval
def assign_ring_locations(df_input):
    """Add a "ring_location" column derived from "year_interval".

    The distinct interval labels are sorted; the first gets -135 and each
    following one is 35 higher. The column is written onto `df_input` itself,
    and the same object is returned.
    """
    ordered = sorted(df_input["year_interval"].unique())
    positions = range(-135, -135 + 35 * len(ordered), 35)
    ring_location_dict = dict(zip(ordered, positions))
    df_input["ring_location"] = df_input["year_interval"].map(ring_location_dict)
    return df_input

# Attach the ring position of every interval
df_melted = assign_ring_locations(df_input=df_melted)

# STEP 7: Write the final normalized file (no index column)
norm_path = os.path.join(r"C:\Users\TARUN\Desktop", 'Normalized_SDG6_Data_Geo_Final.csv')
df_melted.to_csv(path_or_buf=norm_path, index=False, encoding='utf-8')

In [19]:
import pandas as pd
import time
import os

# Normalized long-format dataset
normalized_df = pd.read_csv(filepath_or_buffer=r"C:\Users\TARUN\Desktop\Normalized_SDG6_Data_Geo_Final.csv")

# Template rows for nodes and tags, plus the colour table
node_template = pd.read_csv(filepath_or_buffer=r"C:\Users\TARUN\Desktop\np_node-template.csv")
tag_template = pd.read_csv(filepath_or_buffer=r"C:\Users\TARUN\Desktop\Tag_Template_SDG-6.csv")
Colors = pd.read_csv(filepath_or_buffer=r"C:\Users\TARUN\Desktop\colors.csv")


In [20]:
# Empty output frames that share the column layout of their templates
node_df, tag_df = (
    pd.DataFrame(columns=template.columns)
    for template in (node_template, tag_template)
)

In [21]:
# Give every distinct country a 1-based node id, in order of first appearance
country_node_map = {
    name: position
    for position, name in enumerate(normalized_df["Country"].unique(), start=1)
}
normalized_df["node_id"] = normalized_df["Country"].map(country_node_map)

# Define z_layer assignment based on Role
def z_layer(role):
    """Return the z value for a role: 20 for "Donor", 0 for "Recipient", 10 for anything else."""
    if role == "Donor":
        return 20
    if role == "Recipient":
        return 0
    return 10

# Derive the z value of every row from its Role
normalized_df["z_layer"] = normalized_df["Role"].map(z_layer)


In [22]:
# Display the frame for a visual check now that node_id and z_layer are attached
normalized_df

Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,Role,country_code,latitude,longitude,donor_lat,donor_long,recipient_lat,recipient_long,year,Value,year_interval,normalized_value,ring_location,node_id,z_layer
0,China,6.3.1,Proportion of safely treated domestic wastewat...,EN_WWT_WWDS,PERCENT,Recipient,CN,36.567348,103.930027,36.567348,103.930027,36.567348,103.930027,2000,0.00000,2000-2002,0.010000,-135,1,0
1,China,6.3.2,Proportion of open water bodies with good ambi...,EN_H2O_OPAMBQ,PERCENT,Donor,CN,36.567348,103.930027,36.567348,103.930027,36.567348,103.930027,2000,0.00000,2000-2002,0.010000,-135,1,20
2,China,6.5.1,Water Management,ER_H2O_IWRMD_FI,PERCENT,Other,CN,36.567348,103.930027,36.567348,103.930027,36.567348,103.930027,2000,0.00000,2000-2002,0.010000,-135,1,10
3,China,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Donor,CN,36.567348,103.930027,36.567348,103.930027,36.567348,103.930027,2000,509.09762,2000-2002,1.496455,-135,1,20
4,Austria,6.3.1,Proportion of safely treated domestic wastewat...,EN_WWT_WWDS,PERCENT,Donor,AT,46.633290,14.310900,46.633290,14.310900,0.000000,0.000000,2000,0.00000,2000-2002,0.010000,-135,2,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3283,Armenia,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Recipient,AM,40.293085,44.940221,0.000000,0.000000,40.293085,44.940221,2023,0.00000,2021-2023,0.010000,110,11,0
3284,Ukraine,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Recipient,UA,49.026898,31.374926,0.000000,0.000000,49.026898,31.374926,2023,0.00000,2021-2023,0.010000,110,44,0
3285,United States of America,6.3.1,Proportion of safely treated domestic wastewat...,EN_WWT_WWDS,PERCENT,Other,US,39.398703,-99.414619,39.398703,-99.414619,0.000000,0.000000,2023,0.00000,2021-2023,0.010000,110,46,10
3286,United States of America,6.3.2,Proportion of open water bodies with good ambi...,EN_H2O_OPAMBQ,PERCENT,Donor,US,39.398703,-99.414619,39.398703,-99.414619,0.000000,0.000000,2023,45.63000,2021-2023,0.918037,110,46,20


In [23]:
# Define SDG-6 Specific Color Map: indicator code -> [R, G, B]
sdg_color_map = dict([
    ('6.3.1', [0, 128, 255]),
    ('6.3.2', [0, 255, 128]),
    ('6.a.1', [255, 128, 0]),
    ('6.5.1', [255, 0, 128]),
])

# Ring ids start right after the highest country node id;
# tag ids start 100000 above that first ring id
ring_id = 1 + normalized_df["node_id"].max()
np_tag_id_counter = 100000 + ring_id


In [24]:
# === FUNCTIONS ===

def create_country_node_tag(node_id, lat, long, z, colors, title, description):
    """Append one country pin to ``node_df`` and its label to ``tag_df``.

    The pin starts as a copy of row 0 of ``node_template`` and the label as a
    copy of row 0 of ``tag_template``. The global ``np_tag_id_counter`` is
    incremented and the new value ties the pin to its tag.

    Args:
        node_id: stored as the pin's np_node_id and as record_id of pin and tag.
        lat: written to translate_y.
        long: written to translate_x.
        z: written to translate_z.
        colors: indexable; items 0, 1 and 2 become color_r, color_g, color_b.
        title: tag title.
        description: tag description.
    """
    global node_df, tag_df, np_tag_id_counter

    np_tag_id_counter += 1
    node_row = node_template.iloc[0].copy()
    tag_row = tag_template.iloc[0].copy()

    # Identity fields of the pin
    node_row["np_node_id"] = node_id
    node_row["np_tag_id"] = np_tag_id_counter
    node_row["record_id"] = node_id

    # Placement, geometry and colour of the pin
    node_overrides = {
        'np_table_id': 1, 'parent_id': 0, 'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5,
        'translate_x': long, 'translate_y': lat, 'translate_z': z, 'np_geometry_id': 19,
        'np_topo_id': 6, 'np_color_id': 1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    # Matching tag record
    tag_row["np_tag_id"] = np_tag_id_counter
    tag_row["record_id"] = node_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': description}
    tag_row.update(tag_overrides)

    new_node = pd.DataFrame([node_row])
    new_tag = pd.DataFrame([tag_row])
    node_df = pd.concat([node_df, new_node], ignore_index=True)
    tag_df = pd.concat([tag_df, new_tag], ignore_index=True)

def create_year_node_tag(ring_location, parent_id, ring_id, title):
    """Append one year-ring node to ``node_df`` and its label to ``tag_df``.

    The node starts as a copy of row 1 of ``node_template``, the label as a
    copy of row 0 of ``tag_template``. The global ``np_tag_id_counter`` is
    incremented and the new value links node and tag.

    Args:
        ring_location: written to translate_x.
        parent_id: written to parent_id.
        ring_id: stored as np_node_id and as record_id of node and tag.
        title: tag title; the tag description is always 'Year Ring'.
    """
    global node_df, tag_df, np_tag_id_counter

    np_tag_id_counter += 1
    node_row = node_template.iloc[1].copy()
    tag_row = tag_template.iloc[0].copy()

    # Identity fields of the ring
    node_row["np_node_id"] = ring_id
    node_row["np_tag_id"] = np_tag_id_counter
    node_row["record_id"] = ring_id

    # Hierarchy, placement, scale and fixed colour of the ring
    node_overrides = {
        'parent_id': parent_id, 'branch_level': 2, 'translate_x': ring_location,
        'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5, 'np_table_id': 1, 'np_color_id': 20,
        'color_r': 55, 'color_g': 190, 'color_b': 190
    }
    node_row.update(node_overrides)

    # Matching tag record
    tag_row["np_tag_id"] = np_tag_id_counter
    tag_row["record_id"] = ring_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': 'Year Ring'}
    tag_row.update(tag_overrides)

    new_node = pd.DataFrame([node_row])
    new_tag = pd.DataFrame([tag_row])
    node_df = pd.concat([node_df, new_node], ignore_index=True)
    tag_df = pd.concat([tag_df, new_tag], ignore_index=True)

def create_petal_rings(petal_id, parent_id, location, title, colors):
    """Append one petal node to ``node_df`` and its label to ``tag_df``.

    The node starts as a copy of row 1 of ``node_template``, the label as a
    copy of row 0 of ``tag_template``. The global ``np_tag_id_counter`` is
    incremented and the new value links node and tag.

    Args:
        petal_id: stored as np_node_id, np_data_id and as record_id of node and tag.
        parent_id: written to parent_id.
        location: written to translate_x.
        title: tag title; the tag description is always 'SDG'.
        colors: indexable; items 0, 1 and 2 become color_r, color_g, color_b.
    """
    global node_df, tag_df, np_tag_id_counter

    np_tag_id_counter += 1
    node_row = node_template.iloc[1].copy()
    tag_row = tag_template.iloc[0].copy()

    # Identity fields of the petal
    node_row["np_node_id"] = petal_id
    node_row["np_tag_id"] = np_tag_id_counter
    node_row["record_id"] = petal_id
    node_row["np_data_id"] = petal_id

    # Hierarchy, placement, scale and colour of the petal
    node_overrides = {
        'parent_id': parent_id, 'branch_level': 3, 'translate_x': location,
        'scale_x': 1, 'scale_y': 1, 'scale_z': 1, 'np_table_id': 1, 'np_topo_id': 3,
        'ratio': 0.1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    # Matching tag record
    tag_row["np_tag_id"] = np_tag_id_counter
    tag_row["record_id"] = petal_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': 'SDG'}
    tag_row.update(tag_overrides)

    new_node = pd.DataFrame([node_row])
    new_tag = pd.DataFrame([tag_row])
    node_df = pd.concat([node_df, new_node], ignore_index=True)
    tag_df = pd.concat([tag_df, new_tag], ignore_index=True)

def link_nodes(link_id, parent_id, child_id, title, colors, ratio):
    """Append one link node to ``node_df`` and its label to ``tag_df``.

    The node starts as a copy of row 0 of ``node_template``, the label as a
    copy of row 0 of ``tag_template``. The global ``np_tag_id_counter`` is
    incremented and the new value links node and tag.

    Args:
        link_id: stored as np_node_id and as record_id of node and tag.
        parent_id: written to parent_id.
        child_id: written to child_id.
        title: used for both the tag title and the tag description.
        colors: indexable; items 0, 1 and 2 become color_r, color_g, color_b.
        ratio: written to ratio.
    """
    global node_df, tag_df, np_tag_id_counter

    np_tag_id_counter += 1
    node_row = node_template.iloc[0].copy()
    tag_row = tag_template.iloc[0].copy()

    # Identity fields of the link
    node_row["np_node_id"] = link_id
    node_row["np_tag_id"] = np_tag_id_counter
    node_row["record_id"] = link_id

    # Endpoints, node type, geometry and colour of the link
    node_overrides = {
        'np_table_id': 1, 'np_geometry_id': 3, 'np_topo_id': 6, 'np_color_id': 20,
        'ratio': ratio, 'parent_id': parent_id, 'child_id': child_id, 'type': 7,
        'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    }
    node_row.update(node_overrides)

    # Matching tag record
    tag_row["np_tag_id"] = np_tag_id_counter
    tag_row["record_id"] = link_id
    tag_overrides = {'table_id': 1, 'title': title, 'description': title}
    tag_row.update(tag_overrides)

    new_node = pd.DataFrame([node_row])
    new_tag = pd.DataFrame([tag_row])
    node_df = pd.concat([node_df, new_node], ignore_index=True)
    tag_df = pd.concat([tag_df, new_tag], ignore_index=True)


In [26]:
# === GENERATION ===

donor_year_dict = {}
recipient_year_dict = {}
donor_countries_set = set()
recipient_countries_set = set()

# Everything that differs between the two handled roles, keyed by the Role value:
# (countries already pinned, year-ring registry, pin colour, pin description, ring-title suffix)
role_settings = {
    "Donor": (donor_countries_set, donor_year_dict, [0, 225, 0], "Donor Country", "(Donor)"),
    "Recipient": (recipient_countries_set, recipient_year_dict, [225, 0, 0], "Recipient Country", "(Recipient)"),
}

for i, row in normalized_df.iterrows():
    country = row["Country"]
    role = row["Role"]
    interval = row["year_interval"]
    node_id = row["node_id"]
    lat = row["latitude"]
    lon = row["longitude"]
    z = row["z_layer"]
    indicator = row["Indicator"]
    ring_location = row["ring_location"]

    # Rows with any other role create no nodes
    if role not in role_settings:
        continue
    pinned, year_registry, pin_color, pin_description, ring_suffix = role_settings[role]

    # One pin per country and role, created the first time the pair is seen.
    # NOTE(review): node_id is per country, so a country that appears in both
    # roles gets two pins with the same np_node_id - confirm this is intended.
    if country not in pinned:
        create_country_node_tag(node_id, lat, lon, z, pin_color, country, pin_description)
        pinned.add(country)

    # One year ring per country, role and interval
    rings_for_country = year_registry.setdefault(country, {})
    if interval in rings_for_country:
        continue

    create_year_node_tag(ring_location, node_id, ring_id, f"{interval} {ring_suffix}")
    petals = rings_for_country.setdefault(interval, [])

    # One petal per indicator the country has in this interval (rows of every
    # role are considered). Petal ids follow the ring id; petals sit 30 apart
    # starting at -15.
    group_df = normalized_df[
        (normalized_df["Country"] == country) &
        (normalized_df["year_interval"] == interval)
    ]
    petal_id = ring_id
    for slot, indicator_value in enumerate(group_df["Indicator"].unique(), start=1):
        petal_id = ring_id + slot
        location = -45 + 30 * slot
        color = sdg_color_map.get(indicator_value, [0, 0, 0])
        create_petal_rings(petal_id, ring_id, location, indicator_value, color)
        petals.append({indicator_value: petal_id})

    # The next ring id comes right after the last petal id used here
    ring_id = petal_id + 1


In [None]:
# === LINK GENERATION ===

link_id = ring_id + 1  # Start link IDs after last petal IDs

# Node ids that exist in node_df, collected once so every membership test
# below is a set lookup instead of a scan over the column
valid_ids = set(node_df["np_node_id"])

# Build fresh donor and recipient dictionaries:
# country -> interval -> [{indicator: country node_id}, ...]
donor_year_dict = {}
recipient_year_dict = {}
registry_by_role = {"Donor": donor_year_dict, "Recipient": recipient_year_dict}

for _, row in normalized_df.iterrows():
    country = row["Country"]
    role = row["Role"]
    interval = row["year_interval"]
    indicator = row["Indicator"]
    node_id = row["node_id"]

    # Only donor and recipient rows take part in links. Any other role
    # (e.g. "Other") is skipped instead of being filed as a recipient.
    target_dict = registry_by_role.get(role)
    if target_dict is None:
        continue

    # Skip if node_id is not actually generated
    if node_id not in valid_ids:
        continue

    # Each interval spans several yearly rows per indicator; register the
    # (indicator, node) pair once so the same link is not emitted repeatedly.
    entries = target_dict.setdefault(country, {}).setdefault(interval, [])
    entry = {indicator: node_id}
    if entry not in entries:
        entries.append(entry)

# Create links without any limit
# NOTE(review): every row with the same interval and indicator re-emits the
# full donor x recipient link set, each time with that row's normalized_value
# as ratio - confirm this fan-out is intended.
for _, row in normalized_df.iterrows():
    interval = row["year_interval"]
    indicator = row["Indicator"]
    value = row["normalized_value"]

    # Safely skip unknown indicators
    if indicator not in sdg_color_map:
        continue  # Skip this row if no color defined

    color = sdg_color_map[indicator]  # Safe strict color fetching

    for d_country, d_data in donor_year_dict.items():
        if interval not in d_data:
            continue
        for d_entry in d_data[interval]:
            d_id = d_entry.get(indicator)
            if not d_id or d_id not in valid_ids:
                continue
            for r_country, r_data in recipient_year_dict.items():
                if interval not in r_data:
                    continue
                for r_entry in r_data[interval]:
                    r_id = r_entry.get(indicator)
                    if not r_id or r_id not in valid_ids:
                        continue
                    link_nodes(
                        link_id,
                        parent_id=d_id,
                        child_id=r_id,
                        title=f"{indicator} | {d_country} → {r_country}",
                        colors=color,
                        ratio=value
                    )
                    link_id += 1


In [None]:
# === FINAL CLEANUP ===

# Keep only rows that carry a node id / tag id
node_df = node_df.dropna(subset=["np_node_id"]).copy()
tag_df = tag_df.dropna(subset=["np_tag_id"]).copy()

# Node columns that must be stored as integers
node_ints = [
    'np_node_id', 'type', 'np_data_id', 'selected', 'parent_id', 'branch_level', 'child_id', 'np_tag_id',
    'np_palette_id', 'np_ch_in_id', 'np_ch_out_id', 'ch_sync_time', 'np_palette_id_alt', 'np_color_id_alt',
    'np_material_id', 'np_geometry_id', 'np_color_id', 'color_fade', 'np_texture_id', 'hide', 'freeze',
    'np_topo_id', 'subspace', 'trigger_hi_x', 'trigger_hi_y', 'trigger_hi_z', 'trigger_lo_x', 'trigger_lo_y',
    'trigger_lo_z', 'proximity_x', 'proximity_y', 'proximity_z', 'proximity_mode_x', 'proximity_mode_y',
    'proximity_mode_z', 'segments_x', 'segments_y', 'segments_z', 'tag_mode', 'np_format_id', 'np_table_id', 'size'
]

# Cast every listed column to int in one pass
node_df = node_df.astype({column: int for column in node_ints})

# The record id of nodes is stored as int64; tag ids and tag record ids as int
node_df['record_id'] = node_df['record_id'].astype('int64')
tag_df = tag_df.astype({"np_tag_id": int, "record_id": int})

In [None]:
# Write the node and tag tables as UTF-8 CSV with CRLF line endings (files are overwritten)
for frame, target in (
    (node_df, r"C:\Users\TARUN\Desktop\np_node_SDG-6_EU_US_china.csv"),
    (tag_df, r"C:\Users\TARUN\Desktop\np_tag_SDG-6_EU_US_china.csv"),
):
    frame.to_csv(target, mode='w', index=False, encoding='utf-8', lineterminator='\r\n')