In [13]:
import requests
import zipfile
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json

In [14]:
# Absolute location of the input CSV; its parent folder is kept in
# `directory_path` for later cells.
dataset_file_path = r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\final_dataset_sdg61617_with_roles.csv"
directory_path = os.path.dirname(dataset_file_path)

# First row is the header; low_memory=False reads the file in one pass.
df = pd.read_csv(
    dataset_file_path,
    header=0,
    low_memory=False,
    encoding='utf-8',
)
df = df.reset_index(drop=True)

print(directory_path)
print("\nMissing values per column:\n", df.isna().sum())

C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files

Missing values per column:
 Country           0
Indicator         0
Indicator_Type    0
SeriesCode        0
Units             0
2000              0
2001              0
2002              0
2003              0
2004              0
2005              0
2006              0
2007              0
2008              0
2009              0
2010              0
2011              0
2012              0
2013              0
2014              0
2015              0
2016              0
2017              0
2018              0
2019              0
2020              0
2021              0
2022              0
2023              0
Role              0
dtype: int64


In [15]:
# Map every distinct country / indicator to an integer code, numbered in order
# of first appearance in the frame.
country_code_map = {country: idx for idx, country in enumerate(df['Country'].unique())}
indicator_code_map = {indicator: idx for idx, indicator in enumerate(df['Indicator'].unique())}

# Add those codes to the dataframe.
# Note: the geo-encoding cell further down drops this numeric `country_code`
# and replaces it with the ISO alpha-2 code; `indicator_code` is kept.
df['country_code'] = df['Country'].map(country_code_map)
df['indicator_code'] = df['Indicator'].map(indicator_code_map)

In [16]:
# Report the frame's shape, drop rows that are identical across every column
# (yearly values included, first occurrence kept), then report the shape again.
print("Original shape: ", df.shape, sep="\n")

df = df.drop_duplicates(keep='first')

print("New shape after dropping duplicates: ", df.shape, sep="\n")

Original shape: 
(1649, 32)
New shape after dropping duplicates: 
(1649, 32)


### Geo Encoding

In [17]:
import pandas as pd
import pycountry
from geopy.geocoders import ArcGIS
from geopy.extra.rate_limiter import RateLimiter

In [18]:
# Step 2: Define Helper Functions
# --------------------------------------------
def remove_third_element(point):
    """Return the first two items of `point` when it is a tuple.

    Any non-tuple value (including None and lists) is returned unchanged.
    """
    return point[:2] if isinstance(point, tuple) else point

def get_code(name):
    """Return the `alpha_2` code pycountry reports for `name`.

    Returns None when the lookup raises LookupError (no matching country).
    """
    code = None
    try:
        code = pycountry.countries.lookup(name).alpha_2
    except LookupError:
        pass
    return code

# --------------------------------------------
# Step 3: Setup Geocoder
# --------------------------------------------
# ArcGIS geocoder client; its `geocode` method is wrapped below.
geolocator = ArcGIS(user_agent="DAEN-690-Capstone")
# Rate-limited entry point used by get_location: geopy's RateLimiter is
# configured with min_delay_seconds=1 between calls.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def get_location(row, country_col, code_col):
    """Geocode one row through the rate-limited `geocode` callable.

    The query is "<name>, <code>" when `row[code_col]` is truthy and just the
    country name otherwise. Returns whatever `geocode` returns.
    """
    name, code = row[country_col], row[code_col]
    query = f"{name}, {code}" if code else name
    return geocode(query)

# --------------------------------------------
# Step 4: Geocode Unique Countries
# --------------------------------------------
# One row per distinct country, so each country is geocoded exactly once.
unique_countries = pd.DataFrame({'Country': df['Country'].unique()})

# ISO alpha-2 code (None when pycountry has no match for the name).
unique_countries['country_code'] = unique_countries['Country'].apply(get_code)

# Geocoder result per country; falsy results are handled on the next lines.
unique_countries['location'] = unique_countries.apply(
    lambda record: get_location(record, 'Country', 'country_code'),
    axis=1,
)

# (latitude, longitude) pair, or None when there is no geocoder result.
unique_countries['point'] = unique_countries['location'].apply(
    lambda place: (place.latitude, place.longitude) if place else None
)
unique_countries['point'] = unique_countries['point'].apply(remove_third_element)
unique_countries['latitude'] = unique_countries['point'].apply(
    lambda pair: pair[0] if pair else None
)
unique_countries['longitude'] = unique_countries['point'].apply(
    lambda pair: pair[1] if pair else None
)

# --------------------------------------------
# Step 5: Drop Existing Geo Columns if Present
# --------------------------------------------
# errors='ignore' lets this run whether or not the columns exist yet; it also
# removes the numeric `country_code` created in an earlier cell.
df = df.drop(
    columns=['country_code', 'latitude', 'longitude'],
    errors='ignore',
)

# --------------------------------------------
# Step 6: Merge Geocoded Data Back to Main DataFrame
# --------------------------------------------
df = df.merge(
    unique_countries[
        ['Country', 'location', 'country_code', 'latitude', 'longitude', 'point']
    ],
    on='Country',
    how='left',
)

# --------------------------------------------
# Step 7: Preview
# --------------------------------------------
print(
    df[['Country', 'country_code', 'latitude', 'longitude']]
    .drop_duplicates()
    .head()
)
df.head()

          Country country_code   latitude   longitude
0     Afghanistan           AF  33.831137   66.024712
1         Albania           AL  41.134553   20.064206
2         Algeria           DZ  28.144114    2.679966
3  American Samoa           AS -14.300688 -170.718116
4         Andorra           AD  42.545303    1.576286


Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,2000,2001,2002,2003,2004,...,2021,2022,2023,Role,indicator_code,location,country_code,latitude,longitude,point
0,Afghanistan,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,0.0,...,4.02,0.0,0.0,Not Assigned,0,"(Afghanistan, (33.831137065, 66.024711797))",AF,33.831137,66.024712,"(33.831137065, 66.024711797)"
1,Albania,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,12.41,21.01,20.83,16.17,12.87,...,6.94,4.96,0.0,Not Assigned,0,"(Albania, (41.134553284, 20.064206431))",AL,41.134553,20.064206,"(41.134553284, 20.064206431)"
2,Algeria,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,0.0,...,4.69,5.27,0.0,Not Assigned,0,"(Algeria, (28.144113769, 2.679965933))",DZ,28.144114,2.679966,"(28.144113769, 2.679965933)"
3,American Samoa,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,1.71,12.03,5.18,6.94,...,0.0,0.0,0.0,Not Assigned,0,"(American Samoa, (-14.30068806, -170.718116122))",AS,-14.300688,-170.718116,"(-14.30068806, -170.718116122)"
4,Andorra,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,0.0,0.0,0.0,0.0,1.3,...,0.0,0.0,0.0,Not Assigned,0,"(Andorra, (42.545303201, 1.576286302))",AD,42.545303,1.576286,"(42.545303201, 1.576286302)"


In [20]:
import os

# Step 1: Choose directory to save in
# Output folder for the geocoded export (also reused by later cells).
directory_path = "C:/Users/rohit/OneDrive/Documents/DAEN690/Final_Code-Files/"
geo_path = os.path.join(directory_path, 'GeoLocationInfo.csv')

# Write the geoencoded frame without the pandas index column.
df.to_csv(
    geo_path,
    index=False,
    encoding='utf-8',
)

print(f"GeoLocationInfo.csv saved for the NEW dataset at:\n{geo_path}")

GeoLocationInfo.csv saved for the NEW dataset at:
C:/Users/rohit/OneDrive/Documents/DAEN690/Final_Code-Files/GeoLocationInfo.csv


In [29]:
import pandas as pd

# Load the previously saved full GeoLocationInfo file
# Step 1: Reload the export, treating only empty cells as missing.
# pandas' default NA markers include the literal "NA", which is also Namibia's
# ISO alpha-2 code; with the defaults that code is read as NaN and written back
# out as an empty cell.
geo_df = pd.read_csv(
    r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo.csv",  # Adjust path if needed
    keep_default_na=False,
    na_values=[''],
)

# Step 2: Specify the columns to retain (this is also the output column order)
columns_to_keep = [
    'Country', 'Indicator', 'Indicator_Type', 'SeriesCode', 'Units',
    '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
    '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
    '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
    'country_code', 'indicator_code', 'location', 'point',
    'latitude', 'longitude', 'Role'
]

# Step 3: Keep only those columns (raises KeyError if any is absent)
selected_geo_df = geo_df[columns_to_keep]

# Step 4: Save the final cleaned file
selected_geo_df.to_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_Final.csv", index=False, encoding='utf-8')

# Step 5: Display the result
print(selected_geo_df.head())


          Country Indicator Indicator_Type   SeriesCode           Units  \
0     Afghanistan    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
1         Albania    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
2         Algeria    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
3  American Samoa    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   
4         Andorra    16.1.1       Homicide  VC_IHR_PSRC  PER_100000_POP   

    2000   2001   2002   2003   2004  ...  2021  2022  2023  country_code  \
0   0.00   0.00   0.00   0.00   0.00  ...  4.02  0.00   0.0            AF   
1  12.41  21.01  20.83  16.17  12.87  ...  6.94  4.96   0.0            AL   
2   0.00   0.00   0.00   0.00   0.00  ...  4.69  5.27   0.0            DZ   
3   0.00   1.71  12.03   5.18   6.94  ...  0.00  0.00   0.0            AS   
4   0.00   0.00   0.00   0.00   1.30  ...  0.00  0.00   0.0            AD   

   indicator_code        location                           point   latitude  \
0     

In [30]:
import pandas as pd
import pycountry

# Step 1: Load the dataset
# Reload the column-filtered export written by the previous cell.
df = pd.read_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_Final.csv")  # adjust path if needed

# Step 2: Create a correction mapping for problematic country names
# Keys are the country names as they appear in the dataset. Values are either
# a name that is passed to pycountry or, when exactly two characters long, a
# code that safe_country_code returns as-is.
# NOTE(review): several distinct dataset entries collapse onto one code here
# (e.g. both China SAR rows -> "China", the three UK rows -> "United Kingdom");
# confirm that is intended.
country_name_fixes = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "China, Hong Kong Special Administrative Region": "China",
    "China, Macao Special Administrative Region": "China",
    "Iran (Islamic Republic of)": "Iran",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "Republic of Korea": "South Korea",
    "Republic of Moldova": "Moldova",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "United Republic of Tanzania": "Tanzania",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam",
    "Kosovo": "XK",  # Kosovo isn't in pycountry; assign known ISO-like code
    "Holy See": "VA",  # Vatican
    "Iraq (Central Iraq)": "Iraq",
    "Iraq (Kurdistan Region)": "Iraq",
    "Netherlands (Kingdom of the)": "Netherlands",
    "State of Palestine": "PS",
    "Saint Helena": "SH",  # Not always in pycountry; ISO code is SH
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "Channel Islands": "Jersey",  # or use direct code: 'JE'
    "Wallis and Futuna Islands": "Wallis and Futuna",
    # "OTH" is three characters, so safe_country_code does not return it
    # directly; it is passed to the pycountry lookup instead.
    "Other non-specified areas in Eastern Asia": "OTH"  # No official ISO code
    # Add more mappings as needed
}

# Step 3: Define a safe lookup function with fallback
def safe_country_code(name):
    """Resolve a dataset country name to a two-letter code, or None.

    A name listed in `country_name_fixes` is first replaced by its mapped
    value; a mapped value of exactly two characters is returned as-is.
    Everything else is looked up with pycountry, and a failed lookup
    (LookupError) yields None.
    """
    lookup_key = name
    if name in country_name_fixes:
        lookup_key = country_name_fixes[name]
        if len(lookup_key) == 2:  # Already an ISO code (e.g., XK, VA)
            return lookup_key
    try:
        return pycountry.countries.lookup(lookup_key).alpha_2
    except LookupError:
        return None

# Step 4: Fill in missing country_code values
# Row-wise fill: keep an existing code, otherwise fall back to
# safe_country_code(Country). Rows that still cannot be resolved hold None.
df['country_code'] = df.apply(
    lambda row: row['country_code'] if pd.notna(row['country_code']) else safe_country_code(row['Country']),
    axis=1
)


# Step 5: Save updated DataFrame
df.to_csv(r"C:/Users/rohit/OneDrive/Documents/DAEN690/Final_Code-Files/GeoLocationInfo_Final_Fixed.csv", index=False, encoding='utf-8')

# Optional: Preview the rows whose country_code is STILL missing after the fill
print(df[df['country_code'].isna()][['Country', 'country_code']])


                                        Country country_code
133   Other non-specified areas in Eastern Asia         None
335   Other non-specified areas in Eastern Asia         None
523   Other non-specified areas in Eastern Asia         None
888              Netherlands Antilles  [former]         None
897   Other non-specified areas in Eastern Asia         None
...                                         ...          ...
1615            Southern Asia (excluding India)         None
1616                            Southern Europe         None
1619                         Sub-Saharan Africa         None
1644                             Western Africa         None
1645                               Western Asia         None

[110 rows x 2 columns]


In [32]:
import pandas as pd
import pycountry

# Load the updated file
# Reload the output of the previous fix-up cell.
df = pd.read_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_Final_Fixed.csv")  # Adjust path if needed

# Manual overrides for known missing ISO entries
# This rebinds `country_name_fixes`, replacing the larger mapping defined in
# the previous cell. Values are pycountry lookup names, or two-character codes
# that safe_country_code returns unchanged.
country_name_fixes = {
    "Democratic Republic of the Congo": "Congo, The Democratic Republic of the",
    "United States Virgin Islands": "Virgin Islands, U.S.",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "Channel Islands": "Jersey",
    "Netherlands (Kingdom of the)": "Netherlands",
    "Kosovo": "XK",
    "Holy See": "VA",
    "Micronesia (Federated States of)": "Federated States of Micronesia",
    "State of Palestine": "Palestine",
    "United Kingdom (England and Wales)": "United Kingdom",
    "United Kingdom (Northern Ireland)": "United Kingdom",
    "United Kingdom (Scotland)": "United Kingdom"
}

# Codes returned directly, without any pycountry lookup. safe_country_code
# checks this mapping before `country_name_fixes`, so names present in both
# are resolved from here.
manual_code_override = {
    "State of Palestine": "PS",
    "Channel Islands": "JE",
    "Wallis and Futuna Islands": "WF",
    "United States Virgin Islands": "VI",
    "Kosovo": "XK",
    "Holy See": "VA"
}

# Safe lookup function with fallbacks
def safe_country_code(name):
    """Resolve a dataset country name to a two-letter code, or None.

    Resolution order: an entry in `manual_code_override` wins outright; then a
    `country_name_fixes` entry replaces the name (and is returned directly
    when it is exactly two characters long); finally the resulting name is
    looked up with pycountry, with a failed lookup (LookupError) giving None.
    """
    if name in manual_code_override:
        return manual_code_override[name]

    candidate = name
    if name in country_name_fixes:
        candidate = country_name_fixes[name]
        if len(candidate) == 2:
            return candidate

    try:
        return pycountry.countries.lookup(candidate).alpha_2
    except LookupError:
        return None

# Apply corrections only where country_code is missing
# Row-wise fill: keep an existing code, otherwise fall back to
# safe_country_code(Country), which may still return None.
df['country_code'] = df.apply(
    lambda row: row['country_code'] if pd.notna(row['country_code']) else safe_country_code(row['Country']),
    axis=1
)

# Tag remaining null values with "REGION"
# (every row that no override, name fix or pycountry lookup could resolve)
df['country_code'] = df['country_code'].fillna('REGION')

# Save final corrected version
df.to_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_Final_Tagged.csv", index=False, encoding='utf-8')

# Preview the first few rows
print(df[['Country', 'country_code']].head(10))




               Country country_code
0          Afghanistan           AF
1              Albania           AL
2              Algeria           DZ
3       American Samoa           AS
4              Andorra           AD
5               Angola           AO
6             Anguilla           AI
7  Antigua and Barbuda           AG
8            Argentina           AR
9              Armenia           AM


In [49]:
import pandas as pd
import os

# STEP 1: Load the already geoencoded dataset
# Treat only empty cells as missing. pandas' default NA markers include the
# literal "NA" (Namibia's ISO alpha-2 code), which would otherwise load as NaN
# and then be overwritten with 0 by the fill further down.
geo_df = pd.read_csv(
    r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_Final_Tagged.csv",
    keep_default_na=False,
    na_values=[''],
)

# STEP 2: Ensure consistent column naming
# If not already present, create a simplified numeric "country_code" column
# (1-based, in order of first appearance) for joining.
if 'country_code' not in geo_df.columns:
    geo_df['country_code'] = pd.factorize(geo_df['Country'])[0] + 1


def _role_coordinates(frame, role, prefix):
    """Return one row per country whose `Role` equals `role`.

    The result has the columns `Country`, `<prefix>_lat` and `<prefix>_long`,
    taken from the first row of each matching country.
    """
    return (
        frame[frame['Role'] == role]
        .drop_duplicates(subset='Country')[['Country', 'latitude', 'longitude']]
        .rename(columns={
            'latitude': f'{prefix}_lat',
            'longitude': f'{prefix}_long',
        })
    )


# STEP 3 + 4: Separate torchbearers and beneficiaries, with distinct column names
torchbearers_df = _role_coordinates(geo_df, 'Torchbearer', 'torchbearer')
beneficiaries_df = _role_coordinates(geo_df, 'Beneficiary', 'beneficiary')

# STEP 5: Merge coordinates back into the main dataframe
# STEP 6: Drop duplicate rows if any were introduced
# Remaining nulls (e.g. the role coordinates of countries that do not hold that
# role) are then replaced with 0.
# NOTE(review): 0 is also a valid latitude/longitude; confirm downstream code
# treats it as "missing".
merged_df = (
    geo_df
    .merge(torchbearers_df, on='Country', how='left')
    .merge(beneficiaries_df, on='Country', how='left')
    .drop_duplicates()
    .fillna(0)
)

# STEP 7: Save the result
save_path = r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\GeoLocationInfo_SDG61617_Merged_With_Roles.csv"
merged_df.to_csv(save_path, index=False)

#print("✅ Done! Merged dataset with role-based coordinates saved to:", save_path)


In [None]:
# # Step 1: Split dataset by role
# torchbearers_df = df[df['Role'] == 'Torchbearer'].drop_duplicates(subset='Country')[['Country']].copy()
# beneficiaries_df = df[df['Role'] == 'Beneficiary'].drop_duplicates(subset='Country')[['Country']].copy()

# # Step 2: Add ISO 2-letter country codes
# torchbearers_df['Country Code'] = torchbearers_df['Country'].apply(get_code)
# beneficiaries_df['Country Code'] = beneficiaries_df['Country'].apply(get_code)

# # Step 3: Define get_location (reuse from previous chunk)
# # Already defined above

# # Step 4: Geocode torchbearers
# torchbearers_df['location'] = torchbearers_df.apply(get_location, axis=1)
# torchbearers_df['point'] = torchbearers_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# torchbearers_df['point'] = torchbearers_df['point'].apply(remove_third_element)
# torchbearers_df[['torchbearer_lat', 'torchbearer_long']] = pd.DataFrame(torchbearers_df['Point'].tolist(), index=torchbearers_df.index)

# # Step 5: Geocode beneficiaries
# beneficiaries_df['location'] = beneficiaries_df.apply(get_location, axis=1)
# beneficiaries_df['point'] = beneficiaries_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# beneficiaries_df['point'] = beneficiaries_df['point'].apply(remove_third_element)
# beneficiaries_df[['beneficiary_lat', 'beneficiary_long']] = pd.DataFrame(beneficiaries_df['Point'].tolist(), index=beneficiaries_df.index)

# # Step 6: Merge geocoded lat/long into main DataFrame
# merged_df = df.copy()
# merged_df = merged_df.merge(torchbearers_df[['Country', 'torchbearer_lat', 'torchbearer_long']], on='Country', how='left')
# merged_df = merged_df.merge(beneficiaries_df[['Country', 'beneficiary_lat', 'beneficiary_long']], on='Country', how='left')

# # Step 7: Drop temp columns
# if 'Location' in merged_df.columns:
#     merged_df = merged_df.drop(columns=['Location'])

# # Step 8: Final cleaning
# print("Before:", merged_df.shape)
# merged_df = merged_df.drop_duplicates()
# print("After:", merged_df.shape)

# merged_df.fillna(0)

# # Preview
# print(merged_df.head())

# # Step 9: Export to CSV
# directory_path = os.path.dirname(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\SDG16_Dataset_With_Roles.csv")
# geo_path = os.path.join(directory_path, 'GeoLocationInfo_SDG61617_Subset.csv')
# merged_df.to_csv(geo_path, index=False, encoding='utf-8')


TypeError: get_location() missing 2 required positional arguments: 'country_col' and 'code_col'

In [52]:
# Load the SDG-6,16 and 17 geolocation-enhanced dataset
# Relies on `directory_path` set in an earlier cell.
geo_path = os.path.join(directory_path, 'GeoLocationInfo_SDG61617_Merged_With_Roles.csv')

# Load and sort in one step so the row order is stable.
temp_df = (
    pd.read_csv(geo_path, encoding="utf-8")
    .sort_values(by=['Country', 'Role'], ascending=[True, True])
)

# Distinct country names per role.
torchbearer_countries_list = temp_df.loc[temp_df['Role'] == 'Torchbearer', 'Country'].unique()
beneficiary_countries_list = temp_df.loc[temp_df['Role'] == 'Beneficiary', 'Country'].unique()

# Print useful context
print("Number of torchbearer countries:", len(torchbearer_countries_list))
print("Number of beneficiary countries:", len(beneficiary_countries_list))

Number of torchbearer countries: 16
Number of beneficiary countries: 25


In [54]:
import pandas as pd

# Load the geoencoded SDG 6+16+17 dataset
# `geo_path` is set in the previous cell.
# Treat only empty cells as missing: pandas' default NA markers include the
# literal "NA" (Namibia's ISO alpha-2 code), which would otherwise load as NaN
# and be replaced by 0 in Step 2.
df = pd.read_csv(
    geo_path,
    encoding="utf-8",
    keep_default_na=False,
    na_values=[''],
)

# Step 1: Melt the year columns (2000–2023) to long format
# Each input row becomes one row per year; columns not listed in id_vars or
# value_vars are dropped by the melt.
year_columns = [str(year) for year in range(2000, 2024)]
df_melted = df.melt(
    id_vars=[
        "Country", "Indicator", "Indicator_Type", "SeriesCode", "Units",
        "Role", "country_code", "latitude", "longitude"
    ],
    value_vars=year_columns,
    var_name="year",
    value_name="Value"
)

# Step 2: Fill NaN with 0
df_melted = df_melted.fillna(0)

# Step 3: Convert year to int and create 6-year bins
df_melted["year"] = df_melted["year"].astype(int)
min_year = df_melted["year"].min()
max_year = df_melted["year"].max()

# Generate 6-year bin intervals as inclusive (start, end) pairs, beginning at
# the earliest year; the last bin may extend past max_year.
bin_ranges = []
current = min_year
while current <= max_year:
    end = current + 5
    bin_ranges.append((current, end))
    current += 6

def map_to_6yr_bin(year):
    """Return the "<start>-<end>" label of the first bin containing `year`.

    Bins come from the module-level `bin_ranges` list of inclusive
    (start, end) pairs; "Other" is returned when no bin matches.
    """
    return next(
        (f"{start}-{end}" for start, end in bin_ranges if start <= year <= end),
        "Other",
    )

df_melted["year_interval"] = df_melted["year"].apply(map_to_6yr_bin)

# Step 4: Normalize 'Value' column per Indicator_Type
# Min-max scale each indicator type's values onto [0.01, 2]; a type whose
# values are all equal gets a flat 1.0.
df_melted["normalized_value"] = 0.0
for indicator in df_melted["Indicator_Type"].unique():
    mask = df_melted["Indicator_Type"] == indicator
    group_values = df_melted.loc[mask, "Value"]
    min_val = group_values.min()
    max_val = group_values.max()
    if max_val != min_val:
        scaled = group_values.apply(
            lambda x: 0.01 + ((x - min_val) / (max_val - min_val) * (2 - 0.01))
        )
    else:
        scaled = group_values.apply(lambda x: 1.0)
    df_melted.loc[mask, "normalized_value"] = scaled

# Step 5: Assign ring_location based on year_interval
def assign_ring_locations(df_input):
    """Add a `ring_location` column derived from `year_interval`.

    The distinct intervals are sorted and numbered from 0; interval number i
    gets the value -135 + 35 * i. The column is written onto `df_input`
    itself, and that same frame is returned.
    """
    ring_location_by_interval = {}
    for position, interval in enumerate(sorted(df_input["year_interval"].unique())):
        ring_location_by_interval[interval] = -135 + position * 35

    df_input["ring_location"] = df_input["year_interval"].map(ring_location_by_interval)
    return df_input

# Adds the `ring_location` column; the helper writes onto the frame it is
# given and returns that same frame.
df_melted = assign_ring_locations(df_melted)

# Step 6: Assign node_id based on (Country + Indicator_Type)
# (disabled — a later cell assigns node_id per Country instead)
# df_melted["node_key"] = df_melted["Country"] + "_" + df_melted["Indicator_Type"]
# node_id_map = {key: idx for idx, key in enumerate(df_melted["node_key"].unique())}
# df_melted["node_id"] = df_melted["node_key"].map(node_id_map)

# Step 7: Save the final normalized file
# Relies on `directory_path` set in an earlier cell; written without the index.
norm_path = os.path.join(directory_path, 'Normalized_SDG61617_Data_Geo_Final.csv')
df_melted.to_csv(norm_path, index=False, encoding='utf-8')

In [55]:
import pandas as pd
import time

# Load inputs
# Normalized long-format data. Only empty cells are treated as missing so that
# the ISO code "NA" (Namibia) in `country_code` is not parsed as NaN by pandas'
# default NA markers.
normalized_df = pd.read_csv(
    r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\Normalized_SDG61617_Data_Geo_Final.csv",
    keep_default_na=False,
    na_values=[''],
)
# Template rows that the node/tag builder functions copy and fill in.
node_template = pd.read_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\np_node-template.csv")
tag_template = pd.read_csv(r"C:\Users\rohit\OneDrive\Documents\DAEN690\Final_Code-Files\np_tag-template.csv")

In [56]:
# Init node/tag dataframes
# Empty accumulators with the same columns as the templates; the
# create_*/link_nodes functions append one row to each per call.
node_df = pd.DataFrame(columns=node_template.columns)
tag_df = pd.DataFrame(columns=tag_template.columns)

In [57]:
# Assign IDs and Z-layer
# Number the countries 1..N in order of first appearance and attach that
# number to every row as `node_id`.
country_node_map = {
    country: position
    for position, country in enumerate(normalized_df["Country"].unique(), start=1)
}
normalized_df["node_id"] = normalized_df["Country"].map(country_node_map)
def z_layer(role):
    """Return the layer value for a role.

    "Torchbearer" -> 20, "Beneficiary" -> 0, anything else -> 10.
    """
    if role == "Torchbearer":
        return 20
    if role == "Beneficiary":
        return 0
    return 10
# Per-row layer value derived from Role; the generation loop reads it as `z`.
normalized_df["z_layer"] = normalized_df["Role"].apply(z_layer)

In [58]:
normalized_df

Unnamed: 0,Country,Indicator,Indicator_Type,SeriesCode,Units,Role,country_code,latitude,longitude,year,Value,year_interval,normalized_value,ring_location,node_id,z_layer
0,Afghanistan,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AF,33.831137,66.024712,2000,0.00,2000-2005,0.010000,-135,1,10
1,Albania,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AL,41.134553,20.064206,2000,12.41,2000-2005,0.085107,-135,2,10
2,Algeria,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,DZ,28.144114,2.679966,2000,0.00,2000-2005,0.010000,-135,3,10
3,American Samoa,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AS,-14.300688,-170.718116,2000,0.00,2000-2005,0.010000,-135,4,10
4,Andorra,16.1.1,Homicide,VC_IHR_PSRC,PER_100000_POP,Not Assigned,AD,42.545303,1.576286,2000,0.00,2000-2005,0.010000,-135,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39571,Western Africa,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Not Assigned,REGION,18.312810,-4.833980,2023,0.00,2018-2023,0.010000,-30,267,10
39572,Western Asia,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Not Assigned,REGION,25.839450,43.242190,2023,0.00,2018-2023,0.010000,-30,268,10
39573,Yemen,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Not Assigned,YE,15.905206,47.593953,2023,0.00,2018-2023,0.010000,-30,203,10
39574,Zambia,6.a.1,Total official development assistance (gross d...,DC_TOF_WASHL,CON_USD_M,Not Assigned,ZM,-14.468804,28.767973,2023,0.00,2018-2023,0.010000,-30,204,10


In [None]:
# # Remove rows with missing node_id
# normalized_df = normalized_df[normalized_df["node_id"].notnull()].copy()

# # Remove any rows with missing country or year_interval or Indicator (needed for hierarchy)
# normalized_df = normalized_df[normalized_df[["Country", "year_interval", "Indicator"]].notnull().all(axis=1)].copy()

# # === FIX FOR MISSING PARENTS ===
# # Step 1: Remove any rows that are exact duplicates
# normalized_df = normalized_df.drop_duplicates()

# # Step 2: Get all valid (Country, year_interval) pairs — i.e., ring parents
# valid_rings = set(normalized_df.groupby(["Country", "year_interval"]).size().index)

# # Step 3: Drop rows that reference rings that don't exist (missing ring parents)
# normalized_df = normalized_df[normalized_df.apply(
#     lambda row: (row["Country"], row["year_interval"]) in valid_rings, axis=1
# )].copy()

# # Step 4: Get all valid (Country, year_interval, Indicator) — i.e., petal children
# valid_petals = set(normalized_df.groupby(["Country", "year_interval", "Indicator"]).size().index)

# # Step 5: Drop any rows that reference indicator-level nodes without valid parents
# normalized_df = normalized_df[normalized_df.apply(
#     lambda row: (row["Country"], row["year_interval"], row["Indicator"]) in valid_petals, axis=1
# )].copy()

# print(normalized_df)

              Country Indicator  \
0         Afghanistan    16.1.1   
1             Albania    16.1.1   
2             Algeria    16.1.1   
3      American Samoa    16.1.1   
4             Andorra    16.1.1   
...               ...       ...   
39571  Western Africa     6.a.1   
39572    Western Asia     6.a.1   
39573           Yemen     6.a.1   
39574          Zambia     6.a.1   
39575        Zimbabwe     6.a.1   

                                          Indicator_Type    SeriesCode  \
0                                               Homicide   VC_IHR_PSRC   
1                                               Homicide   VC_IHR_PSRC   
2                                               Homicide   VC_IHR_PSRC   
3                                               Homicide   VC_IHR_PSRC   
4                                               Homicide   VC_IHR_PSRC   
...                                                  ...           ...   
39571  Total official development assistance (gross d...  DC_

In [59]:
# SDG Color Map
# Indicator code -> three-integer colour list.
# NOTE(review): presumably [R, G, B]; '6.3.1' and '6.5.1' share the same
# colour — confirm that is intended.
sdg_color_map = {
    # SDG 6
    '6.3.1': [0, 255, 255],
    '6.3.2': [152, 0, 255],
    '6.5.1': [0, 255, 255],
    '6.a.1': [255, 0, 255],
    # SDG 16
    '16.1.1': [100, 0, 255],
    '16.3.2': [0, 100, 255],
    '16.5.1': [255, 100, 0],
    '16.6.1': [0, 255, 200],
    # SDG 17
    '17.3.1': [0, 152, 255],
}

# Ring ids start right after the largest country node id; tag ids are offset
# by a further 100000.
ring_id = normalized_df["node_id"].max() + 1
np_tag_id_counter = ring_id + 100000

In [60]:
# === FUNCTIONS ===

def create_country_node_tag(node_id, lat, long, z, colors, title, description):
    global node_df, tag_df, np_tag_id_counter
    pin = node_template.iloc[0].copy()
    tag = tag_template.iloc[0].copy()
    np_tag_id_counter += 1
    pin["np_node_id"] = node_id
    pin["np_tag_id"] = np_tag_id_counter
    pin["record_id"] = node_id
    pin.update({
        'np_table_id': 1, 'parent_id': 0, 'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5,
        'translate_x': long, 'translate_y': lat, 'translate_z': z, 'np_geometry_id': 19,
        'np_topo_id': 6, 'np_color_id': 1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    })
    tag["np_tag_id"] = np_tag_id_counter
    tag["record_id"] = node_id
    tag.update({'table_id': 1, 'title': title, 'description': description})
    node_df = pd.concat([node_df, pd.DataFrame([pin])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag])], ignore_index=True)

def create_year_node_tag(ring_location, parent_id, ring_id, title):
    global node_df, tag_df, np_tag_id_counter
    pin = node_template.iloc[1].copy()
    tag = tag_template.iloc[0].copy()
    np_tag_id_counter += 1
    pin["np_node_id"] = ring_id
    pin["np_tag_id"] = np_tag_id_counter
    pin["record_id"] = ring_id
    pin.update({
        'parent_id': parent_id, 'branch_level': 2, 'translate_x': ring_location,
        'scale_x': 0.5, 'scale_y': 0.5, 'scale_z': 0.5, 'np_table_id': 1, 'np_color_id': 20,
        'color_r': 55, 'color_g': 190, 'color_b': 190
    })
    tag["np_tag_id"] = np_tag_id_counter
    tag["record_id"] = ring_id
    tag.update({'table_id': 1, 'title': title, 'description': 'Year Ring'})
    node_df = pd.concat([node_df, pd.DataFrame([pin])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag])], ignore_index=True)

def create_petal_rings(petal_id, parent_id, location, title, colors):
    global node_df, tag_df, np_tag_id_counter
    pin = node_template.iloc[1].copy()
    tag = tag_template.iloc[0].copy()
    np_tag_id_counter += 1
    pin["np_node_id"] = petal_id
    pin["np_tag_id"] = np_tag_id_counter
    pin["record_id"] = petal_id
    pin["np_data_id"] = petal_id
    pin.update({
        'parent_id': parent_id, 'branch_level': 3, 'translate_x': location,
        'scale_x': 1, 'scale_y': 1, 'scale_z': 1, 'np_table_id': 1, 'np_topo_id': 3,
        'ratio': 0.1, 'color_r': colors[0], 'color_g': colors[1], 'color_b': colors[2]
    })
    tag["np_tag_id"] = np_tag_id_counter
    tag["record_id"] = petal_id
    tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
    node_df = pd.concat([node_df, pd.DataFrame([pin])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag])], ignore_index=True)

def link_nodes(link_id, parent_id, child_id, title, colors, ratio):
    """Append one link node row (``type`` 7) and its tag row to the global tables.

    Parameters
    ----------
    link_id : used as the node's ``np_node_id`` and as ``record_id`` of both rows.
    parent_id, child_id : stored in the node's ``parent_id`` / ``child_id``.
    title : used for both the tag title and the tag description.
    colors : indexable with positions 0, 1, 2, stored as ``color_r``,
        ``color_g`` and ``color_b``.
    ratio : value stored in the node's ``ratio``.

    Side effects
    ------------
    Increments ``np_tag_id_counter`` and rebinds the globals ``node_df`` and
    ``tag_df`` to new frames that contain one extra row each. Returns ``None``.
    """
    global node_df, tag_df, np_tag_id_counter

    # Links start from the FIRST node-template row (the other builders use the second).
    node_row = node_template.iloc[0].copy()
    tag_row = tag_template.iloc[0].copy()

    # Every node/tag pair gets a fresh tag id.
    np_tag_id_counter += 1

    node_row["np_node_id"] = link_id
    node_row["np_tag_id"] = np_tag_id_counter
    node_row["record_id"] = link_id
    # Series.update overwrites only labels that already exist in the template row.
    link_fields = {
        'np_table_id': 1,
        'np_geometry_id': 3,
        'np_topo_id': 6,
        'np_color_id': 20,
        'ratio': ratio,
        'parent_id': parent_id,
        'child_id': child_id,
        'type': 7,
        'color_r': colors[0],
        'color_g': colors[1],
        'color_b': colors[2],
    }
    node_row.update(link_fields)

    tag_row["np_tag_id"] = np_tag_id_counter
    tag_row["record_id"] = link_id
    tag_fields = {'table_id': 1, 'title': title, 'description': title}
    tag_row.update(tag_fields)

    # Each row is wrapped in a one-row frame and appended to its table.
    node_df = pd.concat([node_df, pd.DataFrame([node_row])], ignore_index=True)
    tag_df = pd.concat([tag_df, pd.DataFrame([tag_row])], ignore_index=True)


In [61]:
# === GENERATION ===
# Walks normalized_df once and emits, per role:
#   * one country node the first time a country is seen in that role,
#   * one year-ring node the first time a (country, interval) pair is seen,
#   * one petal node per Indicator group found for that country/interval.
# The donor and recipient paths are identical apart from the settings in
# ``role_settings`` below, so they share a single code path.
# NOTE(review): ``ring_id`` is read before it is assigned in this cell, so it must be
# initialised by an earlier cell; re-running this cell alone continues from its last value.
donor_year_dict = {}
recipient_year_dict = {}
donor_countries_set = set()
recipient_countries_set = set()

# Role -> (country colour, country description, ring-title suffix,
#          set of countries already created, {country: {interval: [{indicator: petal_id}]}})
role_settings = {
    "Torchbearer": ([0, 225, 0], "Donor Country", "Donor", donor_countries_set, donor_year_dict),
    "Beneficiary": ([225, 0, 0], "Recipient Country", "Recipient", recipient_countries_set, recipient_year_dict),
}

for _, row in normalized_df.iterrows():
    country = row["Country"]
    role = row["Role"]
    interval = row["year_interval"]
    node_id = row["node_id"]
    lat = row["latitude"]
    lon = row["longitude"]
    z = row["z_layer"]
    ring_location = row["ring_location"]

    # Rows with any other role produce no nodes.
    if role not in role_settings:
        continue
    country_color, country_label, ring_suffix, seen_countries, year_index = role_settings[role]

    if country not in seen_countries:
        # A fresh list is passed on every call, as the literal did before.
        create_country_node_tag(node_id, lat, lon, z, list(country_color), country, country_label)
        seen_countries.add(country)

    intervals_done = year_index.setdefault(country, {})
    if interval in intervals_done:
        continue

    create_year_node_tag(ring_location, node_id, ring_id, f"{interval} ({ring_suffix})")

    # Petal ids follow the ring id consecutively; locations step by 30 starting at -15.
    petal_id = ring_id
    location = -45
    intervals_done[interval] = []
    # The selection is by country and interval only (not by role).
    in_ring = (normalized_df["Country"] == country) & (normalized_df["year_interval"] == interval)
    for _, group in normalized_df[in_ring].groupby("Indicator"):
        petal_indicator = group["Indicator"].iloc[0]
        petal_id += 1
        location += 30
        color = sdg_color_map.get(petal_indicator, [0, 0, 0])
        create_petal_rings(petal_id, ring_id, location, petal_indicator, color)
        intervals_done[interval].append({petal_indicator: petal_id})

    # The next ring starts right after the last petal id that was handed out.
    ring_id = petal_id + 1

  tag.update({'table_id': 1, 'title': title, 'description': description})
  node_df = pd.concat([node_df, pd.DataFrame([pin])], ignore_index=True)
  tag_df = pd.concat([tag_df, pd.DataFrame([tag])], ignore_index=True)
  tag.update({'table_id': 1, 'title': title, 'description': 'Year Ring'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'table_id': 1, 'title': title, 'description': description})
  tag.update({'table_id': 1, 'title': title, 'description': 'Year Ring'})
  tag.update({'table_id': 1, 'title': title, 'description': 'SDG'})
  tag.update({'t

In [None]:
# valid_ids = set(node_df["np_node_id"])

# if d_entry[indicator] not in valid_ids or r_entry[indicator] not in valid_ids:
#     print(f"❗ Invalid link — Donor: {d_entry[indicator]}, Recipient: {r_entry[indicator]}")


In [62]:
# === LINK CREATION FIXED ===
# Creates link nodes between donor and recipient entries that share the same
# year interval and indicator, up to ``max_links`` links in total.
link_id = ring_id + 1
processed_links = 0
max_links = 20000

# Ids currently present in node_df. Built once and used for every membership test
# below instead of scanning the column array for each row.
valid_ids = set(node_df["np_node_id"])

# Build fresh donor and recipient dictionaries from current node_df:
# {country: {interval: [{indicator: node_id}, ...]}}. Rows whose node_id is not in
# node_df are skipped; every role other than "Torchbearer" is filed as recipient.
donor_year_dict = {}
recipient_year_dict = {}
for _, row in normalized_df.iterrows():
    node_id = row["node_id"]
    if node_id not in valid_ids:
        continue
    target_dict = donor_year_dict if row["Role"] == "Torchbearer" else recipient_year_dict
    by_interval = target_dict.setdefault(row["Country"], {})
    by_interval.setdefault(row["year_interval"], []).append({row["Indicator"]: node_id})


def _iter_link_candidates():
    """Yield ``(indicator, value, d_country, d_id, r_country, r_id)`` link candidates.

    For every row of ``normalized_df`` this pairs each donor entry with each
    recipient entry that has the row's year interval and indicator, in the same
    order the nested loops visited them before. ``value`` is the row's
    ``normalized_value``.

    NOTE(review): the row's own country is not used for matching, so the same
    donor/recipient pair is yielded once per row with that interval/indicator
    (each time with that row's value). Confirm whether this is intended.
    """
    for _, link_row in normalized_df.iterrows():
        interval = link_row["year_interval"]
        indicator = link_row["Indicator"]
        value = link_row["normalized_value"]
        for d_country, d_data in donor_year_dict.items():
            for d_entry in d_data.get(interval, ()):
                d_id = d_entry.get(indicator)
                if not d_id or d_id not in valid_ids:
                    continue
                for r_country, r_data in recipient_year_dict.items():
                    for r_entry in r_data.get(interval, ()):
                        r_id = r_entry.get(indicator)
                        if not r_id or r_id not in valid_ids:
                            continue
                        yield indicator, value, d_country, d_id, r_country, r_id


for indicator, value, d_country, d_id, r_country, r_id in _iter_link_candidates():
    # The cap is checked before every link; previously it was only checked once per
    # row, so the nested loops could create far more than max_links links.
    if processed_links >= max_links:
        break
    link_nodes(
        link_id,
        parent_id=d_id,
        child_id=r_id,
        title=f"{indicator} | {d_country} → {r_country}",
        colors=sdg_color_map.get(indicator, [0, 0, 0]),
        ratio=value
    )
    link_id += 1
    processed_links += 1


  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'description': title})
  tag.update({'table_id': 1, 'title': title, 'de

In [63]:
# Final cleanup: robust filtering.
# Rows are kept only when they carry an id (np_node_id for nodes, np_tag_id for tags),
# rather than being dropped by position.
has_node_id = node_df["np_node_id"].notna()
has_tag_id = tag_df["np_tag_id"].notna()
node_df = node_df.loc[has_node_id].copy()
tag_df = tag_df.loc[has_tag_id].copy()

# Node columns that are cast to integers before export.
node_ints = [
    'np_node_id', 'type', 'np_data_id', 'selected', 'parent_id', 'branch_level',
    'child_id', 'np_tag_id', 'np_palette_id', 'np_ch_in_id', 'np_ch_out_id', 'ch_sync_time',
    'np_palette_id_alt', 'np_color_id_alt', 'np_material_id', 'np_geometry_id',
    'np_color_id', 'color_fade', 'np_texture_id', 'hide', 'freeze', 'np_topo_id',
    'subspace',
    'trigger_hi_x', 'trigger_hi_y', 'trigger_hi_z',
    'trigger_lo_x', 'trigger_lo_y', 'trigger_lo_z',
    'proximity_x', 'proximity_y', 'proximity_z',
    'proximity_mode_x', 'proximity_mode_y', 'proximity_mode_z',
    'segments_x', 'segments_y', 'segments_z',
    'tag_mode', 'np_format_id', 'np_table_id', 'size',
]
# Tag columns that are cast to integers before export.
tag_ints = ["np_tag_id", "record_id"]

node_df[node_ints] = node_df[node_ints].astype(int)
# record_id is cast to an explicit 64-bit integer.
node_df['record_id'] = node_df['record_id'].astype('int64')
tag_df[tag_ints] = tag_df[tag_ints].astype(int)


In [64]:
# Save to CSV
# Both tables are written next to the input dataset, with a timestamp in the file name.
# ``directory_path`` is derived from ``dataset_file_path`` in the data-loading cell, so the
# output folder is no longer repeated as a hard-coded absolute path in each call.
time_stamp = time.strftime("%Y%m%d_%H%M%S")
node_csv_path = os.path.join(directory_path, f"np_node_SDG61617_output_{time_stamp}_proto13.csv")
tag_csv_path = os.path.join(directory_path, f"np_tag_SDG61617_output_{time_stamp}_proto13.csv")
node_df.to_csv(node_csv_path, index=False)
tag_df.to_csv(tag_csv_path, index=False)