# Global Research Keyword Analysis (Refined)

In [28]:
# Setup and Data Loading
import pandas as pd
import numpy as np
import folium
from folium import plugins
import ast
from collections import Counter
from IPython.display import IFrame, display
import re
import matplotlib.pyplot as plt
from matplotlib import cm, colors
import requests

# Load dataset and drop nulls in critical columns
df = pd.read_csv('../FINAL_ARXIV_2025_with_affiliations.csv')
df = df.dropna(subset=['affiliations', 'keywords', 'smart_keywords'])

coords_df = pd.read_csv('../world_coords.csv')

# GeoJSON URL for country boundaries
political_countries_url = "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"

plt.style.use('dark_background')


In [29]:
# Robust Country Extraction and Keyword Cleaning
def extract_country_from_affil(affil_str):
    if not isinstance(affil_str, str) or not affil_str:
        return None
    
    country_map = {
        'usa': 'United States',
        'united states': 'United States',
        'uk': 'United Kingdom',
        'united kingdom': 'United Kingdom',
        'china': 'China',
        "people's republic of china": 'China',
        'germany': 'Germany',
        'france': 'France',
        'italy': 'Italy',
        'india': 'India',
        'japan': 'Japan',
        'canada': 'Canada',
        'australia': 'Australia',
        'spain': 'Spain',
        'russia': 'Russia',
        'brazil': 'Brazil',
        'south korea': 'South Korea',
        'switzerland': 'Switzerland',
        'netherlands': 'Netherlands',
        'sweden': 'Sweden',
        'taiwan': 'Taiwan',
        'israel': 'Israel',
        'austria': 'Austria',
        'denmark': 'Denmark',
        'belgium': 'Belgium',
        'finland': 'Finland',
        'norway': 'Norway',
        'poland': 'Poland',
        'mexico': 'Mexico',
        'chile': 'Chile',
        'argentina': 'Argentina',
        'czech republic': 'Czech Republic',
        'turkey': 'Turkey',
        'greece': 'Greece',
        'portugal': 'Portugal',
        'singapore': 'Singapore',
        'south africa': 'South Africa',
        'hong kong': 'Hong Kong',
        'new zealand': 'New Zealand',
        'ireland': 'Ireland',
        'hungary': 'Hungary',
        'colombia': 'Colombia'
    }
    
    parts = [p.strip().lower() for p in affil_str.split(',')]
    for part in reversed(parts):
        clean_part = re.sub(r'[^a-zA-Z\s]', '', part).strip()
        if clean_part in country_map:
            return country_map[clean_part]
    
    affil_lower = affil_str.lower()
    for key, val in country_map.items():
        if key in affil_lower:
            return val
            
    return None

def process_affiliations(row):
    try:
        aff_raw = row['affiliations']
        affs = aff_raw.split(';') if ';' in aff_raw else [aff_raw]
        extracted_countries = []
        for aff in affs:
            country = extract_country_from_affil(aff)
            if country:
                extracted_countries.append(country)
        return list(set(extracted_countries))
    except:
        return []

def clean_keyword(k):
    # Remove things like "(573)" from "Galaxies (573)"
    s = re.sub(r'\s*\(\d+\)\s*$', '', str(k)).strip()
    # Filter out email addresses
    if '@' in s:
        return None
    return s

df['countries_extracted'] = df.apply(process_affiliations, axis=1)


In [30]:
# Aggregation and Coloring Functions
def get_country_top_keywords(df, col_name):
    country_keywords = {}
    for _, row in df.iterrows():
        try:
            countries = row['countries_extracted']
            # Parse keywords
            raw_keywords = ast.literal_eval(row[col_name])
            cleaned_keywords = [clean_keyword(k) for k in raw_keywords if k]
            cleaned_keywords = [k for k in cleaned_keywords if k] # Filter None
            
            if not countries or not cleaned_keywords: continue
            
            for country in countries:
                if country not in country_keywords:
                    country_keywords[country] = Counter()
                country_keywords[country].update(cleaned_keywords)
        except:
            continue
            
    results = {}
    all_top_keywords = set()
    for country, counts in country_keywords.items():
        most_common = counts.most_common(5)
        if most_common:
            results[country] = most_common
            all_top_keywords.add(most_common[0][0])
            
    return results, sorted(list(all_top_keywords))

def create_legend_html(keyword_colors, title):
    legend_html = f'''
    <div style="position: fixed; bottom: 50px; left: 50px; width: 250px; height: auto; 
                border:2px solid grey; z-index:9999; font-size:12px;
                background-color:white; opacity: 0.9; padding: 10px; max-height: 400px; overflow-y: auto;">
    <b>{title}</b><br>
    '''
    for kw, color in keyword_colors.items():
        legend_html += f'<i style="background:{color}; width:12px; height:12px; float:left; margin-right:5px; border: 1px solid black;"></i>{kw}<br>'
    legend_html += '</div>'
    return legend_html

def generate_keyword_map(data_dict, unique_keywords, title, filename):
    m = folium.Map(location=[20, 0], zoom_start=2.3, tiles='cartodb positron')
    
    # Create colormap using 'jet'
    jet = cm.get_cmap('jet', len(unique_keywords))
    keyword_colors = {kw: colors.to_hex(jet(i)) for i, kw in enumerate(unique_keywords)}
    
    # GeoJSON with colors
    def style_function(feature):
        country_name = feature['properties']['name']
        mapping = {
            'United States': 'United States', 'United States of America': 'United States',
            'United Kingdom': 'United Kingdom', 'China': 'China',
            'Czech Rep.': 'Czech Republic', 'Dem. Rep. Korea': 'South Korea',
            'Korea': 'South Korea'
        }
        std_name = mapping.get(country_name, country_name)
        
        kw_info = data_dict.get(std_name)
        if kw_info:
            top_kw = kw_info[0][0]
            return {
                'fillColor': keyword_colors.get(top_kw, '#ffffff'),
                'color': 'black',
                'weight': 1,
                'fillOpacity': 0.6
            }
        return {
            'fillColor': '#ffffff',
            'color': 'black',
            'weight': 1,
            'fillOpacity': 0.1
        }

    folium.GeoJson(
        political_countries_url,
        style_function=style_function,
        tooltip=folium.GeoJsonTooltip(fields=['name'], aliases=['Country:'])
    ).add_to(m)
    
    # Add Markers (Circles Only, No Text Labels)
    for country, top_kws in data_dict.items():
        if country in coords_df['country'].values:
            lat = coords_df[coords_df['country'] == country]['latitude'].values[0]
            lon = coords_df[coords_df['country'] == country]['longitude'].values[0]
            
            top_kw = top_kws[0][0]
            
            # folium.CircleMarker(
            #     location=[lat, lon],
            #     radius=4,
            #     color='black',
            #     weight=1,
            #     fill=True,
            #     fill_color=keyword_colors.get(top_kw, 'white'),
            #     fill_opacity=1,
            #     popup=f"<b>{country}</b><br>Top Keyword: {top_kw}<br>Total Keywords:<br>" + "<br>".join([f"{k}: {c}" for k, c in top_kws])
            # ).add_to(m)

    # Add Legend
    m.get_root().html.add_child(folium.Element(create_legend_html(keyword_colors, title)))
    
    m.save(filename)
    display(IFrame(src=filename, width='100%', height='600px'))


## Map 1: Top Keywords per Country (All Authors)

In [31]:
# Generate Map 1: Most Frequent Keywords
print("Processing keywords column...")
kw_results, unique_kws = get_country_top_keywords(df, 'keywords')
generate_keyword_map(kw_results, unique_kws, "Top Research Keywords", "global_keywords_map.html")


Processing keywords column...


  jet = cm.get_cmap('jet', len(unique_keywords))


## Map 2: Top Smart Keywords per Country (All Authors)

In [32]:
# Generate Map 2: Most Frequent Smart Keywords
print("Processing smart_keywords column...")
skw_results, unique_skws = get_country_top_keywords(df, 'smart_keywords')
generate_keyword_map(skw_results, unique_skws, "Top Smart Keywords", "global_smart_keywords_map.html")


Processing smart_keywords column...


  jet = cm.get_cmap('jet', len(unique_keywords))
