# Plotting locations in Nordisk Familjebok

In [None]:
import os
# Move the working directory two levels up before the project imports below.
# NOTE(review): presumably this makes the repository root the working directory
# so that `utils` can be imported — confirm against the project layout.
# The path is relative, so every re-run of this cell moves up two more levels.
os.chdir('../../')
print(os.getcwd())  # Show the directory that is now in effect

import matplotlib.pyplot as plt
# from mpl_toolkits.basemap import Basemap
#import scripts.coordinates_retreival as gs
# import json
# import time
from utils import json_helpers as jh
from utils.paths import *
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import random
import reverse_geocode
import pycountry
import pycountry_convert as pc

# Path strings for the 'e1' and 'e2' subfolders of ENCYCLOPEDIAS_JSON_FOLDER
# (presumably provided by the star import from utils.paths — TODO confirm).
e1 = f'{ENCYCLOPEDIAS_JSON_FOLDER}/e1'
e2 = f'{ENCYCLOPEDIAS_JSON_FOLDER}/e2'

In [None]:
def get_edition_locations(edition: str):
    """Load the entries of one edition and keep those that have coordinates.

    Args:
        edition: Name of the edition subfolder below
            ``ENCYCLOPEDIAS_JSON_FOLDER`` (this notebook uses "e1" and "e2").

    Returns:
        A list with the entries returned by ``jh.read_items`` whose
        ``latitude`` and ``longitude`` are both present and not ``None``,
        in their original order.
    """
    entries = jh.read_items(f'{ENCYCLOPEDIAS_JSON_FOLDER}/{edition}')

    # `is not None` is the reliable None test, and .get() treats an entry that
    # lacks a coordinate key the same way as one whose coordinate is None
    # (matching how get_all_coords reads the same fields).
    return [
        entry for entry in entries
        if entry.get('latitude') is not None and entry.get('longitude') is not None
    ]

### Helper functions for comparing editions

In [None]:
# Collects the (latitude, longitude) pair of every article in an edition
def get_all_coords(edition: list[dict]):
    """Return one ``(latitude, longitude)`` tuple per entry, in input order.

    A coordinate whose key is missing from an entry is reported as ``None``.
    """
    pairs = []
    for article in edition:
        pairs.append((article.get('latitude'), article.get('longitude')))
    return pairs

# Returns the located entries of both editions combined, one entry per qid
def coords_union(edition1: list[dict], edition2: list[dict]) -> list[dict]:
    """Merge two editions into a single list of located entries, unique by qid.

    Entries are visited in the order ``edition1`` then ``edition2``; the first
    entry seen for a given ``qid`` is kept and later ones are dropped.
    Entries whose latitude and longitude are both missing/``None`` are skipped.

    Args:
        edition1: Entries of the first edition (dicts with a ``'qid'`` key and
            optional ``'latitude'`` / ``'longitude'`` keys).
        edition2: Entries of the second edition, same shape.

    Returns:
        The kept entry dicts (the original objects, not copies).

    Raises:
        KeyError: If an entry has no ``'qid'`` key.
    """
    seen_qids = set()
    entry_union = []
    for entry in edition1 + edition2:
        lat = entry.get('latitude', None)
        lon = entry.get('longitude', None)
        qid = entry['qid']

        # An entry's own coordinate pair always belongs to the union of both
        # editions' coordinates, so the only pairs to exclude are the ones
        # where both values are missing.
        if lat is None and lon is None:
            continue
        if qid in seen_qids:
            continue

        # Record the qid so that later duplicates are actually filtered out
        seen_qids.add(qid)
        entry_union.append(entry)

    return entry_union

# Returns the location entries in edition1 but not in edition2
def coords_diff(edition1: list[dict], edition2: list[dict]) -> list[dict]:
    """Return the entries of ``edition1`` located where ``edition2`` has nothing.

    An entry is kept when its ``(latitude, longitude)`` pair does not occur in
    ``edition2`` and is not ``(None, None)``. At most one entry is returned per
    coordinate pair and per ``qid``; the first one in ``edition1`` wins.

    Args:
        edition1: Entries to select from (dicts with a ``'qid'`` key and
            optional ``'latitude'`` / ``'longitude'`` keys).
        edition2: Entries whose coordinates are excluded.

    Returns:
        The kept entry dicts from ``edition1`` (the original objects).

    Raises:
        KeyError: If an entry of ``edition1`` has no ``'qid'`` key.
    """
    # Coordinates that must never be reported: everything in edition2, plus
    # the "no coordinates" pair.
    excluded_coords = {
        (entry.get('latitude', None), entry.get('longitude', None))
        for entry in edition2
    }
    excluded_coords.add((None, None))

    seen_qids = set()
    entry_diff = []
    for entry in edition1:
        coords = (entry.get('latitude', None), entry.get('longitude', None))
        qid = entry['qid']
        if coords in excluded_coords or qid in seen_qids:
            continue

        # Exclude the coordinate from now on (one entry per coordinate pair)
        # and remember the qid so the qid check really filters duplicates.
        excluded_coords.add(coords)
        seen_qids.add(qid)
        entry_diff.append(entry)

    return entry_diff

def coords_intersec(edition1: list[dict], edition2: list[dict]) -> list[dict]:
    """Return the entries located at coordinates that occur in both editions.

    Entries of ``edition1`` are scanned first, then those of ``edition2``.
    An entry is kept when its ``(latitude, longitude)`` pair appears in both
    editions (the all-``None`` pair never counts) and no entry with the same
    ``qid`` has been kept before.
    """
    shared_coords = set(get_all_coords(edition1)) & set(get_all_coords(edition2))
    shared_coords.discard((None, None))

    seen_qids = set()
    entry_intersec = []
    for article in edition1 + edition2:
        position = (article.get('latitude', None), article.get('longitude', None))
        qid = article['qid']
        if position not in shared_coords or qid in seen_qids:
            continue
        seen_qids.add(qid)
        entry_intersec.append(article)

    return entry_intersec

### Visualization functions

In [None]:
def twod_map_coords(title: str = "", outname: str = "", edition1: list[dict] = None, edition2: list[dict] = None, samples: int = 0):
    """Plot edition locations on a 2D world map and save it as an HTML file.

    Points from ``edition1`` are drawn in blue and points from ``edition2`` in
    red. The figure is written to
    ``{LOCATION_PLOTS_FOLDER}/2d_plot_{outname}.html``.

    Args:
        title: Title shown above the map.
        outname: Suffix used in the output file name.
        edition1: Entries plotted in blue. Defaults to no entries.
        edition2: Entries plotted in red. Defaults to no entries.
        samples: If positive, plot only a random sample of this many entries
            (capped at the number of available entries). Otherwise plot all.

    Side effects:
        Sets ``entry['description']`` on every entry passed in, and writes the
        HTML file described above.
    """
    # None defaults instead of shared mutable [] defaults
    edition1 = [] if edition1 is None else edition1
    edition2 = [] if edition2 is None else edition2

    # Tag each entry with its edition; the tag decides the marker color
    for entry in edition1:
        entry['description'] = 'Edition 1'
    for entry in edition2:
        entry['description'] = 'Edition 2'

    # Decide what datapoints to plot
    all_data = edition1 + edition2
    if samples > 0:
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the request at the population size.
        all_data = random.sample(all_data, min(samples, len(all_data)))

    dataframe = pd.DataFrame(all_data)

    # Make sure every column read below exists, even for empty input
    for col in ('latitude', 'longitude', 'qid', 'text', 'description'):
        if col not in dataframe.columns:
            dataframe[col] = None

    # Shuffle so that neither edition is systematically drawn on top
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    # Dummy traces that only exist to produce the two legend entries
    if edition1 and edition2:
        legend_traces = [
            go.Scattergeo(
                lat=[None], lon=[None], mode='markers',
                marker=dict(size=6, color='blue', opacity=1),
                name='First edition (blue)'
            ),
            go.Scattergeo(
                lat=[None], lon=[None], mode='markers',
                marker=dict(size=6, color='red', opacity=1),
                name='Second edition (red)'
            )
        ]
    else:
        legend_traces = []

    # One trace for all points, colored per point. A separate trace for every
    # row makes building and serialising the figure extremely slow.
    data_traces = []
    if len(dataframe) > 0:
        point_colors = [
            'blue' if description == 'Edition 1' else 'red'
            for description in dataframe['description']
        ]
        data_traces.append(
            go.Scattergeo(
                lat=dataframe['latitude'].tolist(),
                lon=dataframe['longitude'].tolist(),
                text=dataframe['text'].tolist(),
                marker=dict(size=4, color=point_colors, opacity=0.3),
                mode='markers',
                showlegend=False  # The legend is produced by legend_traces
            )
        )

    layout = go.Layout(
        title=dict(
            text=title,
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=dict(size=36)  # Increase title font size
        ),
        legend=dict(
            x=0.98,
            y=1,
            xanchor='right',
            yanchor='top',
            font=dict(size=20)  # Increase legend font size
        ),
        geo=dict(
            projection=dict(type='natural earth')
        ),
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Rockwell"
        )
    )

    fig = go.Figure(data=legend_traces + data_traces, layout=layout)

    fig.write_html(f"{LOCATION_PLOTS_FOLDER}/2d_plot_{outname}.html")
    #fig.show()


def threed_map_coords(config):
    """Plot coordinates from a CSV file on a 3D globe.

    Reads the file named by ``config["coords_fetch"]["output_csv_file"]``.
    Each non-blank line must hold two comma-separated numbers, read as
    longitude then latitude. The figure is written to ``3d_plot.html`` in the
    current working directory and then shown.

    Args:
        config: Nested mapping providing
            ``config["coords_fetch"]["output_csv_file"]``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            comma-separated values, or a value is not a valid float.
    """
    FILENAME_OUT_CSV = config["coords_fetch"]["output_csv_file"]

    lons, lats = [], []

    with open(FILENAME_OUT_CSV) as f:
        # Iterate the file lazily instead of loading every line at once
        for line in f:
            # A blank line (e.g. at the end of the file) carries no data
            if not line.strip():
                continue
            lon, lat = line.split(',')
            lons.append(float(lon))
            lats.append(float(lat))

    print(f"Successful coords: {len(lons)}")

    fig = go.Figure(go.Scattergeo(lat=lats, lon=lons))
    fig.update_traces(marker={"opacity": 0.4, 'size': 5, "color": "blue"})
    # The orthographic projection is what renders the map as a 3D globe
    fig.update_geos(projection_type="orthographic")
    fig.update_layout(width=800, height=800, margin={
                      "r": 0, "t": 0, "l": 0, "b": 0})
    fig.write_html("3d_plot.html")
    fig.show()

### Plotting the locations in the editions

In [None]:
# Entries with coordinates, per edition
e1_locations = get_edition_locations("e1")
e2_locations = get_edition_locations("e2")

# Optional set comparisons between the editions (currently disabled)
# editions_union = coords_union(e1_locations, e2_locations) # Already plotted in normal plot
# e1_diff = coords_diff(e1_locations, e2_locations)
# e2_diff = coords_diff(e2_locations, e1_locations)
# editions_intersec = coords_intersec(e1_locations, e2_locations)

# Both editions on one map: first edition in blue, second edition in red
twod_map_coords(
    title="Locations in the first and second editions",
    outname="e1_and_e2",
    edition1=e1_locations,
    edition2=e2_locations,
)

# Plots for the optional comparisons above (currently disabled)
# twod_map_coords(title="Locations in both editions", outname="e1_union_e2", edition1=editions_union) # Already plotted in normal plot
# twod_map_coords(title="Locations only in Edition 1", outname="e1_diff_e2", edition1=e1_diff)
# twod_map_coords(title="Locations only in Edition 2", outname="e2_diff_e1", edition1=e2_diff)
# twod_map_coords(title="Locations present in both Edition 1 and 2", outname="e1_inter_e2", edition1=editions_intersec)

## Mapping coordinates to country and continent

In [None]:
# Looks up the country name for a latitude/longitude pair
def get_country(lat, lon):
    """Return the ``'country'`` field of the reverse-geocoding match for a point.

    The point is passed to ``reverse_geocode.search`` as a one-element list
    and the first result is used.
    """
    matches = reverse_geocode.search([(lat, lon)])
    return matches[0]['country']

# Maps a country name to the name of its continent
def get_continent(country_name):
    """Return the continent name for a country name, or ``None`` if unknown.

    The name is looked up with ``pycountry.countries.get(name=...)``; the
    alpha-2 code of the match is then converted with ``pycountry_convert``.
    ``None`` is returned when the lookup yields nothing, or when either
    conversion step raises ``KeyError``.
    """
    match = pycountry.countries.get(name=country_name)
    if match:
        alpha2 = match.alpha_2
        try:
            return pc.convert_continent_code_to_continent_name(
                pc.country_alpha2_to_continent_code(alpha2)
            )
        except KeyError:
            pass
    return None

def country_continent_counts(edition:str):
    """Count the located entries of an edition per country and per continent.

    Args:
        edition: Edition name passed on to ``get_edition_locations``.

    Returns:
        A tuple ``(country_counts, continent_counts)`` of dicts mapping a
        country / continent name to its number of locations. Locations whose
        country is ``None`` are counted in neither dict; locations whose
        continent cannot be resolved are counted per country only.
    """
    locations = get_edition_locations(edition)
    country_counts = {}
    continent_counts = {}
    # Remember the continent of each country so that the lookup runs once per
    # distinct country instead of once per location.
    continent_by_country = {}

    for location in locations:
        country = get_country(location['latitude'], location['longitude'])
        # Guard before the continent lookup: there is nothing to resolve
        if country is None:
            continue

        country_counts[country] = country_counts.get(country, 0) + 1

        if country not in continent_by_country:
            continent_by_country[country] = get_continent(country)
        continent = continent_by_country[country]
        if continent is None:
            continue

        continent_counts[continent] = continent_counts.get(continent, 0) + 1

    return country_counts, continent_counts

def region_percents(region_counts, regions):
    """Convert per-region counts into percentages of the total count.

    Args:
        region_counts: Mapping from region name to count.
        regions: Regions to report; a region missing from ``region_counts``
            gets a share of 0.

    Returns:
        A dict mapping every region in ``regions`` to its percentage (0-100)
        of the sum of all counts. If that sum is zero, every share is ``0.0``.
    """
    total_count = sum(region_counts.values())
    # Without any counts there is nothing to divide by; report empty shares
    if total_count == 0:
        return {region: 0.0 for region in regions}
    return {region: 100 * (region_counts.get(region, 0) / total_count) for region in regions}

def region_percents_diff(region_percents1, region_percents2):
    """Return the change in share per region, second mapping minus first.

    Args:
        region_percents1: Mapping from region to its baseline percentage.
        region_percents2: Mapping from region to its new percentage.

    Returns:
        A dict mapping every region found in either mapping to
        ``region_percents2[region] - region_percents1[region]``. A region
        missing from one mapping is treated as having a share of 0 there, so
        the two mappings no longer need identical keys.
    """
    diff = {
        region: percent - region_percents1.get(region, 0)
        for region, percent in region_percents2.items()
    }
    # Regions that only exist in the first mapping dropped to a share of 0
    for region, percent in region_percents1.items():
        if region not in region_percents2:
            diff[region] = 0 - percent
    return diff

### Applying functions to the editions

In [None]:
# Locations per country and per continent, for each edition
e1_country_counts, e1_continent_counts = country_continent_counts("e1")
e2_country_counts, e2_continent_counts = country_continent_counts("e2")

# Every country that occurs in at least one of the editions
countries = set(e1_country_counts) | set(e2_country_counts)

# Share of each edition's locations per country, over the same country set
e1_country_percents = region_percents(e1_country_counts, countries)
e2_country_percents = region_percents(e2_country_counts, countries)

# Change in share per country between the two editions
country_percents_diff = region_percents_diff(e1_country_percents, e2_country_percents)

In [None]:
# Number of located entries in the first and second edition, one per line
print(len(e1_locations), len(e2_locations), sep="\n")


### The Statistics

In [None]:
# Fixed hex color per continent name, looked up by plot_pie_chart so that a
# continent gets the same color in every chart.
CONTINENT_COLORS = {
    'Africa': '#ff7f0e',
    'Asia': '#d62728',
    'Europe': '#2ca02c',
    'North America': '#1f77b4',
    'Oceania': '#9467bd',
    'South America': '#8c564b',
    'Antarctica': '#e377c2'
}

# Draws a pie chart whose legend lists every slice with its percentage
def plot_pie_chart(counts, title, text_size=40):
    """Show a pie chart of ``counts`` with one fixed color per continent.

    Args:
        counts: Mapping from continent name to count. Every key must exist in
            ``CONTINENT_COLORS``.
        title: Chart title.
        text_size: Font size used for the title and the legend.
    """
    _figure, axes = plt.subplots(figsize=(12, 12))

    # Alphabetical order keeps the slice order identical between charts
    names = sorted(counts)
    sizes = [counts[name] for name in names]
    slice_colors = [CONTINENT_COLORS[name] for name in names]

    # Slices carry no labels; the legend below names them instead
    wedges, _ = axes.pie(
        sizes,
        startangle=140,
        colors=slice_colors,
        textprops={'fontsize': text_size},
    )

    total = sum(sizes)
    legend_labels = []
    for name, size in zip(names, sizes):
        legend_labels.append(f'{name} ({size / total * 100:.1f}%)')

    # Legend placed to the right of the pie
    axes.legend(
        wedges,
        legend_labels,
        title="",
        loc="center left",
        bbox_to_anchor=(1, 0, 0.5, 1),
        fontsize=text_size,
    )
    axes.set_title(title, fontsize=text_size)

    plt.show()

# Draws a bar chart of the entries with the largest values
def plot_bar_chart(counts, title, xlabel, ylabel, top_n, text_size=20):
    """Show a bar chart of the ``top_n`` largest values in ``counts``.

    Args:
        counts: Mapping from label to numeric value.
        title: Chart title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        top_n: Number of entries to show, taken in descending value order.
        text_size: Font size used for the title, axis labels and tick labels.
    """
    # Largest values first, truncated to the requested number of bars
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
    labels = [label for label, _ in ranked]
    heights = [height for _, height in ranked]
    positions = range(len(labels))

    _figure, axes = plt.subplots(figsize=(16, 7))
    axes.bar(positions, heights, tick_label=labels)
    axes.set_title(title, fontsize=text_size)
    axes.set_xlabel(xlabel, fontsize=text_size)
    axes.set_ylabel(ylabel, fontsize=text_size)
    axes.tick_params(axis='both', which='major', labelsize=text_size)

    # Slanted x labels stay readable when the names are long
    plt.xticks(rotation=30, ha='right', fontsize=text_size)

    plt.show()

In [None]:
# Continent distribution of each edition
plot_pie_chart(
    e1_continent_counts,
    title="First edition - Distribution of locations by continent",
)
plot_pie_chart(
    e2_continent_counts,
    title="Second edition - Distribution of locations by continent",
)

# The ten countries with the most locations in each edition
plot_bar_chart(
    e1_country_counts,
    title="First edition - Number of locations by country",
    xlabel="Country",
    ylabel="Count",
    top_n=10,
)
plot_bar_chart(
    e2_country_counts,
    title="Second edition - Number of locations by country",
    xlabel="Country",
    ylabel="Count",
    top_n=10,
)

# The ten countries whose share of locations grew the most between editions
plot_bar_chart(
    country_percents_diff,
    title="Relative increase of locations by country",
    xlabel="Country",
    ylabel="Percentage Units",
    top_n=10,
)
