In [None]:
import json
import os
import random
from collections import defaultdict
from itertools import combinations
from typing import Any

import networkx as nx
import pandas as pd
from networkx import NetworkXError
from pyvis.network import Network

import settings
from edge import Edge
from graph_utils import add_dynamic_legend, add_dynamic_legend_to_component_graph




## Load Dataframe

In [1]:
# Display settings for this notebook: show at most 20 rows per frame and
# never truncate the text inside a cell.
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = None

# Load the cleaned translation table from the parquet file.
df = pd.read_parquet("resources/cleaned_dataframe.parquet", engine="pyarrow")
df

NameError: name 'pd' is not defined

## Filter Dataframe 
### Keep concepts (`word_and_gloss` entries) with at least 40 translations (`settings.MIN_LANGUAGE_COUNT`) and keep only translations that appear more than once.

In [None]:
# Count how many rows (translations) each `word_and_gloss` entry has.
# NOTE(review): the variable names below say "language", but the column that
# is counted is `word_and_gloss`; the names are kept unchanged in case other
# cells rely on them.
language_counts = df["word_and_gloss"].value_counts()

# Entries whose row count reaches the configured minimum.
languages_to_keep = language_counts[language_counts >= settings.MIN_LANGUAGE_COUNT].index

# Keep only the rows of those entries. The result is built in one chained
# expression instead of mutating a filtered slice with `inplace=True`.
df_filtered = df[df["word_and_gloss"].isin(languages_to_keep)].reset_index(drop=True)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,afrikaans;;vrye;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,albanian;;lirë (i/e);;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,amharic;;ነፃ;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حر;;adj
...,...,...
550348,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,tagalog;;tawag ka ng bumbero;;phrase
550349,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,turkish;;itfaiyeyi çağırin;;phrase
550350,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,turkish;;itfaiyeyi çağır;;phrase
550351,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,ukrainian;;ви́кличте поже́жників;;phrase


In [None]:
# Count how often each `language_and_translation` string occurs.
translation_counts = df_filtered["language_and_translation"].value_counts()

# Translations that appear more than once.
translations_to_keep = translation_counts[translation_counts > 1].index

# Keep only rows with such a translation; chained instead of an in-place
# reset_index on the filtered slice.
df_filtered = (
    df_filtered[df_filtered["language_and_translation"].isin(translations_to_keep)]
    .reset_index(drop=True)
)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,armenian;;ազատ;;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,assamese;;মুকলি;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,asturian;;llibre;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,azerbaijani;;azad;;adj
...,...,...
119553,"shaitan;;(islam) iblis, satan.",turkish;;şeytan;;name
119554,"shaitan;;(islam) iblis, satan.",turkmen;;şeýtan;;name
119555,"shaitan;;(islam) iblis, satan.",urdu;;شَیطان;;name
119556,"shaitan;;(islam) iblis, satan.",uyghur;;شەيتان;;name


In [None]:
# Distinct language/translation strings, in order of first appearance.
unique_lang_and_trans = pd.unique(df_filtered["language_and_translation"])
unique_lang_and_trans

array(['arabic;;حُرّ;;adj', 'armenian;;ազատ;;adj', 'assamese;;মুকলি;;adj',
       ..., 'spanish;;aguar;;verb', 'swedish;;vattna;;verb',
       'portuguese;;dígito;;noun'], dtype=object)

In [None]:
# Number of distinct language/translation strings left after filtering.
df_filtered["language_and_translation"].nunique()

51999

## Create a Concept Map where each concept has its corresponding Language, Translation and POS.
## Create a Concept List with all Concepts.

In [None]:
# Maps each concept to the set of translation strings recorded for it.
concept_map = {}
# Every concept seen so far (a set, despite the name).
concept_list = set()


def add_concept_to_translation(translation: str, concept: str) -> None:
    """
    Registers a translation under its concept.

    Args:
        translation (str): The translation string to store.
        concept (str): The concept the translation belongs to. It is added to
            ``concept_list`` and used as the key in ``concept_map``.
    """
    concept_list.add(concept)
    # setdefault creates the empty set on first sight of the concept.
    concept_map.setdefault(concept, set()).add(translation)

# Fill the concept map by walking the two columns side by side. This replaces
# a row-wise DataFrame.apply that was only used for its side effect and built
# a throwaway Series of None values along the way.
for translation, concept in zip(
    df_filtered["language_and_translation"], df_filtered["word_and_gloss"]
):
    add_concept_to_translation(translation, concept)

# Print every concept that sets a new record for the number of translations,
# in the iteration order of concept_map.
maximum = 0
for key, value in concept_map.items():
    if len(value) > maximum:
        maximum = len(value)
        print(f"{maximum:2d} | {key:32s} | {value}")

59 | free;;(social) unconstrained.;;not imprisoned or enslaved. | {'old irish;;sóer;;adj', 'zazaki;;azad (diq);;adj', 'norman;;libre;;adj', 'yiddish;;פֿרײַ;;adj', 'icelandic;;frjáls;;adj', 'persian;;آزاد;;adj', 'latvian;;brīvs;;adj', 'portuguese;;livre;;adj', 'czech;;svobodný;;adj', 'french;;libre;;adj', 'galician;;libre;;adj', 'hungarian;;szabad;;adj', 'limburgish;;vrie;;adj', 'esperanto;;libera;;adj', 'dutch;;vrij;;adj', 'turkish;;özgür;;adj', 'old english;;frēo;;adj', 'asturian;;llibre;;adj', 'swedish;;fri;;adj', 'northern kurdish;;serbest;;adj', 'finnish;;vapaa;;adj', 'czech;;volný;;adj', 'slovak;;slobodný;;adj', 'hebrew;;חופשי;;adj', 'indonesian;;bebas;;adj', 'korean;;자유;;adj', 'spanish;;libre;;adj', 'greek;;ελεύθερος;;adj', 'macedonian;;слободен;;adj', 'german;;frei;;adj', 'romanian;;liber;;adj', 'assamese;;মুকলি;;adj', 'russian;;свобо́дный;;adj', 'japanese;;自由;;adj', 'occitan;;liure;;adj', 'danish;;fri;;adj', 'catalan;;lliure;;adj', 'hebrew;;חָפְשִׁי;;adj', 'malay;;bebas;;adj', 

### We have 5315 concepts.

In [None]:
# Number of distinct concepts collected while building the concept map.
len(concept_list)

5315

### Show first entry of the Concept Map.

In [None]:
# Single-entry dict holding the first concept of the map (empty if the map is empty).
first_concept = {concept: concept_map[concept] for concept in list(concept_map)[:1]}
first_concept

{'free;;(social) unconstrained.;;not imprisoned or enslaved.': {'arabic;;حُرّ;;adj',
  'armenian;;ազատ;;adj',
  'assamese;;মুকলি;;adj',
  'asturian;;llibre;;adj',
  'azerbaijani;;azad;;adj',
  'bambara;;hɔrɔn;;adj',
  'bashkir;;ирекле;;adj',
  'catalan;;lliure;;adj',
  'chinese mandarin;;自由的;;adj',
  'czech;;svobodný;;adj',
  'czech;;volný;;adj',
  'danish;;fri;;adj',
  'dutch;;vrij;;adj',
  'esperanto;;libera;;adj',
  'estonian;;vaba;;adj',
  'finnish;;vapaa;;adj',
  'french;;libre;;adj',
  'galician;;libre;;adj',
  'german;;frei;;adj',
  'greek;;ελεύθερος;;adj',
  'hebrew;;חָפְשִׁי;;adj',
  'hebrew;;חופשי;;adj',
  'hungarian;;szabad;;adj',
  'icelandic;;frjáls;;adj',
  'ido;;libera;;adj',
  'indonesian;;bebas;;adj',
  'interlingua;;libere;;adj',
  'irish;;saor;;adj',
  'italian;;libero;;adj',
  'japanese;;自由;;adj',
  'korean;;자유;;adj',
  'latvian;;brīvs;;adj',
  'limburgish;;vrie;;adj',
  'low german;;free;;adj',
  'macedonian;;слободен;;adj',
  'malay;;bebas;;adj',
  'malayalam;;സ്വ

In [None]:
G = nx.Graph()

# Build one Edge for every pair of concepts that share at least one
# translation. `combinations` over the sorted concepts visits each unordered
# pair exactly once, and every pair comes out already sorted, which is the key
# format the lookups into edge_map use. Sorting also makes the insertion order
# independent of set iteration order.
# Still quadratic in the number of concepts, so this cell takes a while.
edge_map = dict()
maximum = 0  # largest number of shared translations seen on any edge
for pair in combinations(sorted(concept_list), 2):
    concept1, concept2 = pair
    if intersection := concept_map[concept1].intersection(concept_map[concept2]):
        edge_map[pair] = Edge(len(intersection), intersection)
        maximum = max(maximum, len(intersection))

# Add edges to the graph with weights
for pair, edge in edge_map.items():
    G.add_edge(pair[0], pair[1], weight=edge.weight)

# Note: `edges` is a live view on G, so it reflects later edge removals.
edges = G.edges(data=True)
weights = [edge.weight for edge in edge_map.values()]


In [None]:
# Rank all edges by weight (heaviest first) and print both ends of the ranking.
sorted_edges = sorted(edges, key=lambda item: item[2]["weight"], reverse=True)

for heading, selection in (
    ("Strongest connections:", sorted_edges[:5]),
    ("\nWeakest connections:", sorted_edges[-5:]),
):
    print(heading)
    for parent1, parent2, edge_data in selection:
        print(f"(weight: {edge_data['weight']}) {parent1} -- {parent2}")

Strongest connections:
(weight: 137) football;;(uk, africa, caribbean, south asia, uncountable) association football, also called soccer: a game in which two teams each contend to get a round ball into the other team's goal primarily by kicking the ball. -- soccer;;(originated, late 19th c, now often us, australia, ireland, philippines, and other countries; see usage notes) association football.
(weight: 129) disease;;(medicine) an abnormal condition of a human, animal or plant that causes discomfort or dysfunction; distinct from injury insofar as the latter is usually instantaneously acquired. -- illness;;(countable) an instance of a disease or poor health.
(weight: 122) speak;;(intransitive) to communicate with one's voice, to say words out loud. -- talk;;(intransitive) to communicate, usually by means of speech.
(weight: 116) soil;;(uncountable) a mixture of mineral particles and organic material, used to support plant growth. -- earth;;(uncountable) soil.
(weight: 115) hide;;(trans

In [None]:
def create_graph(focus_concept: str) -> None:
    """
    Creates an interactive network for one concept and saves it as a HTML file.

    The network contains the focus concept, its direct neighbors in the global
    graph ``G`` and every edge of ``G`` between those nodes. Edge data is taken
    from the global ``edge_map``. The graph is written to ``temp_network.html``
    and then handed to ``add_dynamic_legend``.

    Args:
        focus_concept (str): The concept for which the network should be created.
        For Example: "free;;(social) unconstrained.;;not imprisoned or enslaved."

    Returns:
        None. If the concept is not a node of ``G`` a message is printed and
        nothing is written.
    """
    # Validate input: membership is False for unknown nodes, so no exception
    # handling is needed to detect a concept that is not in the graph.
    if focus_concept not in G:
        print(f"Invalid focus_concept={focus_concept}. Dataset does not contain this concept.")
        return

    # Get all neighbors of the focus concept
    neighbors = list(G.neighbors(focus_concept))

    # Create a pyvis network
    net = Network(notebook=False, height="750px", width="100%", bgcolor="#222222", font_color="white")

    # Create a subgraph containing the focus concept and its neighbors
    local_subgraph = G.subgraph([focus_concept] + neighbors)

    # Map edges to the languages that connect them
    edge_to_languages = defaultdict(tuple)

    # First pass: collect the edge values and find the largest weight, which
    # is needed before any edge width can be normalized.
    subgraph_maximum = 0
    for pair in local_subgraph.edges():
        # edge_map is keyed by the sorted node pair
        pair = tuple(sorted(pair))
        edge = edge_map[pair]
        edge_to_languages[pair] = edge.value
        subgraph_maximum = max(subgraph_maximum, edge.weight)

    # Add nodes to the pyvis network; the label is the part before the first separator
    for node in local_subgraph.nodes:
        net.add_node(node, title=node, label=node.split(settings.SEPERATOR)[0])

    # Second pass: add edges with language information. The languages are
    # sorted so the joined string does not depend on set iteration order.
    for pair, languages in edge_to_languages.items():
        edge: Edge = edge_map[pair]
        net.add_edge(
            *pair,
            value=edge.weight,
            languages=', '.join(sorted(languages)),
            width=edge.normalized(subgraph_maximum),
        )

    # Save graph to a temporary file (overwritten on every call)
    temp_file = "temp_network.html"
    net.save_graph(temp_file)

    # Add dynamic legend to the HTML file
    add_dynamic_legend(temp_file, edge_to_languages, focus_concept)

    print(f"Saved html Network '{focus_concept}'.")

## Two Example Subgraphs

In [None]:
#create_graph("pyramid;;(geometry) a solid with triangular lateral faces and a polygonal (often square or rectangular) base.")
#create_graph("free;;(social) unconstrained.;;not imprisoned or enslaved.")

In [None]:
# Edge threshold: edges with a weight below this value are dropped.
weight_threshold = 5

# Remove edges under the threshold (this mutates the global graph G in place).
edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < weight_threshold]
G.remove_edges_from(edges_to_remove)

# Copy the remaining edges (with their data) into a fresh graph.
# Collecting the edges of the radius-2 ego graph of every node yields exactly
# the edge set of G, because each edge (u, v) is already part of the ego graph
# of u. Copying the edges directly therefore gives the same graph without one
# neighborhood search per node. Nodes without any remaining edge are not
# carried over. An nx.Graph cannot hold duplicate edges, so no de-duplication
# step is needed.
new_graph = nx.Graph()
new_graph.add_edges_from(G.edges(data=True))

# Separate the graph into its connected components
components = list(nx.connected_components(new_graph))

# Create Graph for each component
def create_html(component, index):
    """
    Writes an interactive HTML graph for one connected component.

    The component is taken as a subgraph of the global ``new_graph``; edge
    weights and values come from the global ``edge_map``. The file is saved as
    ``component_graphs/component_<index + 1>_graph.html`` and then passed to
    ``add_dynamic_legend_to_component_graph``.

    Args:
        component: Nodes of the component (as returned by
            ``nx.connected_components``).
        index: Zero-based position of the component; file names use ``index + 1``.
    """
    component_graph = new_graph.subgraph(component)

    net = Network(notebook=False, height="100vh", bgcolor="#222222", font_color="white")

    # Label = part before the first separator. Uses the same constant as
    # create_graph. NOTE(review): assumes settings.SEPERATOR is ";;" - confirm.
    for node in component_graph.nodes:
        net.add_node(node, title=node, label=node.split(settings.SEPERATOR)[0])

    # Resolve every graph edge to its edge_map entry once. edge_map is keyed by
    # the *sorted* node pair while the graph may report an edge as (v, u), so
    # the pair has to be sorted before the lookup. Looking up the raw pair
    # would skip such edges and underestimate the maximum.
    known_edges = []
    for u, v in component_graph.edges:
        pair = tuple(sorted([u, v]))
        if pair in edge_map:  # Check if edge exists in edge_map
            known_edges.append((u, v, pair, edge_map[pair]))

    # Calculate the maximum edge weight of the component
    if known_edges:
        component_maximum = max(edge.weight for _, _, _, edge in known_edges)
    else:
        # If no edges exist, set default component maximum to 1
        component_maximum = 1
        print(f"Component {index + 1} has no edges in edge_map. Use default value for component_maximum.")

    # Add edges with weights and information
    edge_to_languages = defaultdict(tuple)
    for u, v, pair, edge in known_edges:
        net.add_edge(
            u,
            v,
            value=edge.weight,
            # sorted: keeps the joined string independent of set iteration order
            languages=', '.join(sorted(edge.value)),
            # Use max of the component for normalization
            width=edge.normalized(component_maximum),
        )
        edge_to_languages[pair] = edge.value

    # Save graph in a html file; create the output folder on first use
    output_file = f"component_graphs/component_{index + 1}_graph.html"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    net.save_graph(output_file)

    # Add dynamic language legend
    add_dynamic_legend_to_component_graph(output_file, edge_to_languages, f"component_{index + 1}_graph")

### TODO Sort languages in language legend

In [None]:
def save_results_to_file(results, filename):
    """
    Saves results to a JSON file.

    Args:
        results: JSON-serialisable object to write.
        filename: Target path. Missing parent directories are created.
    """
    # Create the target folder if needed; a bare file name has no folder part.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
        

def random_walk_in_component(component_graph, start_node, walk_length, rng=None):
    """
    Performs a uniform random walk in a component.

    At every step the next node is drawn uniformly from the neighbors of the
    current node. Already visited nodes are treated like any other neighbor.

    Args:
        component_graph: Graph object providing ``neighbors(node)``.
        start_node: Node the walk starts at; always the first entry of the result.
        walk_length: Desired number of nodes in the walk. Values below 2 yield
            just ``[start_node]``.
        rng: Optional object with a ``choice`` method (e.g. ``random.Random(seed)``)
            for reproducible walks. Defaults to the global ``random`` module.

    Returns:
        list: The visited nodes in order. Shorter than ``walk_length`` only if
        the walk reaches a node without neighbors.
    """
    chooser = random if rng is None else rng

    walk = [start_node]
    current_node = start_node

    for _ in range(walk_length - 1):
        neighbors = list(component_graph.neighbors(current_node))
        if not neighbors:
            # Dead end: choice() on an empty list would raise IndexError.
            break

        current_node = chooser.choice(neighbors)
        walk.append(current_node)

    return walk


def find_longest_path_approx(component_graph, max_depth=None):
    """
    Returns the longest of all shortest paths in the component.

    For every node the shortest paths to all reachable nodes are computed with
    ``nx.single_source_shortest_path``; the path with the most nodes over all
    start nodes is returned. This is a longest *shortest* path, not the
    longest simple path of the graph.

    Args:
        component_graph: Graph to search.
        max_depth: Cutoff passed to the shortest-path search. Defaults to
            ``2 * number_of_nodes``.

    Returns:
        list: Nodes of the selected path, or ``[]`` for a graph without nodes.
        On ties the path found first is kept.
    """
    # The default depth does not depend on the start node, so compute it once.
    if max_depth is None:
        max_depth = 2 * component_graph.number_of_nodes()

    longest_path = []
    for node in component_graph.nodes:
        paths = nx.single_source_shortest_path(component_graph, node, cutoff=max_depth)
        farthest = max(paths.values(), key=len)
        # Strictly longer only, so the earliest maximal path wins ties.
        if len(farthest) > len(longest_path):
            longest_path = farthest
    return longest_path

# Main function for processing all components
def process_all_components(new_graph):
    """
    Visualises and summarises every connected component of the graph.

    For each component this writes the HTML graph via ``create_html``,
    determines a path with ``find_longest_path_approx``, runs a random walk of
    ``settings.WALK_MULTIPLIER * len(longest_path)`` nodes starting at the
    component's first node, and stores a JSON summary next to the HTML file.
    """
    all_components = list(nx.connected_components(new_graph))

    for index, component in enumerate(all_components):
        # Visualisation of the component
        create_html(component=component, index=index)

        component_graph = new_graph.subgraph(component)
        longest_path = find_longest_path_approx(component_graph)

        # Random walk: starts at the first node, length scales with the path found above
        first_node = next(iter(component_graph.nodes))
        steps = settings.WALK_MULTIPLIER * len(longest_path)
        random_walk = random_walk_in_component(component_graph, first_node, steps)

        # Collect the key figures of the component
        summary = dict(
            component_id=index + 1,
            num_nodes=component_graph.number_of_nodes(),
            num_edges=component_graph.number_of_edges(),
            density=nx.density(component_graph),
            longest_path=longest_path,
            random_walk=random_walk,
        )

        save_results_to_file(summary, f"component_graphs/component_{index + 1}_summary.json")

        print(f"Component {index + 1} processed and saved.")

# Run the per-component pipeline (HTML graph + JSON summary per component).
# NOTE(review): the recorded output of this cell ends in a KeyboardInterrupt,
# so a full run may take a long time.
process_all_components(new_graph)

Dynamic legend added to component_graphs/component_1_graph.html for component component_1_graph.
Component 1 processed and saved.
Dynamic legend added to component_graphs/component_2_graph.html for component component_2_graph.
Component 2 processed and saved.
Dynamic legend added to component_graphs/component_3_graph.html for component component_3_graph.
Component 3 processed and saved.
Dynamic legend added to component_graphs/component_4_graph.html for component component_4_graph.
Component 4 processed and saved.
Dynamic legend added to component_graphs/component_5_graph.html for component component_5_graph.
Component 5 processed and saved.
Dynamic legend added to component_graphs/component_6_graph.html for component component_6_graph.
Component 6 processed and saved.
Dynamic legend added to component_graphs/component_7_graph.html for component component_7_graph.
Component 7 processed and saved.
Dynamic legend added to component_graphs/component_8_graph.html for component component_8_

KeyboardInterrupt: 