In [23]:
from collections import defaultdict
from itertools import combinations

import networkx as nx
from networkx import NetworkXError
import pandas as pd
from pyvis.network import Network

import settings

# Load the prepared table from a local parquet file (read with the pyarrow engine)
# and display it; the columns can be seen in the cell output.
df = pd.read_parquet("cleaned_dataframe.parquet", engine="pyarrow")
df

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Afrikaans;;vrye;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Albanian;;lirë (i/e);;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Amharic;;ነፃ;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حر;;adj
...,...,...
1188819,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,Finnish;;miellyttämisreaktio;;noun
1188820,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,German;;Bambi-Reflex;;noun
1188821,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,German;;Unterwerfungsreaktion;;noun
1188822,"one heart, one soul;;(Philippines, Catholicism) A saying that is said in every prayer, before the sign of the cross.",Latin;;cor ūnum et anima ūna;;phrase


In [24]:
# Count how many rows (translations) each `word_and_gloss` value has.
# NOTE: despite the variable names, the counts are per concept (word + gloss),
# not per language.
language_counts = df['word_and_gloss'].value_counts()

# Keep only the concepts that have at least settings.MIN_LANGUAGE_COUNT rows
languages_to_keep = language_counts[language_counts >= settings.MIN_LANGUAGE_COUNT].index

# Select the rows of the kept concepts and renumber them from 0.
# reset_index() returns a new frame, so nothing is mutated in place on a slice of `df`.
df_filtered = df[df['word_and_gloss'].isin(languages_to_keep)].reset_index(drop=True)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Afrikaans;;vrye;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Albanian;;lirë (i/e);;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Amharic;;ነፃ;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حر;;adj
...,...,...
550573,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Tagalog;;tawag ka ng bumbero;;phrase
550574,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Turkish;;itfaiyeyi çağırin;;phrase
550575,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Turkish;;itfaiyeyi çağır;;phrase
550576,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Ukrainian;;ви́кличте поже́жників;;phrase


In [25]:
# Number of distinct concepts left after the filter (missing values are counted too)
df_filtered["word_and_gloss"].nunique(dropna=False)

6837

In [26]:
# Count how many rows each "language;;translation;;pos" string occurs in
translation_counts = df_filtered["language_and_translation"].value_counts()

# Keep translations that occur in more than one row.
# NOTE(review): this counts rows, so exact duplicate rows of the same concept
# would also qualify -- confirm the input has none if "shared between concepts" is intended.
translations_to_keep = translation_counts[translation_counts > 1].index

# Build a new, renumbered frame instead of resetting the index of a slice in place
df_filtered = df_filtered[
    df_filtered['language_and_translation'].isin(translations_to_keep)
].reset_index(drop=True)

# Display settings: show at most 20 rows and never truncate cell text
pd.set_option("display.max_rows", 20)
pd.set_option('display.max_colwidth', None)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Armenian;;ազատ;;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Assamese;;মুকলি;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Asturian;;llibre;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Azerbaijani;;azad;;adj
...,...,...
119351,"Shaitan;;(Islam) Iblis, Satan.",Turkish;;şeytan;;name
119352,"Shaitan;;(Islam) Iblis, Satan.",Turkmen;;şeýtan;;name
119353,"Shaitan;;(Islam) Iblis, Satan.",Urdu;;شَیطان;;name
119354,"Shaitan;;(Islam) Iblis, Satan.",Uyghur;;شەيتان;;name


In [27]:
# Distinct "language;;translation;;pos" strings, in order of first appearance
unique_lang_and_trans = df_filtered["language_and_translation"].unique()
unique_lang_and_trans

array(['Arabic;;حُرّ;;adj', 'Armenian;;ազատ;;adj', 'Assamese;;মুকলি;;adj',
       ..., 'Spanish;;aguar;;verb', 'Swedish;;vattna;;verb',
       'Portuguese;;dígito;;noun'], dtype=object)

In [28]:
# Number of distinct translation strings (missing values are not counted by default)
df_filtered["language_and_translation"].nunique()

51902

In [29]:
# concept -> set of translation strings registered for that concept
concept_map = {}
# every concept seen so far (a set, despite the name)
concept_list = set()


def add_concept_to_translation(translation: str, concept: str) -> None:
    """Register `translation` as one of the translations of `concept`.

    Updates the module-level `concept_list` and `concept_map`; the set for a
    concept is created the first time that concept is seen.

    Args:
        translation (str): The translation string to store.
        concept (str): The concept the translation belongs to.
    """
    concept_list.add(concept)
    concept_map.setdefault(concept, set()).add(translation)

# Register every (translation, concept) row. Iterating over the two columns
# directly replaces a row-wise DataFrame.apply that was used only for its
# side effects and whose result was discarded.
for translation, concept in zip(df_filtered["language_and_translation"], df_filtered["word_and_gloss"]):
    add_concept_to_translation(translation, concept)

# Print a running record: every concept whose translation set is larger than
# all sets seen before it (in concept_map order).
maximum = 0
for key, value in concept_map.items():
    if len(value) > maximum:
        maximum = len(value)
        print(f"{maximum:2d} | {key:32s} | {value}")

59 | free;;(social) Unconstrained.;;Not imprisoned or enslaved. | {'Assamese;;মুকলি;;adj', 'German;;frei;;adj', 'Macedonian;;слободен;;adj', 'Galician;;libre;;adj', 'Interlingua;;libere;;adj', 'Slovak;;slobodný;;adj', 'Old Irish;;sóer;;adj', 'Czech;;svobodný;;adj', 'Arabic;;حُرّ;;adj', 'Portuguese;;livre;;adj', 'Spanish;;libre;;adj', 'Zazaki;;azad (diq);;adj', 'Bambara;;hɔrɔn;;adj', 'Danish;;fri;;adj', 'Finnish;;vapaa;;adj', 'Asturian;;llibre;;adj', 'Indonesian;;bebas;;adj', 'Estonian;;vaba;;adj', 'Icelandic;;frjáls;;adj', 'Scottish Gaelic;;saor;;adj', 'Esperanto;;libera;;adj', 'Hebrew;;חָפְשִׁי;;adj', 'Serbo-Croatian;;slobodan;;adj', 'Hungarian;;szabad;;adj', 'Irish;;saor;;adj', 'French;;libre;;adj', 'Armenian;;ազատ;;adj', 'Greek;;ελεύθερος;;adj', 'Norwegian;;fri;;adj', 'Romanian;;liber;;adj', 'Zazaki;;xoser;;adj', 'Swedish;;fri;;adj', 'Ido;;libera;;adj', 'Old English;;frēo;;adj', 'Catalan;;lliure;;adj', 'Korean;;자유;;adj', 'Dutch;;vrij;;adj', 'Northern Kurdish;;serbest;;adj', 'Low Ger

In [30]:
# Number of distinct concepts that were registered
len(concept_list)

5310

In [31]:
# Preview the first few entries only; echoing the whole mapping floods the notebook output
dict(list(concept_map.items())[:3])

{'free;;(social) Unconstrained.;;Not imprisoned or enslaved.': {'Arabic;;حُرّ;;adj',
  'Armenian;;ազատ;;adj',
  'Assamese;;মুকলি;;adj',
  'Asturian;;llibre;;adj',
  'Azerbaijani;;azad;;adj',
  'Bambara;;hɔrɔn;;adj',
  'Bashkir;;ирекле;;adj',
  'Catalan;;lliure;;adj',
  'Chinese Mandarin;;自由的;;adj',
  'Czech;;svobodný;;adj',
  'Czech;;volný;;adj',
  'Danish;;fri;;adj',
  'Dutch;;vrij;;adj',
  'Esperanto;;libera;;adj',
  'Estonian;;vaba;;adj',
  'Finnish;;vapaa;;adj',
  'French;;libre;;adj',
  'Galician;;libre;;adj',
  'German;;frei;;adj',
  'Greek;;ελεύθερος;;adj',
  'Hebrew;;חָפְשִׁי;;adj',
  'Hebrew;;חופשי;;adj',
  'Hungarian;;szabad;;adj',
  'Icelandic;;frjáls;;adj',
  'Ido;;libera;;adj',
  'Indonesian;;bebas;;adj',
  'Interlingua;;libere;;adj',
  'Irish;;saor;;adj',
  'Italian;;libero;;adj',
  'Japanese;;自由;;adj',
  'Korean;;자유;;adj',
  'Latvian;;brīvs;;adj',
  'Limburgish;;vrie;;adj',
  'Low German;;free;;adj',
  'Macedonian;;слободен;;adj',
  'Malay;;bebas;;adj',
  'Malayalam;;സ്വ

In [32]:
from typing import Any


# Undirected graph of concepts; its edges are added further down in this cell
G = nx.Graph()

class Edge:
    """A weighted connection between two concepts.

    In this notebook `weight` is the number of translations two concepts share
    and `value` is the set of those shared translations.
    """

    def __init__(self, weight: int, value: Any) -> None:
        """Store the weight and payload as given.

        Args:
            weight (int): Strength of the connection.
            value (Any): Payload attached to the edge.
        """
        # Assigned directly: the type check in the `weight` setter is not applied here.
        self._weight = weight
        self._value = value

    @property
    def weight(self) -> int:
        """The current weight of the edge."""
        return self._weight

    @weight.setter
    def weight(self, value: int) -> None:
        """Replace the weight.

        Raises:
            ValueError: If `value` is not an int.
        """
        if not isinstance(value, int):
            raise ValueError(
                f"Edge weight must be an int, got {type(value).__name__}: {value!r}"
            )
        self._weight = value

    @property
    def value(self) -> Any:
        """The payload attached to the edge (read-only)."""
        return self._value

    def normalized(self, maximum: int) -> float:
        """Return the weight divided by `maximum`.

        Raises:
            ZeroDivisionError: If `maximum` is 0.
        """
        return self._weight / maximum

    def __repr__(self) -> str:
        return f"Weight: {self._weight} Translations: {self._value}"

# Slow cell: the work is quadratic in the number of concepts.
# combinations() yields every unordered pair of distinct concepts exactly once,
# so each intersection is computed a single time.
edge_map = dict()
maximum = 0
for concept1, concept2 in combinations(concept_list, 2):
    # Key edges by the sorted pair so lookups do not depend on node order
    pair = tuple(sorted([concept1, concept2]))
    # Two concepts are linked when they share at least one translation string
    if intersection := concept_map[concept1].intersection(concept_map[concept2]):
        edge_map[pair] = Edge(len(intersection), intersection)
        maximum = max(maximum, len(intersection))

# Add edges to the graph with weights
for pair, edge in edge_map.items():
    G.add_edge(pair[0], pair[1], weight=edge.weight)

# View over all graph edges including their attribute dicts
edges = G.edges(data=True)
weights = [edge.weight for edge in edge_map.values()]


In [33]:
# Rank all edges by weight, heaviest first, and print both ends of the ranking
sorted_edges = list(edges)
sorted_edges.sort(key=lambda edge: edge[2]['weight'], reverse=True)

for heading, selection in (
    ("Strongest connections:", sorted_edges[:5]),
    ("\nWeakest connections:", sorted_edges[-5:]),
):
    print(heading)
    for parent1, parent2, edge_data in selection:
        print(f"(weight: {edge_data['weight']}) {parent1} -- {parent2}")

Strongest connections:
(weight: 137) football;;(UK, Africa, Caribbean, South Asia, uncountable) Association football, also called soccer: a game in which two teams each contend to get a round ball into the other team's goal primarily by kicking the ball. -- soccer;;(originated, late 19th C, now often US, Australia, Ireland, Philippines, and other countries; see usage notes) Association football.
(weight: 129) illness;;(countable) An instance of a disease or poor health. -- disease;;(medicine) An abnormal condition of a human, animal or plant that causes discomfort or dysfunction; distinct from injury insofar as the latter is usually instantaneously acquired.
(weight: 122) speak;;(intransitive) To communicate with one's voice, to say words out loud. -- talk;;(intransitive) To communicate, usually by means of speech.
(weight: 116) soil;;(uncountable) A mixture of mineral particles and organic material, used to support plant growth. -- earth;;(uncountable) Soil.
(weight: 115) embrace;;(tr

In [34]:
# Degree centrality per concept (computed by networkx), listed from highest to lowest
degree_centrality = nx.degree_centrality(G)
sorted_centrality = list(degree_centrality.items())
sorted_centrality.sort(key=lambda item: item[1], reverse=True)

print("Most central concepts:")
for concept, centrality in sorted_centrality[:5]:
    print(f"{concept}: {centrality}")

Most central concepts:
hold;;(transitive) To grasp or grip.: 0.014315313618383877
fuck;;(vulgar, colloquial, intransitive) To have sexual intercourse; to copulate.: 0.011678282162365794
get;;(transitive or ditransitive) To obtain; to acquire.: 0.010171407044641177
become;;(copulative) begin to be; turn into.: 0.009606328875494443
quit;;(transitive, intransitive) To stop, give up (an activity). [(usually) with gerund; or with verbal noun]: 0.009417969485778867


In [135]:
def create_graph(focus_concept: str) -> None:
    """
    Render the neighbourhood of one concept as an interactive HTML network.

    The focus concept, its direct neighbours in `G` and the edges of the
    induced subgraph are drawn with pyvis, written to "temp_network.html" and
    then passed to `add_dynamic_legend`, which writes the final file.

    Args:
        focus_concept (str): Node key in `G`, e.g.
            "free;;(social) Unconstrained.;;Not imprisoned or enslaved."
    """
    # TODO: validate / normalise the input (e.g. lower-casing) -- not implemented.
    try:
        adjacent = list(G.neighbors(focus_concept))
    except NetworkXError:
        # networkx raises this when the node is not part of the graph
        print(f"Invalid focus_concept={focus_concept}. Dataset does not contain this concept.")
        return

    net = Network(notebook=False, height="750px", width="100%", bgcolor="#222222", font_color="white")

    # Focus node, its neighbours and the edges between any of them
    local_subgraph = G.subgraph([focus_concept] + adjacent)

    # Sorted node pair -> payload of the corresponding Edge (the shared translations)
    edge_to_languages = defaultdict(tuple)
    for endpoints in local_subgraph.edges():
        key = tuple(sorted(endpoints))
        edge_to_languages[key] = edge_map[key].value

    # Largest weight inside this subgraph (0 when there are no edges); used to scale widths
    subgraph_maximum = max([0] + [edge_map[key].weight for key in edge_to_languages])

    # Nodes keep the full key as tooltip and show only the part before the separator
    for node in local_subgraph.nodes:
        short_label = node.split(settings.SEPERATOR)[0]
        net.add_node(node, title=node, label=short_label)

    # Edges carry their shared translations as a ", "-joined string for the legend script
    for key, shared in edge_to_languages.items():
        link: Edge = edge_map[key]
        net.add_edge(
            key[0],
            key[1],
            value=link.weight,
            languages=', '.join(shared),
            width=link.normalized(subgraph_maximum),
        )

    # pyvis writes the plain network first; the legend is patched into that file afterwards
    temp_file = "temp_network.html"
    net.save_graph(temp_file)
    add_dynamic_legend(temp_file, edge_to_languages, focus_concept)

    print(f"Saved Network '{focus_concept}' under 'networks/network_{focus_concept}.html'.")


def add_dynamic_legend(html_file: str, edge_to_languages: dict, focus_concept: str) -> None:
    """
    Adds a dynamic legend to the HTML file that updates on hover and click.

    Reads `html_file`, wraps the `#mynetwork` div together with a legend panel
    in a flex container, injects the script and styles before `</body>` and
    writes the result to "networks/network_{focus_concept}.html". The
    "networks" directory is created if it does not exist.

    Note: edge attributes are inserted into the legend via `innerHTML` without
    escaping, so node names or translations containing HTML-special characters
    are rendered as markup.

    Args:
        html_file (str): The path to the HTML file.
        edge_to_languages (dict): A dictionary mapping edges to languages.
            Currently unused; the script reads the `languages` attribute
            stored on each edge instead. Kept for interface compatibility.
        focus_concept (str): The focus concept, used in the output file name.
    """
    import os

    js_code = """
    <script>
        // Function to update the legend
        function updateLegend(content) {
            const legend = document.getElementById('language-legend');
            legend.innerHTML = content;
        }

        // Add event listeners to edges using vis.js API
        network.on("hoverEdge", function (params) {
            const edgeId = params.edge;
            const edge = edges.get(edgeId);
            if (edge && edge.languages) {
                const languagesList = edge.languages.split(', ').map(lang => `<li>${lang}</li>`).join('');
                const numLanguages = edge.languages.split(', ').length;
                const nodePair = `${edge.from} - ${edge.to}`;
                updateLegend(`<h3>Languages (${numLanguages}) for ${nodePair}:</h3><ul>${languagesList}</ul>`);
            }
        });

        network.on("blurEdge", function () {
            updateLegend('<h3>Languages:</h3><p>Hover over an edge to see languages.</p>');
        });

        network.on("click", function (params) {
            if (params.edges && params.edges.length > 0) {
                const edgeId = params.edges[0];
                const edge = edges.get(edgeId);
                if (edge && edge.languages) {
                    const languagesList = edge.languages.split(', ').map(lang => `<li>${lang.replaceAll(";;", " - ")}</li>`).join('');
                    const numLanguages = edge.languages.split(', ').length;
                    const title1 = edge.from.replaceAll(";;", "&#10;");
                    const title2 = edge.to.replaceAll(";;", "&#10;");
                    updateLegend(`<h2><div title="${title1}">${edge.from.split(";;")[0]}</div><div title="${title2}">${edge.to.split(";;")[0]}</div></h2><br><h3>Languages (${numLanguages})<br></h3><ul>${languagesList}</ul>`);
                }
            }
        });
    </script>
    <style>
        .container {
            display: flex;
            width: 100%;
            margin-top: 48px;
        }
        #mynetwork {
            flex: 3;
            height: 750px;
            width: 90%;
            background-color: #222222;
            border: 1px solid lightgray;
        }
        #language-legend h2 {
            display: flex;
            flex-direction: column;
            font-weight: bold;
        }
        #language-legend h3 {
            border-bottom: 1px solid black;
            padding-bottom: 8px;
        }
        #language-legend {
            flex: 1;
            margin-left: 20px;
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 5px;
            background-color: #f9f9f9;
            overflow-y: auto;
            max-height: 750px;
        }
        #language-legend ul {
            list-style-type: none;
            padding: 0;
        }
        #language-legend li {
            margin: 5px 0;
            font-size: 1.2em;
        }
    </style>
    """

    # Create the initial legend HTML
    legend_html = """
    <div id="language-legend">
        <h3>Languages:</h3>
        <p>Click on an edge to see languages.</p>
    </div>
    """

    # Read the HTML file
    with open(html_file, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Put the network and legend in a flex container.
    # This relies on the exact div markup in the input file; if it is absent,
    # the replacement is a silent no-op and no legend panel is added.
    html_content = html_content.replace(
        '<div id="mynetwork" class="card-body"></div>',
        f'<div class="container"><div id="mynetwork" class="card-body"></div>{legend_html}</div>'
    )

    # Insert the JavaScript before the closing </body> tag
    html_content = html_content.replace(
        '</body>',
        f'{js_code}</body>'
    )

    # Save; make sure the target directory exists so open() does not fail on a fresh checkout
    os.makedirs("networks", exist_ok=True)
    output_file = f"networks/network_{focus_concept}.html"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)

In [136]:
# Render the example networks; each key must match a node of G exactly
for focus in (
    "pyramid;;(geometry) A solid with triangular lateral faces and a polygonal (often square or rectangular) base.",
    "free;;(social) Unconstrained.;;Not imprisoned or enslaved.",
):
    create_graph(focus)

Saved Network 'pyramid;;(geometry) A solid with triangular lateral faces and a polygonal (often square or rectangular) base.' under 'networks/network_pyramid;;(geometry) A solid with triangular lateral faces and a polygonal (often square or rectangular) base..html'.
Saved Network 'free;;(social) Unconstrained.;;Not imprisoned or enslaved.' under 'networks/network_free;;(social) Unconstrained.;;Not imprisoned or enslaved..html'.


In [37]:
# for i, word in enumerate(sorted_edges[1000::-1]):
#     word = word[0]
#     create_graph(word)
#     if i >= 9:
#         break

# print()

In [38]:
# create_graph("God") #TODO make the dataset lower

In [39]:
# def create_simplified_network(max_neighbors: int = 2) -> Network:
#     """
#     Creates a simplified network in which each node keeps only its `max_neighbors` strongest connections.
    
#     Args:
#         max_neighbors (int): Maximum number of neighbors to display for each node.
    
#     Returns:
#         Network: A pyvis network object.
#     """
#     # Create new Network
#     simplified_net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white")

#     # Iterate over all Nodes in original Graph and add Node
#     for node in G.nodes:
#         simplified_net.add_node(node, title=node)

#         # Get the strongest connections
#         neighbors = list(G.neighbors(node))
#         sorted_neighbors = sorted(neighbors, key=lambda x: G[node][x]['weight'], reverse=True)
#         top_neighbors = sorted_neighbors[:max_neighbors]

#         # Add the strongest Connections and make sure the neighbour is also in the simplified network
#         for neighbor in top_neighbors:
#             if neighbor not in simplified_net.get_nodes():
#                 simplified_net.add_node(neighbor, title=neighbor)
            
#             # extract language
#             languages = set()
#             for word, concepts in word_to_concepts.items():
#                 if node in concepts and neighbor in concepts:
#                     language = word.split(settings.SEPERATOR)[0]  # Extract Language
#                     languages.add(language)
            
#             # Add Edge with Language Information
#             weight = G[node][neighbor]['weight']
#             simplified_net.add_edge(node, neighbor, value=weight, title=f"Languages: {', '.join(languages)}")

#     return simplified_net

In [40]:
# simplified_net = create_simplified_network(max_neighbors=2)
# simplified_net.save_graph("networks/simplified_network.html")