In [None]:
import pickle
from itertools import combinations

import networkx as nx
import pandas as pd

from edge import Edge
import settings

In [2]:
def debug(msg: str = "") -> None:
    """
    Print ``msg`` only when ``settings.DEBUG`` is truthy; otherwise do nothing.

    Calling it without an argument prints an empty line (when enabled).
    """
    if not settings.DEBUG:
        return
    print(msg)


Load Dataframe

In [3]:
# Notebook display settings: never truncate cell text, show at most 20 rows.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 20)

# Read the dataframe from the parquet file using the pyarrow engine.
df = pd.read_parquet(
    path="resources/load_and_preprocess/cleaned_dataframe.parquet",
    engine="pyarrow",
)
df

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,afrikaans;;vrye;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,albanian;;lirë (i/e);;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,amharic;;ነፃ;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حر;;adj
...,...,...
1188486,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,finnish;;miellyttämisreaktio;;noun
1188487,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,german;;bambi-reflex;;noun
1188488,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,german;;unterwerfungsreaktion;;noun
1188489,"one heart, one soul;;(philippines, catholicism) a saying that is said in every prayer, before the sign of the cross.",latin;;cor ūnum et anima ūna;;phrase


Concepts (`word_and_gloss` entries) with at least `MIN_LANGUAGE_COUNT` (default=40) translations are kept.
Of the remaining translations, only those that appear for more than one concept are kept.
A translation that occurs for only one concept cannot be shared between two concepts and would therefore not result in an edge later on.

In [4]:
# Number of rows (translations) recorded for each `word_and_gloss` entry.
language_counts = df["word_and_gloss"].value_counts()

# Entries that have at least `settings.MIN_LANGUAGE_COUNT` rows.
meets_minimum = language_counts >= settings.MIN_LANGUAGE_COUNT
languages_to_keep = language_counts[meets_minimum].index

# Keep only the rows of those entries and renumber the index from zero.
is_kept = df["word_and_gloss"].isin(languages_to_keep)
df_filtered = df[is_kept].reset_index(drop=True)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,afrikaans;;vrye;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,albanian;;lirë (i/e);;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,amharic;;ነፃ;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حر;;adj
...,...,...
550348,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,tagalog;;tawag ka ng bumbero;;phrase
550349,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,turkish;;itfaiyeyi çağırin;;phrase
550350,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,turkish;;itfaiyeyi çağır;;phrase
550351,call the fire department;;(us) call the emergency service that specializes in extinguishing fires.,ukrainian;;ви́кличте поже́жників;;phrase


In [5]:
# How often each `language_and_translation` string occurs in the filtered rows.
translation_counts = df_filtered["language_and_translation"].value_counts()

# Strings that occur more than once.
occurs_repeatedly = translation_counts > 1
translations_to_keep = translation_counts[occurs_repeatedly].index

# Restrict the frame to those strings and renumber the index from zero.
is_kept = df_filtered["language_and_translation"].isin(translations_to_keep)
df_filtered = df_filtered[is_kept].reset_index(drop=True)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,armenian;;ազատ;;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,assamese;;মুকলি;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,asturian;;llibre;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,azerbaijani;;azad;;adj
...,...,...
119553,"shaitan;;(islam) iblis, satan.",turkish;;şeytan;;name
119554,"shaitan;;(islam) iblis, satan.",turkmen;;şeýtan;;name
119555,"shaitan;;(islam) iblis, satan.",urdu;;شَیطان;;name
119556,"shaitan;;(islam) iblis, satan.",uyghur;;شەيتان;;name


In [6]:
# Distinct values of the `language_and_translation` column that survived both filters.
unique_lang_and_trans = df_filtered["language_and_translation"].unique()
unique_lang_and_trans

array(['arabic;;حُرّ;;adj', 'armenian;;ազատ;;adj', 'assamese;;মুকলি;;adj',
       ..., 'spanish;;aguar;;verb', 'swedish;;vattna;;verb',
       'portuguese;;dígito;;noun'], dtype=object)

In [7]:
# Number of distinct `language_and_translation` values in the filtered frame.
df_filtered["language_and_translation"].nunique()

51999

Each concept is mapped to the set of its translations; every entry combines language, translation and POS tag.

In [8]:
# Maps a concept string to the set of translation strings recorded for it.
concept_map = {}


def add_concept_to_translation(translation: str, concept: str) -> None:
    """
    Record ``translation`` in the set stored under ``concept`` in ``concept_map``.

    The set is created the first time a concept is seen. Adding a translation
    that is already present leaves the set unchanged.
    """
    concept_map.setdefault(concept, set()).add(translation)

# Fill concept_map by walking the two columns in parallel. A row-wise
# DataFrame.apply(axis=1) would build a Series object per row only to produce
# an all-None result that is thrown away.
for translation, concept in zip(df_filtered["language_and_translation"], df_filtered["word_and_gloss"]):
    add_concept_to_translation(translation, concept)

# Walk the concepts in insertion order and print each one that sets a new
# record for the number of translations (a running maximum, not a full ranking).
maximum = 0
for key, value in concept_map.items():
    if len(value) > maximum:
        maximum = len(value)
        print(f"{maximum:2d} | {key:32s} | {value}")



59 | free;;(social) unconstrained.;;not imprisoned or enslaved. | {'bambara;;hɔrɔn;;adj', 'chinese mandarin;;自由的;;adj', 'galician;;libre;;adj', 'greek;;ελεύθερος;;adj', 'norman;;libre;;adj', 'icelandic;;frjáls;;adj', 'yiddish;;פֿרײַ;;adj', 'macedonian;;слободен;;adj', 'japanese;;自由;;adj', 'catalan;;lliure;;adj', 'asturian;;llibre;;adj', 'armenian;;ազատ;;adj', 'norwegian;;fri;;adj', 'serbo-croatian;;slobodan;;adj', 'malayalam;;സ്വതന്ത്രം;;adj', 'danish;;fri;;adj', 'portuguese;;livre;;adj', 'assamese;;মুকলি;;adj', 'indonesian;;bebas;;adj', 'arabic;;حُرّ;;adj', 'italian;;libero;;adj', 'czech;;volný;;adj', 'ido;;libera;;adj', 'low german;;free;;adj', 'persian;;آزاد;;adj', 'czech;;svobodný;;adj', 'french;;libre;;adj', 'russian;;свобо́дный;;adj', 'zazaki;;azad (diq);;adj', 'occitan;;liure;;adj', 'dutch;;vrij;;adj', 'estonian;;vaba;;adj', 'hungarian;;szabad;;adj', 'spanish;;libre;;adj', 'hebrew;;חופשי;;adj', 'limburgish;;vrie;;adj', 'hebrew;;חָפְשִׁי;;adj', 'romanian;;liber;;adj', 'zazaki;;xo

We have 5315 concepts.

In [9]:
# Number of keys (concepts) in concept_map.
len(concept_map)

5315

Show first entry of the Concept Map.

In [10]:
# Take the leading (concept, translations) pair in insertion order and wrap it
# in a dict of its own; an empty concept_map yields an empty dict.
leading_items = list(concept_map.items())[:1]
first_concept = dict(leading_items)
first_concept

{'free;;(social) unconstrained.;;not imprisoned or enslaved.': {'arabic;;حُرّ;;adj',
  'armenian;;ազատ;;adj',
  'assamese;;মুকলি;;adj',
  'asturian;;llibre;;adj',
  'azerbaijani;;azad;;adj',
  'bambara;;hɔrɔn;;adj',
  'bashkir;;ирекле;;adj',
  'catalan;;lliure;;adj',
  'chinese mandarin;;自由的;;adj',
  'czech;;svobodný;;adj',
  'czech;;volný;;adj',
  'danish;;fri;;adj',
  'dutch;;vrij;;adj',
  'esperanto;;libera;;adj',
  'estonian;;vaba;;adj',
  'finnish;;vapaa;;adj',
  'french;;libre;;adj',
  'galician;;libre;;adj',
  'german;;frei;;adj',
  'greek;;ελεύθερος;;adj',
  'hebrew;;חָפְשִׁי;;adj',
  'hebrew;;חופשי;;adj',
  'hungarian;;szabad;;adj',
  'icelandic;;frjáls;;adj',
  'ido;;libera;;adj',
  'indonesian;;bebas;;adj',
  'interlingua;;libere;;adj',
  'irish;;saor;;adj',
  'italian;;libero;;adj',
  'japanese;;自由;;adj',
  'korean;;자유;;adj',
  'latvian;;brīvs;;adj',
  'limburgish;;vrie;;adj',
  'low german;;free;;adj',
  'macedonian;;слободен;;adj',
  'malay;;bebas;;adj',
  'malayalam;;സ്വ

In [11]:
G = nx.Graph()

# Compares every pair of concepts, so the runtime grows quadratically with
# the number of concepts.
# edge_map: sorted (concept, concept) pair -> Edge built from the number of
# shared translations and the shared translation set itself.
edge_map = dict()
maximum = 0
# combinations() yields each unordered pair exactly once and never pairs a
# concept with itself, so no intersection is computed twice. Pairs arrive in
# the same first-seen order as a full nested loop over the keys would give.
for concept1, concept2 in combinations(concept_map, 2):
    if intersection := concept_map[concept1].intersection(concept_map[concept2]):
        # Sort the two names so the key does not depend on iteration order.
        pair = tuple(sorted([concept1, concept2]))
        edge_map[pair] = Edge(len(intersection), intersection)
        if len(intersection) > maximum:
            maximum = len(intersection)

# Add edges to the graph with weights
for pair, edge in edge_map.items():
    G.add_edge(pair[0], pair[1], weight=edge.weight)

# Use a context manager so the file is flushed and closed deterministically.
with open("resources/create_graph/edge_map.pickle", "wb") as edge_map_file:
    pickle.dump(edge_map, edge_map_file)

# Note: `edges` is a live view on G, so later edge removals are reflected in it.
edges = G.edges(data=True)
weights = [edge.weight for edge in edge_map.values()]


Inspecting strongest and weakest edges

In [12]:
# Rank all edges from heaviest to lightest.
sorted_edges = sorted(edges, key=lambda item: item[2]["weight"], reverse=True)

# Print the first five and the last five entries of the ranking, each group
# under its own heading. Both groups stay in descending weight order.
for heading, selection in (
    ("Strongest connections:", sorted_edges[:5]),
    ("\nWeakest connections:", sorted_edges[-5:]),
):
    print(heading)
    for parent1, parent2, edge_data in selection:
        print(f"(weight: {edge_data['weight']}) {parent1} -- {parent2}")

Strongest connections:
(weight: 137) football;;(uk, africa, caribbean, south asia, uncountable) association football, also called soccer: a game in which two teams each contend to get a round ball into the other team's goal primarily by kicking the ball. -- soccer;;(originated, late 19th c, now often us, australia, ireland, philippines, and other countries; see usage notes) association football.
(weight: 129) disease;;(medicine) an abnormal condition of a human, animal or plant that causes discomfort or dysfunction; distinct from injury insofar as the latter is usually instantaneously acquired. -- illness;;(countable) an instance of a disease or poor health.
(weight: 122) talk;;(intransitive) to communicate, usually by means of speech. -- speak;;(intransitive) to communicate with one's voice, to say words out loud.
(weight: 116) earth;;(uncountable) soil. -- soil;;(uncountable) a mixture of mineral particles and organic material, used to support plant growth.
(weight: 115) hug;;(transi

Keep only edges whose weight (the number of shared translations) is at least `WEIGHT_THRESHOLD`.
Save the resulting graph to a file as a Python pickle.

In [13]:
# Collect the edges first, then remove them: the graph must not be modified
# while its edge view is being iterated. Only edges are removed; nodes stay
# in the graph even if they end up without any edge.
edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data['weight'] < settings.WEIGHT_THRESHOLD]
G.remove_edges_from(edges_to_remove)

# Use a context manager so the file is flushed and closed deterministically.
with open('resources/create_graph/full_graph.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)