In [102]:
import networkx as nx
import pandas as pd
from collections import defaultdict
import settings

# Load the cleaned translation table from a local Parquet file (pyarrow engine)
# and show it as the cell output.
df = pd.read_parquet("cleaned_dataframe.parquet", engine="pyarrow")
df

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Afrikaans;;vrye;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Albanian;;lirë (i/e);;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Amharic;;ነፃ;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حر;;adj
...,...,...
1188819,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,Finnish;;miellyttämisreaktio;;noun
1188820,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,German;;Bambi-Reflex;;noun
1188821,fawn response;;(psychology) An overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,German;;Unterwerfungsreaktion;;noun
1188822,"one heart, one soul;;(Philippines, Catholicism) A saying that is said in every prayer, before the sign of the cross.",Latin;;cor ūnum et anima ūna;;phrase


In [103]:
# Count how many rows (translations) each `word_and_gloss` entry has.
# NOTE(review): despite the "language" in the variable names, the column that
# is counted is `word_and_gloss`, so these are per-concept row counts, not
# per-language counts. The names are kept so other cells keep working.
language_counts = df['word_and_gloss'].value_counts()

# Keep only the `word_and_gloss` entries that occur at least
# settings.MIN_LANGUAGE_COUNT times (the threshold lives in settings, not here).
# Presumably this approximates "translated into at least N languages" --
# TODO confirm, since several rows may share one language.
languages_to_keep = language_counts[language_counts >= settings.MIN_LANGUAGE_COUNT].index

# Build the filtered frame in one expression: select the matching rows and
# renumber them from 0, without mutating an intermediate slice in place.
df_filtered = df[df['word_and_gloss'].isin(languages_to_keep)].reset_index(drop=True)

df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Afrikaans;;vrye;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Albanian;;lirë (i/e);;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Amharic;;ነፃ;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حر;;adj
...,...,...
550573,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Tagalog;;tawag ka ng bumbero;;phrase
550574,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Turkish;;itfaiyeyi çağırin;;phrase
550575,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Turkish;;itfaiyeyi çağır;;phrase
550576,call the fire department;;(US) Call the emergency service that specializes in extinguishing fires.,Ukrainian;;ви́кличте поже́жників;;phrase


In [104]:
# Number of distinct `word_and_gloss` values in the filtered frame
# (dropna=False so that a missing value, if any, is counted like unique() does).
df_filtered['word_and_gloss'].nunique(dropna=False)

6837

In [105]:
#filtered_df = df[df['word_and_gloss'].str.startswith('head -')]
#filtered_df.reset_index(drop=True, inplace=True)
#filtered_df

In [106]:
# Number of unique concepts and translations
#num_concepts = filtered_df['word_and_gloss'].nunique()
#num_translations = filtered_df['language_and_translation'].nunique()

#print(f"Anzahl der eindeutigen Konzepte: {num_concepts}")
#print(f"Anzahl der eindeutigen Übersetzungen: {num_translations}")

In [107]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a co-occurrence matrix
#co_occurrence_matrix = pd.crosstab(filtered_df['word_and_gloss'], filtered_df['language_and_translation'])
#co_occurrence_matrix.head()

In [108]:
from sklearn.metrics.pairwise import cosine_similarity
#concepts = filtered_df['word_and_gloss'].unique()

# Compute the cosine similarity
#similarity_matrix = cosine_similarity(co_occurrence_matrix)
#similarity_df = pd.DataFrame(similarity_matrix, index=concepts, columns=concepts)
#similarity_df.head()

In [109]:
#distance_matrix = 1 - similarity_matrix
#distance_df = pd.DataFrame(distance_matrix, index=concepts, columns=concepts)
#distance_df.head()

In [110]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the distance matrix
#plt.figure(figsize=(8, 6))
#sns.heatmap(distance_df, annot=True, cmap='viridis', fmt='.2f')
#plt.title('Distanzmatrix zwischen Konzepten')
#plt.show()

## New Try

In [111]:
# How often each language/translation entry occurs in the filtered frame.
translation_counts = df_filtered["language_and_translation"].value_counts()

# Keep only the translations that occur more than once.
translations_to_keep = translation_counts.loc[translation_counts > 1].index

# Restrict the frame to those translations and renumber the rows from 0.
df_filtered = (
    df_filtered
    .loc[df_filtered['language_and_translation'].isin(translations_to_keep)]
    .reset_index(drop=True)
)

# Display settings: at most 20 rows, and never truncate long cell text.
pd.set_option("display.max_rows", 20)
pd.set_option('display.max_colwidth', None)
df_filtered

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Arabic;;حُرّ;;adj
1,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Armenian;;ազատ;;adj
2,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Assamese;;মুকলি;;adj
3,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Asturian;;llibre;;adj
4,free;;(social) Unconstrained.;;Not imprisoned or enslaved.,Azerbaijani;;azad;;adj
...,...,...
119351,"Shaitan;;(Islam) Iblis, Satan.",Turkish;;şeytan;;name
119352,"Shaitan;;(Islam) Iblis, Satan.",Turkmen;;şeýtan;;name
119353,"Shaitan;;(Islam) Iblis, Satan.",Urdu;;شَیطان;;name
119354,"Shaitan;;(Islam) Iblis, Satan.",Uyghur;;شەيتان;;name


In [112]:
# Distinct language/translation entries of the filtered frame, as an array.
# (The former bare `nunique()` expression in the middle of this cell was
# neither assigned nor displayed, so it has been dropped.)
unique_lang_and_trans = df_filtered["language_and_translation"].unique()

unique_lang_and_trans

array(['Arabic;;حُرّ;;adj', 'Armenian;;ազատ;;adj', 'Assamese;;মুকলি;;adj',
       ..., 'Spanish;;aguar;;verb', 'Swedish;;vattna;;verb',
       'Portuguese;;dígito;;noun'], dtype=object)

In [113]:
# Number of distinct language/translation entries in the filtered frame
# (nunique() skips missing values by default).
df_filtered["language_and_translation"].nunique()

51902

In [121]:
# Maps each translation key to the list of contexts recorded for it,
# in the order they were added.
context_map = {}


def add_context_to_translation(translation, context):
    """Record ``context`` under ``translation`` in the module-level ``context_map``.

    Args:
        translation: Key to file the context under; a new list is started
            the first time a key is seen.
        context: Value appended to that key's list (duplicates are kept).

    Returns:
        None. The function only mutates ``context_map``.
    """
    context_map.setdefault(translation, []).append(context)

# Feed every row's (translation, concept) pair into context_map, in row order.
# Iterating the two columns directly avoids DataFrame.apply(axis=1), which
# would build a Series per row and a result of None values that is thrown away.
for translation, context in zip(
    df_filtered["language_and_translation"], df_filtered["word_and_gloss"]
):
    add_context_to_translation(translation, context)

# Walk the map in insertion order and print each translation that sets a new
# record for the number of contexts attached to it.
maximum = 0
for key, value in context_map.items():
    if len(value) > maximum:
        maximum = len(value)
        print(f"{maximum:2d} | {key:32s} | {value}")

 3 | Arabic;;حُرّ;;adj                | ['free;;(social) Unconstrained.;;Not imprisoned or enslaved.', 'free;;(social) Unconstrained.;;(software) With no or only freedom-preserving limitations on distribution or modification.', 'libre;;(software) With very few limitations on distribution or the right to access the source code to create improved versions, but not necessarily free of charge.']
 5 | Catalan;;lliure;;adj             | ['free;;(social) Unconstrained.;;Not imprisoned or enslaved.', 'free;;(social) Unconstrained.;;Without obligations.', 'free;;(social) Unconstrained.;;(software) With no or only freedom-preserving limitations on distribution or modification.', 'free;;(physical) Unconstrained.;;Unobstructed, without blockages.', 'libre;;(software) With very few limitations on distribution or the right to access the source code to create improved versions, but not necessarily free of charge.']
 6 | French;;libre;;adj               | ['free;;(social) Unconstrained.;;Not imprisone

In [124]:
# Number of distinct translation keys collected in context_map.
len(context_map)

51902