In [1]:
import os
from random import shuffle, sample, seed
import pandas as pd
from collections import defaultdict
from itertools import combinations
import networkx as nx
from statistics import median
seed(42)

In [2]:
path_to_samples = os.path.join('dataset', 'samples')
samples = defaultdict()
languages = ['en', 'de']
for language in languages:
    path_to_samples_language = os.path.join(path_to_samples,
                                f'{language}_samples_len_10.txt')
    with open(path_to_samples_language, 'r') as file:
        samples[language] = file.read()

In [3]:
for language in languages:
    samples[language] = samples[language].split('\n')

In [4]:
len(samples['en']), len(samples['de'])

(361764, 284323)

In [89]:
samples_limited = defaultdict()

In [90]:
n_of_samples = 2000
for language in languages:
    samples_limited[language] = sample(samples[language], n_of_samples)

In [77]:
len(samples_limited['en']), len(samples_limited['de'])

(1000, 1000)

In [78]:
def similarity_score(string1, string2):
    string1 = string1.split(' ')
    string2 = string2.split(' ')
    intersection = [x for x in string1 if x in string2]
    if len(intersection) == 0:
        return float('inf')
    else:
        return 1/(len(intersection)**4)

In [79]:
combined_list = samples_limited['en'] + samples_limited['de']

x = list(enumerate(combined_list))
shuffle(x)
indices, combined_list = zip(*x)

labels = [0 if y < n_of_samples else 1 for y in indices]

In [80]:
indices[:10], labels[:10], combined_list[:10]

((1116, 1757, 1519, 1431, 1461, 81, 373, 1675, 76, 3),
 [1, 1, 1, 1, 1, 0, 0, 1, 0, 0],
 ('stellte meine Kollegin Frau Gradin ein Memorandum über den Beitrag',
  'der klassischen Diplomatie ist in diesem Fall völlig mehr den',
  'problemlos verlaufen können jedoch intelligent gestaltet also die durch sie',
  'Seite dass sie eine Demokratie auf Grundlagen die so fundamental',
  'Europa auch im Bereich der Flugsicherheit eine führende Rolle als',
  'same conditions and objectives of our financial assistance that the',
  'must engage the whole of the institutions of including we',
  'ihrer aber auch mit der Kriminalität im Zusammenhang mit illegalen',
  'Court of Justice and account for in detailed to the',
  'they are prepared to do also should this become that'))

In [81]:
graph = []
i = 0
for combination in combinations(x, 2):
    similarity = similarity_score(combination[0][1], combination[1][1])
    if similarity == float('inf'):
        continue
    graph.append( ( combination[0][0], combination[1][0],
            similarity_score(combination[0][1], combination[1][1]) ) )

In [82]:
len(graph)

669350

In [83]:
G = nx.Graph()

In [84]:
for edge in graph:
    G.add_edge(str(edge[0]), str(edge[1]), weight=edge[2])

In [85]:
from networkx.algorithms.shortest_paths.weighted import single_source_dijkstra

In [86]:
distances = single_source_dijkstra(G, '0')[0]
medijan = median(distances.values())
print(medijan)
predicted_labels = []
for x in range(n_of_samples*2):
    if x == 0:
        continue
    if distances[str(x)] > medijan:
        predicted_labels.append(1)
    else:
        predicted_labels.append(0)

0.015611651523799239


In [87]:
labelss = [0 if x < n_of_samples else 1 for x in range(n_of_samples-1)]
truth = [1 if (predicted_labels[x] == labelss[x]) else 0 for x in range(n_of_samples-1)]

In [88]:
sum(truth)/(n_of_samples-1)

0.988988988988989