## Learning languages from a single message

In this example we'll see how this algorithm has the power to accurately identify languages after seeing messages of very short length.

In [1]:
import os
from random import shuffle, sample, seed
import pandas as pd
from collections import defaultdict
from itertools import combinations
import networkx as nx
from statistics import median
from dataset_utils.sample_dataset import sample_dataset
seed(42)

In [163]:
path_to_samples = os.path.join('dataset', 'samples')
samples = defaultdict()
languages = ['en', 'es']
for language in languages:
    path_to_samples_language = os.path.join(path_to_samples,
                                f'{language}_samples_len_10.txt')
    with open(path_to_samples_language, 'r') as file:
        samples[language] = file.read()

In [164]:
for language in languages:
    samples[language] = samples[language].split('\n')

In [165]:
samples_limited = defaultdict()

In [166]:
n_of_samples = 1000
for language in languages:
    samples_limited[language] = sample(samples[language], n_of_samples)

In [167]:
def similarity_score(string1, string2):
    string1 = string1.split(' ')
    string2 = string2.split(' ')
    intersection = [x for x in string1 if x in string2]
    if len(intersection) == 0:
        return float('inf')
    else:
        return 1/(len(intersection)**4)

In [169]:
combined_list = samples_limited['en'] + samples_limited['es']

x = list(enumerate(combined_list))
shuffle(x)
indices, combined_list = zip(*x)

labels = [0 if y < n_of_samples else 1 for y in indices]

In [170]:
indices[:10], labels[:10], combined_list[:10]

((454, 773, 198, 609, 450, 1255, 1765, 985, 143, 816),
 [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
 ('objective to reduce greenhouse gas experience I know that heading',
  'benefit from social is that the Council did not agree',
  'and to guarantee that the privacy of their communications is',
  'the utmost we had to give reasons for the existence',
  'we now all know is in no position to bring',
  'ha aprendido la lección tras la crisis Donata Gottardi califica',
  'justificación parece de sentido los recursos como el bacalao por',
  'develop and the maintenance of ecosystems should become a fundamental',
  'like to thank Mr Graefe zu Baringdorf for a very',
  'Pacific entire dispute with the United has arisen because we'))

In [171]:
graph = []
i = 0
for combination in combinations(x, 2):
    similarity = similarity_score(combination[0][1], combination[1][1])
    if similarity == float('inf'):
        continue
    graph.append( ( combination[0][0], combination[1][0],
            similarity_score(combination[0][1], combination[1][1]) ) )

In [172]:
len(graph)

782798

In [173]:
G = nx.Graph()

In [174]:
for edge in graph:
    G.add_edge(str(edge[0]), str(edge[1]), weight=edge[2])

In [175]:
from networkx.algorithms.shortest_paths.weighted import single_source_dijkstra

In [176]:
distances = single_source_dijkstra(G, '0')[0]
medijan = median(distances.values())
print(medijan)
predicted_labels = []
for x in range(n_of_samples*2):
    if x == 0:
        continue
    if distances[str(x)] > medijan:
        predicted_labels.append(1)
    else:
        predicted_labels.append(0)

0.014660493827160493


In [177]:
labelss = [0 if x < n_of_samples else 1 for x in range(n_of_samples-1)]
truth = [1 if (predicted_labels[x] == labelss[x]) else 0 for x in range(n_of_samples-1)]

In [178]:
sum(truth)/(n_of_samples-1)

0.9259259259259259