In [12]:
from quadrics import *
import scipy.stats as ss

In [13]:
def graph_from_file(file, limit=0):
    """Build a weighted graph from comma-separated ``word1,word2,weight`` lines.

    Each distinct word receives a consecutive integer id in order of first
    appearance, and every parsed line becomes one ``[id1, id2, weight]`` edge.

    Args:
        file: Open text file object; it is iterated line by line.
        limit: Stop after this many edges have been read. The default ``0``
            never equals the running edge count, so the whole file is read.

    Returns:
        A ``(graph, word_id)`` tuple, where ``graph`` is
        ``Graph(len(word_id), len(edges), edges)`` and ``word_id`` maps each
        word to its integer id.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            comma-separated fields, or its weight is not a valid float.
    """
    word_id = {}
    edges = []
    for line in file:
        # Blank lines (typically a trailing newline at end of file) carry no
        # edge; skip them instead of failing on the three-way unpack below.
        if not line.strip():
            continue
        w1, w2, weight = line.split(',')
        # setdefault evaluates len(word_id) before inserting, so an unseen
        # word is assigned the next free id.
        id1 = word_id.setdefault(w1, len(word_id))
        id2 = word_id.setdefault(w2, len(word_id))
        edges.append([id1, id2, float(weight)])
        if len(edges) == limit:
            break
    return Graph(len(word_id), len(edges), edges), word_id

In [14]:
%%time
rus_file = open('assoc_net.txt', encoding='utf-8')
rus_graph, rus_dict = graph_from_file(rus_file)

Wall time: 3.06 s


In [15]:
# Load the English association network. The encoding is given explicitly
# (the same file is read as UTF-8 further down) so the result does not depend
# on the platform default, and the context manager closes the file afterwards.
with open('assoc_eng2.txt', encoding='utf-8') as eng_file:
    eng_graph, eng_dict = graph_from_file(eng_file)

In [5]:
# Display the V and E attributes of the English graph — presumably the vertex
# and edge counts passed to Graph(...) in graph_from_file.
eng_graph.V, eng_graph.E

(5019, 63629)

In [6]:
# Display the V and E attributes of the Russian graph — presumably the vertex
# and edge counts passed to Graph(...) in graph_from_file.
rus_graph.V, rus_graph.E

(57108, 645056)

In [7]:
# Random graph built from the English graph's V and E (rand_graph is defined
# outside this notebook). Used by compare() as the "random1" reference.
# NOTE(review): no RNG seed is set in the visible cells — confirm how
# rand_graph draws randomness if the results need to be reproducible.
rg1 = rand_graph(eng_graph.V, eng_graph.E)

In [8]:
# Random graph built from the Russian graph's V and E (rand_graph is defined
# outside this notebook). Used by compare() as the "random2" reference.
# NOTE(review): no RNG seed is set in the visible cells — confirm how
# rand_graph draws randomness if the results need to be reproducible.
rg2 = rand_graph(rus_graph.V, rus_graph.E)

In [9]:
%%time
# Matrix returned by Graph.laplace_matrix for the Russian-sized random graph
# (presumably its graph Laplacian — the method is defined outside this notebook).
a = Graph.laplace_matrix(rg2)

CPU times: user 10.3 s, sys: 7.37 s, total: 17.6 s
Wall time: 17.6 s


In [0]:
%%time
# Disabled step: it would overwrite `a` with eigenvalues(a).
# NOTE(review): this cell contains only commented-out code — consider removing it.
#a = eigenvalues(a)

In [12]:
%%time
# eig_fast is defined outside this notebook — presumably a faster eigenvalue
# routine than eigenvalues(); TODO confirm. The recorded run of this cell took
# about 14 minutes of wall time.
b = eig_fast(a)

CPU times: user 17min 5s, sys: 1min 11s, total: 18min 17s
Wall time: 13min 50s


In [0]:
# NOTE(review): read top to bottom, `a` is the matrix from
# Graph.laplace_matrix(rg2), yet the recorded output (5019) equals eng_graph.V.
# This cell was apparently run against a different `a` (stale kernel state);
# confirm which variable was meant to be sorted and measured here.
a.sort()
len(a)

5019

In [9]:
def wasserstein_distance1(a, b):
    """Wasserstein distance between the equal-length prefixes of ``a`` and ``b``.

    The longer sequence is cut down to the length of the shorter one, and the
    two prefixes are passed to ``scipy.stats.wasserstein_distance``.

    Args:
        a: First sequence of values (must support ``len`` and slicing).
        b: Second sequence of values (must support ``len`` and slicing).

    Returns:
        The distance reported by ``scipy.stats.wasserstein_distance``.
    """
    common = len(a) if len(a) < len(b) else len(b)
    head_a, head_b = a[:common], b[:common]
    return ss.wasserstein_distance(head_a, head_b)

def wasserstein_distance2(a, b):
    """Wasserstein distance after zero-padding the shorter sample.

    The shorter of the two sequences is extended with zeros until both have
    the same length, and the result is passed to
    ``scipy.stats.wasserstein_distance``. The argument order does not matter.

    Args:
        a: First sequence of values (list or NumPy array).
        b: Second sequence of values (list or NumPy array).

    Returns:
        The distance reported by ``scipy.stats.wasserstein_distance``.
    """
    if len(a) > len(b):
        a, b = b, a
    # list(a) forces real concatenation: with a NumPy array, ``a + [0] * k``
    # would be an element-wise (broadcast) addition, not padding.
    padded = list(a) + [0] * (len(b) - len(a))
    return ss.wasserstein_distance(padded, b)

def save(vector, string):
    """Write the printed form of ``vector`` to the file at path ``string``.

    The file is opened in text write mode (an existing file is overwritten)
    and receives exactly what ``print(vector)`` would emit, including the
    trailing newline.

    Args:
        vector: Object to write; its ``print`` representation is stored.
        string: Path of the output file.
    """
    # The with-block closes the handle even if printing `vector` raises.
    with open(string, 'w') as f:
        print(vector, file=f)

In [10]:
def compare(func):
    """Compare the spectra of the language graphs and their random counterparts.

    ``func`` is applied to the notebook globals ``eng_graph``, ``rus_graph``,
    ``rg1`` and ``rg2``. The English-sized matrices go through
    ``eigenvalues``, the Russian-sized ones through ``eig_fast``. Each
    spectrum is written to disk with ``save`` and the pairwise distances from
    ``wasserstein_distance1`` are printed.

    Args:
        func: Callable that takes a graph and returns the matrix whose
            eigenvalues are compared (e.g. ``Graph.laplace_matrix_weighted``).

    Side effects:
        Writes four files named ``<prefix>_eng_``, ``<prefix>_rus_``,
        ``<prefix>_rand1_`` and ``<prefix>_rand2_`` in the working directory,
        where ``<prefix>`` is derived from the name of ``func``.
    """
    import re

    # str(func) looks like "<function Graph.f at 0x7f...>": it contains
    # characters that are invalid in Windows file names and a memory address
    # that changes on every run. Use the qualified name instead, reduced to
    # file-name-safe characters.
    label = getattr(func, '__qualname__', None) or str(func)
    prefix = re.sub(r'[^\w.-]+', '_', label)

    # English-sized graphs. Each matrix is built only when it is needed, so
    # the four matrices are never held in memory at the same time.
    e1 = eigenvalues(func(eng_graph))
    e3 = eigenvalues(func(rg1))
    print('Distance between random1 and english', wasserstein_distance1(e1, e3))
    save(e1, prefix + '_eng_')
    save(e3, prefix + '_rand1_')

    # Russian-sized graphs. Results are saved right after each decomposition
    # so a later failure does not lose them.
    e2 = eig_fast(func(rus_graph))
    save(e2, prefix + '_rus_')
    e4 = eig_fast(func(rg2))
    save(e4, prefix + '_rand2_')

    print('Distance between russian and english', wasserstein_distance1(e1, e2))
    print('Distance between random2 and english', wasserstein_distance1(e1, e4))
    print('Distance between russian and random1', wasserstein_distance1(e2, e3))
    print('Distance between russian and random2', wasserstein_distance1(e2, e4))

In [13]:
# Run the full comparison using Graph.laplace_matrix_weighted as the matrix
# builder; compare() prints the distances and saves each spectrum to disk.
compare(Graph.laplace_matrix_weighted)

Distance between random1 and english 11.108850910047304






Distance between russian and english 1.533838109430488
Distance between random2 and english 5.195970187330805
Distance between russian and random1 12.63634464816933
Distance between russian and random2 9.883216223450647


In [16]:
# Baseline experiment: a random graph with the English graph's V and E and
# no explicit third argument to rand_graph (presumably its default edge
# weights — TODO confirm), compared with the English graph through the
# eigenvalues of their laplace_matrix_weighted() matrices.
rg_normal = rand_graph(eng_graph.V, eng_graph.E)
rg_normal_matr = rg_normal.laplace_matrix_weighted()
eng_matr = eng_graph.laplace_matrix_weighted()
e_rg_normal = eigenvalues(rg_normal_matr)
# e_eng_matr is reused by the later cells that vary the weight distribution.
e_eng_matr = eigenvalues(eng_matr)
# NOTE(review): the recorded output is exactly 0.0, which would mean the two
# samples are indistinguishable — verify that this is not an artefact.
print(ss.wasserstein_distance(e_rg_normal, e_eng_matr))

0.0


In [12]:
# Same comparison against the English spectrum (e_eng_matr comes from an
# earlier cell), with the third rand_graph argument returning one draw from an
# exponential distribution with scale 0.15 — presumably the per-edge weight
# generator; TODO confirm against rand_graph.
rg_exp = rand_graph(eng_graph.V, eng_graph.E, lambda: np.random.exponential(0.15, 1)[0])
rg_exp_matr = rg_exp.laplace_matrix_weighted()
e_rg_exp = eigenvalues(rg_exp_matr)
print(ss.wasserstein_distance(e_rg_exp, e_eng_matr))

2.3624077700243555


In [13]:
# Same comparison against the English spectrum (e_eng_matr comes from an
# earlier cell), with the third rand_graph argument returning one draw from a
# gamma distribution with shape 0.25 and scale 0.7 — presumably the per-edge
# weight generator; TODO confirm against rand_graph.
rg_gamma = rand_graph(eng_graph.V, eng_graph.E, lambda: np.random.gamma(0.25, 0.7, 1)[0])
rg_gamma_matr = rg_gamma.laplace_matrix_weighted()
e_rg_gamma = eigenvalues(rg_gamma_matr)
print(ss.wasserstein_distance(e_rg_gamma, e_eng_matr))

2.9223833679467206


In [14]:
# Collect the weight column (last comma-separated field of every line) from
# the Russian association file. The context manager closes the file once the
# lines are read; blank lines are skipped because they carry no weight and
# would make float() fail.
with open('assoc_net.txt', encoding='utf-8') as assoc_file:
    f = assoc_file.readlines()
nums = [float(line.split(',')[-1]) for line in f if line.strip()]

In [15]:
# Same comparison against the English spectrum (e_eng_matr comes from an
# earlier cell), with the third rand_graph argument picking an entry of
# `nums`, the weights read from a file in the preceding cell.
# NOTE(review): randint(0, len(nums) - 1) reaches the last entry only if the
# upper bound is inclusive (random.randint); with numpy.random.randint it is
# exclusive — confirm which randint the star import provides.
rg_mock = rand_graph(eng_graph.V, eng_graph.E, lambda: nums[randint(0, len(nums) - 1)])
rg_mock_matr = rg_mock.laplace_matrix_weighted()
# NOTE(review): the name e_mock_exp looks carried over from the exponential cell.
e_mock_exp = eigenvalues(rg_mock_matr)
print(ss.wasserstein_distance(e_mock_exp, e_eng_matr))

2.122559550340521


In [16]:
# Collect the weight column (last comma-separated field of every line) from
# the English association file, replacing the previous contents of `nums`.
# The context manager closes the file once the lines are read; blank lines
# are skipped because they carry no weight and would make float() fail.
with open('assoc_eng2.txt', encoding='utf-8') as assoc_file:
    f = assoc_file.readlines()
nums = [float(line.split(',')[-1]) for line in f if line.strip()]

In [10]:
# Repeat of the empirical-weights experiment, overwriting rg_mock,
# rg_mock_matr and e_mock_exp; here `nums` holds the weights read from
# assoc_eng2.txt in the preceding cell.
# NOTE(review): the recorded NameError shows this cell was last executed in a
# kernel where `nums` had not been defined; re-run the notebook top to bottom
# to refresh the output.
# NOTE(review): randint(0, len(nums) - 1) reaches the last entry only if the
# upper bound is inclusive (random.randint); with numpy.random.randint it is
# exclusive — confirm which randint the star import provides.
rg_mock = rand_graph(eng_graph.V, eng_graph.E, lambda: nums[randint(0, len(nums) - 1)])
rg_mock_matr = rg_mock.laplace_matrix_weighted()
e_mock_exp = eigenvalues(rg_mock_matr)
print(ss.wasserstein_distance(e_mock_exp, e_eng_matr))

NameError: name 'nums' is not defined