In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Build the dataset location from a relative prefix and the V9GAMMA_PATH constant.
# NOTE(review): get_relative_path and V9GAMMA_PATH are presumably supplied by the
# star import from somhos.config.paths — confirm.
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Sanity check shown as the cell output: should be True before any data is loaded.
os.path.exists(data_path)

True

Load samples

In [3]:
# Load sample A (its entries are used below as keys into doc_directory)
docs_sample_a = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX))
# Load sample B (its entries are used below as keys into doc_directory)
docs_sample_b = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX))

# Report how many entries each sample holds.
print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (5000, 5000)


Load corpus

In [4]:
# Mapping from a sample entry (document id) to the position used to index the corpora.
doc_directory = load_pickle(get_relative_path(data_path, DOC_DIRECTORY))
# Corpus - tokens
corpus_bag_of_words = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_WORDS))
corpus_tokens = load_pickle(get_relative_path(data_path, CORPUS_TOKENS))
dictionary_tokens = load_pickle(get_relative_path(data_path, DICTIONARY_TOKENS))
# Corpus - keyphrases
corpus_bag_of_keyphrases = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_KEYPHRASES))
corpus_keyphrases = load_pickle(get_relative_path(data_path, CORPUS_KEYPHRASES))
dictionary_keyphrases = load_pickle(get_relative_path(data_path, DICTIONARY_KEYPHRASES))

# Smoke test: the first document of each sample must resolve through doc_directory.
print(len(doc_directory))
print("Test:", doc_directory[docs_sample_a[0]])
print("Test:", doc_directory[docs_sample_b[0]])

# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_a[0]]])
# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_b[0]]])
# Number of documents and dictionary size, for tokens and then for keyphrases.
print(len(corpus_bag_of_words), len(dictionary_tokens))
print(len(corpus_bag_of_keyphrases), len(dictionary_keyphrases))

10000
Test: 2111
Test: 483
10000 62388
10000 107954


In [5]:
# Split each full corpus into per-sample lists: every sampled document id is resolved
# through doc_directory to the position of its bag-of-words / bag-of-keyphrases entry.
corpus_a_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_b]
corpus_a_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_b]

# Each of the four lists should have one entry per sampled document.
print(len(corpus_a_tokens), len(corpus_b_tokens), len(corpus_a_keyphrases), len(corpus_b_keyphrases))

5000 5000 5000 5000


In [6]:
import numpy as np
from gensim import models
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
from scipy.spatial.distance import cosine as cosine_distance
import logging
# Emit INFO-level log records as "<time> : <level> : <message>"; the TF-IDF model
# construction below reports its progress through these records.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## TFIDF

In [7]:
# Fit one TF-IDF model per representation on the complete corpora (not on the
# per-sample subsets), so document frequencies cover every loaded document.
model_tf_idf_tokens = models.TfidfModel(corpus_bag_of_words)
model_tf_idf_keyphrases = models.TfidfModel(corpus_bag_of_keyphrases)

2018-09-21 16:17:21,493 : INFO : collecting document frequencies
2018-09-21 16:17:21,496 : INFO : PROGRESS: processing document #0
2018-09-21 16:17:21,833 : INFO : calculating IDF weights for 10000 documents and 62387 features (988950 matrix non-zeros)
2018-09-21 16:17:22,025 : INFO : collecting document frequencies
2018-09-21 16:17:22,026 : INFO : PROGRESS: processing document #0
2018-09-21 16:17:22,128 : INFO : calculating IDF weights for 10000 documents and 107953 features (232171 matrix non-zeros)


Example

In [13]:
def difference(index_doc1, index_doc2, corpus):
    """Return the items that appear in only one of two documents.

    Args:
        index_doc1: Position of the first document in ``corpus``.
        index_doc2: Position of the second document in ``corpus``.
        corpus: Indexable collection whose entries are iterables of hashable items.

    Returns:
        A pair of sets ``(only_in_doc1, only_in_doc2)``.
    """
    items1, items2 = set(corpus[index_doc1]), set(corpus[index_doc2])
    # Keeping a document's share of the symmetric difference is the same as
    # removing the other document's items from it.
    return items1 - items2, items2 - items1

def filter_bow(index_doc, types_doc, bow):
    """Keep the entries of one bag-of-words document whose key is in ``types_doc``.

    Args:
        index_doc: Position of the document in ``bow``.
        types_doc: Container of accepted keys (membership is tested with ``in``).
        bow: Indexable collection of documents; each document is an iterable of
            entries whose first element is the key.

    Returns:
        A list with the accepted entries, in their original order.
    """
    return [entry for entry in bow[index_doc] if entry[0] in types_doc]

def difference_bow(index_doc1, index_doc2, corpus, bow):
    """Bag-of-words entries restricted to the items unique to each document.

    Args:
        index_doc1: Position of the first document in ``corpus`` and ``bow``.
        index_doc2: Position of the second document in ``corpus`` and ``bow``.
        corpus: Collection passed to ``difference`` to find the unique items.
        bow: Collection passed to ``filter_bow`` to pick the matching entries.

    Returns:
        A pair ``(doc1_entries, doc2_entries)`` of filtered entry lists.
    """
    only_in_doc1, only_in_doc2 = difference(index_doc1, index_doc2, corpus)
    return (
        filter_bow(index_doc1, only_in_doc1, bow),
        filter_bow(index_doc2, only_in_doc2, bow),
    )

def intersection(index_doc1, index_doc2, corpus):
    """Return the items shared by two documents, once per document.

    Args:
        index_doc1: Position of the first document in ``corpus``.
        index_doc2: Position of the second document in ``corpus``.
        corpus: Indexable collection whose entries are iterables of hashable items.

    Returns:
        A pair of equal but independent sets holding the common items.
    """
    items1 = set(corpus[index_doc1])
    items2 = set(corpus[index_doc2])
    shared = items1 & items2
    # The second element is a separate copy so callers can mutate one result
    # without affecting the other.
    return shared, set(shared)

def intersection_bow(index_doc1, index_doc2, corpus, bow):
    """Bag-of-words entries restricted to the items both documents share.

    Args:
        index_doc1: Position of the first document in ``corpus`` and ``bow``.
        index_doc2: Position of the second document in ``corpus`` and ``bow``.
        corpus: Collection passed to ``intersection`` to find the shared items.
        bow: Collection passed to ``filter_bow`` to pick the matching entries.

    Returns:
        A pair ``(doc1_entries, doc2_entries)`` of filtered entry lists.
    """
    shared_in_doc1, shared_in_doc2 = intersection(index_doc1, index_doc2, corpus)
    return (
        filter_bow(index_doc1, shared_in_doc1, bow),
        filter_bow(index_doc2, shared_in_doc2, bow),
    )

# types_a_diff, types_b_diff = difference(1952, 6674, corpus_keyphrases)
# print(types_a_diff)
# print(types_b_diff)
# print(filter_bow(1952, types_a_diff, corpus_bag_of_keyphrases))
# print("++++++++++++++")



def dnorm(dvec):
    """Euclidean norm of a sparse vector.

    Args:
        dvec: Iterable of entries whose second element is the numeric weight.

    Returns:
        The square root (via ``np.sqrt``) of the sum of squared weights.
    """
    squared_total = 0
    for entry in dvec:
        squared_total += entry[1] ** 2
    return np.sqrt(squared_total)

def ddot(dvec1, dvec2):
    """Dot product of two sparse vectors.

    Args:
        dvec1: Anything ``dict()`` accepts, mapping key -> weight.
        dvec2: Anything ``dict()`` accepts, mapping key -> weight.

    Returns:
        The sum over the keys of ``dvec1`` of the product of both weights;
        keys missing from ``dvec2`` contribute with weight ``0.0``.
    """
    weights1 = dict(dvec1)
    weights2 = dict(dvec2)
    total = 0
    for key, weight in weights1.items():
        total += weight * weights2.get(key, 0.0)
    return total

def dcosine(dvec1, dvec2):
    """Cosine similarity between two sparse vectors.

    Args:
        dvec1: Sparse vector accepted by ``dnorm`` and ``ddot``.
        dvec2: Sparse vector accepted by ``dnorm`` and ``ddot``.

    Returns:
        ``ddot(dvec1, dvec2)`` divided by the product of both norms, or ``0.0``
        when either vector has zero norm (for example, an empty vector).
    """
    norm_product = dnorm(dvec1) * dnorm(dvec2)
    # A zero-norm vector has no direction. Dividing by the zero product would
    # produce nan (with a NumPy RuntimeWarning), so report "no similarity".
    if norm_product == 0:
        return 0.0
    return ddot(dvec1, dvec2) / norm_product

# Example on the documents at positions 1000 and 3001 of the keyphrase corpus.
# NOTE(review): the two positions look like arbitrary picks — confirm.
# Cosine over the keyphrases unique to each document: the filtered bags have no
# key in common, so the dot product has no matching terms (0.0 in the recorded run).
bow_a, bow_b = difference_bow(1000, 3001, corpus_keyphrases, corpus_bag_of_keyphrases)
tfidf_a = model_tf_idf_keyphrases[bow_a]
tfidf_b = model_tf_idf_keyphrases[bow_b]
print(dcosine(tfidf_a, tfidf_b))

# Same comparison restricted to the keyphrases both documents share
# (1.0 in the recorded run). tfidf_a / tfidf_b are reused for the new vectors.
intersection_a, intersection_b = intersection_bow(1000, 3001, corpus_keyphrases, corpus_bag_of_keyphrases)
tfidf_a = model_tf_idf_keyphrases[intersection_a]
tfidf_b = model_tf_idf_keyphrases[intersection_b]
print(dcosine(tfidf_a, tfidf_b))

# print(dcosine(model_tf_idf_keyphrases[intersection_a], model_tf_idf_keyphrases[intersection_b]))

0.0
1.0


Index similarities