In [None]:
# This is a micro example of two concepts:
# 1. The "Universal Unicode" that catalogs concepts beyond language
# 2. Spacial Information Storage of research papers based on their keywords

In [24]:
# "Universal Unicode"
# The function for combining concepts needs to be both: invertible and bijective
# Therefore prime decomposition seems like a viable option

# HIDDEN ID <---> CONCEPT
# HIDDEN ID <---> ID via prime indexing
# thus ID <---> CONCEPT

# Also HIDDEN ID <---> (ID + ID) via prime decomposition
# thus CONCEPT <---> (ID + ID)

def getPrimesList(n):
    out = list()
    sieve = [True] * (n+1)
    for p in range(2, n+1):
        if (sieve[p]):
            out.append(p)
            for i in range(p, n+1, p):
                sieve[i] = False
    return out
prime_list = getPrimesList(1000000)

class concept():
    def __init__(self, ID):
        self.ID = ID
        self.hiddenID = prime_list[ID]
    def __add__(self, other):
        newID = self.hiddenID * other.hiddenID 
        # newID = prime_list.index(newHiddenID)
        return concept(newID)
    def __sub__(self, other):
        newHiddenID = self.ID/other.hiddenID
        newID = prime_list.index(newHiddenID)
        return concept(newID)
    def __str__(self):
        return "{0}".format(self.ID)
        

# Fundamental concept
monarch = concept(1)
man = concept(2)
woman = concept(3)
child = concept(4)

king = man + monarch
queen = woman + monarch
son = man + child
daughter = woman + child
print("Child has ID: ", child)
print("Son has ID: ", son)
print("King has ID: ", king)

prince = king + son
print("Prince has ID: ", prince)

print("Prince minus king = ", prince - king)
print("Prince minus king minus man = ", prince - king - man)


Child has ID:  4
Son has ID:  55
King has ID:  15
Prince has ID:  13939
Prince minus king =  55
Prince minus king minus man =  4


In [52]:
# Articles can be defined as a vector of eywords and occurances of each keyword
# Thus, the distance between articles can be calculated with Universal Unicode
# Euclidean distance would not be accurate so an alternate distance is used

# Suppose we have an article about Princes
# The article's keyword vector is as follows
import numpy as np

article1 = {"king": 10, "man": 5, "son": 2}
occurances1 = [10, 5, 2]
article2 = {"king": 3, "man": 10, "son": 2}
occurances2 = [3, 10, 2]
keywords = [king, man, son]

# We use each keyword as a dimension. We use each occurance value as a weight
def getFactors(n):
    factors = []
    i = 2
    n0 = n
    while i * i <= n:
        if n % i:
            i += 1  
        else:
            factors = factors + [n//i]
            n //= i
    if len(factors) < 1:
        factors = [n0]
    return factors

def getDistance(occurances1, occurances2, keywords):
    id_factors = []
    hidden_factors = []
    for i in keywords:
        id_factors = id_factors + [getFactors(i.ID)]
        hidden_factors = hidden_factors + [getFactors(i.hiddenID)]
    relation_matrix = np.zeros([len(keywords), len(keywords)])
    print(id_factors)
    print(hidden_factors)
    for i in range(len(id_factors)):
        for j in range(len(id_factors)):
            relation_matrix[i][j] = len(list(set(id_factors[i]).intersection(hidden_factors[j])))
    print(relation_matrix)
    vector1 = np.multiply(occurances1, relation_matrix)
    vector2 = np.multiply(occurances2, relation_matrix)
    dist = np.linalg.norm(vector1 - vector2)
    return dist
    
print("Distance:", getDistance(occurances1, occurances2, keywords))

article1 = {"king": 10, "man": 5, "son": 2}
occurances1 = [10, 5, 2]
article2 = {"king": 15, "man": 8, "son": 10}
occurances2 = [15, 9, 10]
keywords = [king, man, son]

print("Distance:", getDistance(occurances1, occurances2, keywords))

[[5], [2], [11]]
[[53], [5], [263]]
[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Distance: 5.0
[[5], [2], [11]]
[[53], [5], [263]]
[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Distance: 4.0
