# Similarity Functions

In [1]:
from shingle import *
from math import ceil, floor
import numpy as np

In [2]:
# Initialize counts
frequencies = {}
text = open("data/portuguese/two_ends.txt", "r+")
for line in text:
    word = line.strip().split(' ')
    frequencies[word[0]] = float(word[1])   

## TF - IDF

In [7]:
def tf(intersection, document):
    tf = [document.count(word) for word in intersection]
    return np.array(tf)

In [8]:
def idf(intersection, document, N):
    idf = np.array([frequencies[word] for word in intersection])
    idf = np.log10(np.divide(N + 1, idf + 0.5))
    return idf

In [9]:
def tf_idf(query, document, N):
    intersection = [word for word in document if word in query] # intersection
    score = np.dot(tf(intersection, document), idf(intersection, document, N))
    return score

In [11]:
query = two_ends("pizza", 2)
document = two_ends("pizza", 2)
tf_idf(query, document, 50000)

13.615376041936951

## BM25

In [18]:
def bm25_tf(intersection, query, document, k1, b, avgdl, N):
    tf_ = tf(intersection, document)
    numerator = tf_ * (k1 + 1.0)
    denominator = tf_ + k1 * (1.0 - b + b * (len(query) / avgdl))
    bm25_tf = np.divide(numerator, denominator)
    return bm25_tf

In [19]:
def bm25(query, document, k1 = 1.2, b = 0.75, avgdl = 8.3, N = 50000):
    intersection = [word for word in document if word in query] # intersection
    score = np.dot(bm25_tf(intersection, query, document, k1, b, avgdl, N), idf(intersection, document, N))
    return score

In [24]:
query = two_ends("pizza", 2)
document = two_ends("pizza", 2)
bm25(query, document)

15.356193114624382

## Dirichlet

In [32]:
shingles = 470751
def smooth(intersection, document, mu):
    smooth = []
    for word in intersection:
        prob = 1.0 + np.divide(document.count(word), mu * frequencies[word] / shingles)
        smooth.append(np.log10(prob))
    smooth = np.array(smooth)
    return smooth

In [40]:
def dirichlet(query, document, mu = 100.0):
    intersection = [word for word in document if word in query] # intersection
    add = len(query) * np.log10(np.divide(mu, mu + len(document)))
    score = np.dot(tf(intersection, query), smooth(intersection , document, mu)) + add
    return score

In [41]:
query = two_ends("pizzzza", 2)
document = two_ends("pizzza", 2)
print(dirichlet(query, document))

11.15631567880404
