### Imports

In [41]:
import numpy as np
from scipy.sparse.linalg import svds
import math

### Functions

In [42]:
documents = 1100


def make_dict():
    """ Making dictionary with all words in texts files in "texts" directory. 
        Key is a word and value is its index.
    """
    
    dictionary = {}
    curr_word = 0
    for i in range(documents):
        with open("texts/" + str(i) + ".txt") as file:
            text_words = file.read().split()
            for word in text_words:
                if word not in dictionary:
                    dictionary[word] = curr_word
                    curr_word += 1

    return dictionary


def make_term_by_document_matrix(dictionary):
    """ Creating sparse term-by-document matrix.
    Matrix's row represents a word and column represents a text.
    matrix[i, j] = k means that there is k words having index i in dictionary in text "texts/j.txt"
    """
    
    matrix = np.zeros((len(dictionary), documents), dtype=float)

    for i in range(documents):
        with open("texts/" + str(i) + ".txt") as file:
            text_words = file.read().split()
            for word in text_words:
                matrix[dictionary[word], i] += 1.0

    return matrix


def multiply_by_inverse_document_frequency(matrix):
    """ Multiplying every row of matrix by its inverse document frequency. """
    
    for word_inx in range(matrix.shape[0]):
        documents_with_word = len(matrix[word_inx].nonzero()[0])
        idf = math.log(documents / documents_with_word)
        matrix[word_inx] *= idf


def remove_noise(matrix, k):
    """ Removing noise by singular value decomposition and low rank approximation. """
    
    u, s, vt = svds(matrix, k)

    new_matrix = u @ np.diag(s) @ vt

    return new_matrix


def get_text_vector(text, dictionary, rows, normalize=False):
    """ Creating a bag-of-words for text to search it. """
    
    text_vector = np.zeros((rows, 1))
    for word in text.split():
        text_vector[dictionary[word]] += 1

    if normalize:
        text_vector /= np.linalg.norm(text_vector)

    return text_vector


def search(dictionary, matrix, text, k, normalize=False):
    """ Searching word and returning k texts with the highest probability to contain that text. """

    text_vector = get_text_vector(text, dictionary, matrix.shape[0], normalize)

    if normalize:
        for row in range(matrix.shape[0]):
            norm = np.linalg.norm(matrix[row])
            if norm != 0:
                matrix[row] /= norm

    likelihood = []
    for i in range(matrix.shape[1]):
        word_likelihood = (text_vector.T @ matrix[:, i].reshape(-1, 1))[0, 0] / \
                          (np.linalg.norm(text_vector) * np.linalg.norm(matrix[:, i]))
        likelihood.append((word_likelihood, i))

    likelihood.sort(reverse=True)

    for i in range(k):
        print(f"Text {likelihood[i][1]}, correlation {likelihood[i][0]}")
        

def get_matrix(dictionary, use_idf=False, use_approximation=False, k=1):
    """ Getting term-by-document matrix for searches.
    
    Parameters:
    dictionary - dictionary where key is the word and value is word's index
    use_idf - whether multiply the matrix by inverse document frequency
    use_approximation - if reduce noise by singular value decomposition 
                        and low rank approximation
    k - number of singular values for low rank approximation
    """
    
    matrix = make_term_by_document_matrix(dictionary)

    if use_idf:
        multiply_by_inverse_document_frequency(matrix)

    if use_approximation:
        matrix = remove_noise(matrix, k)

    return matrix

### Tests

#### Making matrices

In [43]:
dictionary = make_dict()

In [44]:
matrix_1 = get_matrix(dictionary, use_idf=False, use_approximation=False)

In [45]:
matrix_2 = get_matrix(dictionary, use_idf=True, use_approximation=False)

In [47]:
matrix_3 = get_matrix(dictionary, use_idf=True, use_approximation=True, k=5)

In [48]:
# it takes ~15s to run
matrix_4 = get_matrix(dictionary, use_idf=True, use_approximation=True, k=100) 

In [82]:
# it takes ~100s to run
matrix_5 = get_matrix(dictionary, use_idf=True, use_approximation=True, k=1050)

#### First text to search
Below is a text from file "texts/1.txt".
It is specific so it is very improbable that any other text contains all these words

In [83]:
text = "Eirene or Irene was an ancient Greek artist described by Pliny the Elder in the 1st century . She was the daughter of a painter"

In [84]:
search(dictionary, matrix_1, text, 5)

Text 995, correlation 0.5690413989424618
Text 1, correlation 0.5677270907634907
Text 728, correlation 0.5530083017624435
Text 592, correlation 0.5421900064192553
Text 298, correlation 0.5413001831392188


In [85]:
search(dictionary, matrix_2, text, 5, True)

Text 1, correlation 0.22208582116215791
Text 695, correlation 0.04430720162672154
Text 520, correlation 0.04336838559555177
Text 322, correlation 0.04113319840960251
Text 760, correlation 0.0362994822931513


In [86]:
search(dictionary, matrix_3, text, 5, True)

Text 808, correlation 0.036249759229291634
Text 197, correlation 0.03613749946173404
Text 150, correlation 0.035986541614283156
Text 543, correlation 0.035721139110840974
Text 371, correlation 0.035593781123569444


In [87]:
search(dictionary, matrix_4, text, 5, True)

Text 1, correlation 0.04980628063417669
Text 874, correlation 0.04671997475337966
Text 427, correlation 0.04347952401523341
Text 117, correlation 0.04342945126128477
Text 904, correlation 0.04301006511715169


In [88]:
search(dictionary, matrix_5, text, 5, True)

Text 1, correlation 0.22205057575099654
Text 695, correlation 0.04414862971767928
Text 520, correlation 0.04328142791459883
Text 322, correlation 0.041018391487350925
Text 760, correlation 0.03635989848520005


#### Second text to search
Below is a text from file "texts/3.txt".
This one contains words which should be common in many texts

In [89]:
text = "and only eight of them are"

In [90]:
search(dictionary, matrix_1, text, 5)

Text 3, correlation 0.38214604368683663
Text 809, correlation 0.37523938719322825
Text 1029, correlation 0.3546040716334876
Text 381, correlation 0.3306121993780585
Text 489, correlation 0.3215206485222378


In [91]:
search(dictionary, matrix_2, text, 5)

Text 3, correlation 0.0752213879854701
Text 950, correlation 0.047897772078110176
Text 473, correlation 0.04378901190067551
Text 381, correlation 0.04163674623547035
Text 556, correlation 0.0400686806019348


In [92]:
search(dictionary, matrix_3, text, 5)

Text 285, correlation 0.020508096788457473
Text 66, correlation 0.02050670627456389
Text 769, correlation 0.020396095511925207
Text 141, correlation 0.020394636664360186
Text 71, correlation 0.02039281599190947


In [93]:
search(dictionary, matrix_4, text, 5)

Text 699, correlation 0.03521951518907277
Text 142, correlation 0.03324513785216377
Text 161, correlation 0.03273482584203671
Text 48, correlation 0.030669424469942863
Text 986, correlation 0.030663423628029537


In [94]:
search(dictionary, matrix_5, text, 5, True)

Text 3, correlation 0.07569354709858046
Text 950, correlation 0.04782960282190271
Text 473, correlation 0.04385518553185537
Text 381, correlation 0.041685742563233735
Text 773, correlation 0.0389129040321094
