### Imports

In [149]:
import nltk
import numpy as np
import scipy
from scipy.sparse.linalg import svds
import math

### Preprocessing

In [150]:
articles_num = 2000

def separate_text_file():
    """ Separating text.txt file with many articles into separate articles"""
    with open("application/text.txt") as file:
        text = file.read()

        # every article is starting from "@@[number]" so it is good to split text using "@@"
        articles = text.split("@@")

        # to remove initial number in every article
        numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

        curr_article = 0

        for inx in range(1, len(articles)):
            if len(articles[inx]) > 972:    # to get exactly 2000 shortest articles from above 4000 in "text.txt"
                continue

            article_file = open("application/texts/" + str(curr_article) + ".txt", "w")

            # sometimes there is " @ @ @ @ " in text so here is removed
            articles[inx] = articles[inx].replace("@ ", "")

            # every article starts with number, this loop is to remove this number
            letter = 0
            while articles[inx][letter] in numbers:
                letter += 1

            # writing article to the file
            article_file.write(articles[inx][letter+1:])
            article_file.close()

            curr_article += 1


def make_dictionary_and_matrix():
    """ Making dictionary with all words in articles files in "texts" directory.
        Key is a word and value is its index.

        Then creating sparse term-by-document matrix.
        Matrix's row represents a word and column represents an article.
        matrix[i, j] = k means that there is k words having index i in dictionary in text "texts/j.txt"
    """

    # result dictionary with all words in all articles
    dictionary = {}

    # to have an index of next word in dictionary
    curr_word = 0

    # stop words to remove them from dictionary
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # object which will stem the words
    porter_stemmer = nltk.stem.porter.PorterStemmer()

    words_by_article = []
    for i in range(articles_num):
        with open("application/texts/" + str(i) + ".txt") as file:
            # article text from file
            text = file.read()

            # text split on words using nltk library
            word_tokens = nltk.tokenize.word_tokenize(text)

            # filtering words to remove these which are "stop words"
            filter_words = list(filter(lambda w: w not in stop_words, word_tokens))

            # stemming words
            stemmed_words = list(map(lambda w: porter_stemmer.stem(w), filter_words))

            # writing new words to dictionary
            for word in stemmed_words:
                if word not in dictionary:
                    dictionary[word] = curr_word
                    curr_word += 1

            # to write them later to the matrix
            words_by_article.append(stemmed_words)

    # making sparse matrix
    matrix = scipy.sparse.lil_matrix((len(dictionary), articles_num), dtype=float)

    # filling matrix
    for i in range(articles_num):
        for word in words_by_article[i]:
            matrix[dictionary[word], i] += 1

    # converting to csr matrix for faster operations
    matrix = scipy.sparse.csr_matrix(matrix)

    return matrix, dictionary


def multiply_by_inverse_document_frequency(matrix):
    """ Multiplying every row of matrix by its inverse document frequency. """

    matrix = scipy.sparse.csr_matrix.toarray(matrix)

    for word_inx in range(matrix.shape[0]):
        articles_with_word = len(matrix[word_inx].nonzero()[0])
        idf = math.log(articles_num / articles_with_word)
        matrix[word_inx] *= idf

    return scipy.sparse.csr_matrix(matrix)


def remove_noise(matrix, k):
    """ Removing noise by singular value decomposition and low rank approximation. """
    matrix = scipy.sparse.csc_matrix(matrix)
    u, s, vt = svds(matrix, k)

    # it will be numpy array
    new_matrix = u @ np.diag(s) @ vt

    return new_matrix


def get_matrix_and_dictionary(use_idf=False, reduce_noise=False, k=1):
    """ Getting term-by-document matrix for searches and dictionary with all words.

    Parameters:
    use_idf - whether multiply the matrix by inverse document frequency
    use_approximation - if reduce noise by singular value decomposition
                        and low rank approximation
    k - number of singular values for low rank approximation
    """

    matrix, dictionary = make_dictionary_and_matrix()

    if use_idf:
        matrix = multiply_by_inverse_document_frequency(matrix)
    else:
        matrix = scipy.sparse.csr_matrix.toarray(matrix)

    if reduce_noise:
        matrix = remove_noise(matrix, k)
    elif type(matrix) is not np.ndarray:
        matrix = scipy.sparse.csr_matrix.toarray(matrix)

    # normalization using numpy array
    for column in range(matrix.shape[1]):
        norm = np.linalg.norm(matrix[:, column])
        if norm != 0:
            matrix[:, column] /= norm

    # converting to sparse matrix for future operations
    matrix = scipy.sparse.csc_matrix(matrix)

    return matrix, dictionary

### Search functions

In [151]:
def get_text_vector(dictionary, text):
    """ Creating a bag-of-words for text to search it. """

    # stop words to remove them from the text to search
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # object which will stem the words
    porter_stemmer = nltk.stem.porter.PorterStemmer()

    # text split on words using nltk library
    word_tokens = nltk.tokenize.word_tokenize(text)

    # filtering words to remove these which are "stop words"
    filter_words = list(filter(lambda w: w not in stop_words, word_tokens))

    # stemming words
    stemmed_words = list(map(lambda w: porter_stemmer.stem(w), filter_words))

    # bag-of-words vector of the text to search
    text_vector = scipy.sparse.lil_matrix((len(dictionary), 1), dtype=float)

    # adding words occurrence to the vector
    for word in stemmed_words:
        if word in dictionary:
            text_vector[dictionary[word], 0] += 1

    # converting to csr matrix for faster operations
    text_vector = scipy.sparse.csc_matrix(text_vector)

    # normalization
    text_vector /= scipy.sparse.linalg.norm(text_vector)

    return text_vector


def search(dictionary, matrix, text, k):
    """ Searching word and returning k texts with the highest probability to contain that text. """

    # getting bag-of-words vector for the text to search
    text_vector = get_text_vector(dictionary, text)

    # getting correlation for every article
    correlation = []
    for i in range(matrix.shape[1]):
        word_correlation = (text_vector[:, 0].T @ matrix[:, i])[0, 0]

        correlation.append((word_correlation, i))

    # sorting to get the most suitable articles
    correlation.sort(reverse=True)

    # printing result
    for i in range(k):
        print(f"Article: texts/{correlation[i][1]}.txt, correlation {correlation[i][0]}")



### Tests

#### Making matrices

In [152]:
matrix_1, _ = get_matrix_and_dictionary(use_idf=False, reduce_noise=False)

In [153]:
matrix_2, _ = get_matrix_and_dictionary(use_idf=True, reduce_noise=False)

In [154]:
# it takes ~10s to run
matrix_3, _ = get_matrix_and_dictionary(use_idf=True, reduce_noise=True, k=50)

In [178]:
# it takes ~10s to run
matrix_4, _ = get_matrix_and_dictionary(use_idf=True, reduce_noise=True, k=250)

In [156]:
# it takes ~30s to run
matrix_5, dictionary = get_matrix_and_dictionary(use_idf=True, reduce_noise=True, k=750)

#### First text to search
Below is a text from file "texts/1.txt".

In [176]:
text = "becomes King of Assyria"

In [158]:
search(dictionary, matrix_1, text, 5)

Article: texts/1.txt, correlation 0.35043832202523123
Article: texts/20.txt, correlation 0.3198010745334157
Article: texts/58.txt, correlation 0.18461625854057045
Article: texts/465.txt, correlation 0.1825741858350554
Article: texts/22.txt, correlation 0.1404878717372541


In [159]:
search(dictionary, matrix_2, text, 5)

Article: texts/1.txt, correlation 0.42601397501526733
Article: texts/20.txt, correlation 0.2923672825251265
Article: texts/58.txt, correlation 0.28484343132098255
Article: texts/465.txt, correlation 0.14441769309426203
Article: texts/22.txt, correlation 0.13167807681306165


In [179]:
search(dictionary, matrix_3, text, 5)

Article: texts/20.txt, correlation 0.19637239726209893
Article: texts/58.txt, correlation 0.19509811414321251
Article: texts/1487.txt, correlation 0.18868143146140798
Article: texts/604.txt, correlation 0.18784921718156544
Article: texts/581.txt, correlation 0.18784548114234678


In [180]:
search(dictionary, matrix_4, text, 5)

Article: texts/1.txt, correlation 0.3798343225071507
Article: texts/20.txt, correlation 0.3670825688422151
Article: texts/22.txt, correlation 0.31005959610080097
Article: texts/465.txt, correlation 0.2914002772705761
Article: texts/58.txt, correlation 0.2764393679409319


In [162]:
search(dictionary, matrix_5, text, 5)

Article: texts/1.txt, correlation 0.4176139102982707
Article: texts/20.txt, correlation 0.3079106724704089
Article: texts/465.txt, correlation 0.3060938859713124
Article: texts/58.txt, correlation 0.28335615105823503
Article: texts/793.txt, correlation 0.1542262083612811


#### Second text to search
Below is a text from file "texts/0.txt".

In [163]:
text = "higher values mean"

In [164]:
search(dictionary, matrix_1, text, 5)

Article: texts/0.txt, correlation 0.09141414530040079
Article: texts/1885.txt, correlation 0.0912870929175277
Article: texts/187.txt, correlation 0.08006407690254358
Article: texts/602.txt, correlation 0.06666666666666667
Article: texts/571.txt, correlation 0.06225728063646904


In [165]:
search(dictionary, matrix_2, text, 5)

Article: texts/1885.txt, correlation 0.10626668533334137
Article: texts/0.txt, correlation 0.10244915526154882
Article: texts/187.txt, correlation 0.08289186061947922
Article: texts/602.txt, correlation 0.06822502392658282
Article: texts/62.txt, correlation 0.06652606205783809


In [166]:
search(dictionary, matrix_3, text, 5)

Article: texts/0.txt, correlation 0.029587940330816215
Article: texts/220.txt, correlation 0.02653844362333795
Article: texts/713.txt, correlation 0.024265018356019186
Article: texts/452.txt, correlation 0.02388917125647607
Article: texts/704.txt, correlation 0.02326578412824576


In [167]:
search(dictionary, matrix_4, text, 5)

Article: texts/0.txt, correlation 0.10333551373910102
Article: texts/305.txt, correlation 0.047408418377526435
Article: texts/742.txt, correlation 0.0429915991243701
Article: texts/575.txt, correlation 0.04027561526219685
Article: texts/550.txt, correlation 0.039880665861355906


In [168]:
search(dictionary, matrix_5, text, 5)

Article: texts/0.txt, correlation 0.10291575489625168
Article: texts/1563.txt, correlation 0.06441609935990149
Article: texts/187.txt, correlation 0.06257364968453359
Article: texts/1455.txt, correlation 0.059649529580734204
Article: texts/62.txt, correlation 0.05778488748151628


#### Third text to search
Below is also a text from file "texts/0.txt".

In [169]:
text = "other factors are the same"

In [170]:
search(dictionary, matrix_1, text, 5)

Article: texts/212.txt, correlation 0.06900655593423542
Article: texts/186.txt, correlation 0.06551217820804184
Article: texts/0.txt, correlation 0.05277798139692595
Article: texts/459.txt, correlation 0.04612656040144425
Article: texts/1091.txt, correlation 0.04339630366027462


In [171]:
search(dictionary, matrix_2, text, 5)

Article: texts/212.txt, correlation 0.12180166260497147
Article: texts/186.txt, correlation 0.11181890537025016
Article: texts/1091.txt, correlation 0.08867986475618805
Article: texts/459.txt, correlation 0.08352825451532035
Article: texts/0.txt, correlation 0.06217175333778234


In [172]:
search(dictionary, matrix_3, text, 5)

Article: texts/427.txt, correlation 0.04322494531915992
Article: texts/1509.txt, correlation 0.01790222565458324
Article: texts/757.txt, correlation 0.015465973585733666
Article: texts/0.txt, correlation 0.01482603454055907
Article: texts/901.txt, correlation 0.01455420094866487


In [173]:
search(dictionary, matrix_4, text, 5)

Article: texts/0.txt, correlation 0.061466874995638514
Article: texts/459.txt, correlation 0.045212107250326365
Article: texts/427.txt, correlation 0.0441921417571892
Article: texts/1091.txt, correlation 0.04191379035935991
Article: texts/1016.txt, correlation 0.03719579183736633


In [174]:
search(dictionary, matrix_5, text, 5)

Article: texts/186.txt, correlation 0.10352905236651629
Article: texts/459.txt, correlation 0.08175068707509976
Article: texts/1091.txt, correlation 0.07899041060576374
Article: texts/212.txt, correlation 0.07319548708601338
Article: texts/0.txt, correlation 0.062318030717708646
