In [8]:
import numpy as np
import seaborn as sns
import pandas as pd

# Toy corpus of five short texts, kept in document order.
documents = [
    "Romeo and Juliet.",
    "Juliet: O happy dagger!",
    "Romeo died by dagger.",
    "\"Live freee or die\", that's the New-Hampshire's motto.",
    "Did you know, New-Hampshire is in New-England.",
]


def get_with_k(A, k):
    """Return the rank-k truncated singular value decomposition of ``A``.

    Parameters
    ----------
    A : array_like, shape (m, n)
        Matrix to decompose (anything ``np.linalg.svd`` accepts).
    k : int
        Number of leading singular triplets to keep. If ``k`` exceeds the
        number of singular values ``min(m, n)``, all of them are returned.

    Returns
    -------
    u_k : ndarray, shape (m, k)
        Leading left singular vectors as columns.
    S_k : ndarray, shape (k, k)
        Diagonal matrix holding the ``k`` largest singular values.
    vh_k : ndarray, shape (k, n)
        Leading right singular vectors as rows.
    """
    # The reduced SVD is sufficient: the additional columns of a full ``u``
    # belong to no singular value. Using it keeps the three factors shape-
    # compatible even when k is larger than the number of singular values
    # and avoids building an (m, m) matrix for tall inputs.
    u, s, vh = np.linalg.svd(A, full_matrices=False)
    return u[:, :k], np.diag(s[:k]), vh[:k, :]

def run_query(A, k, query_words, scaling=True, euclidean=False, look_for_documents=True):
    """Score every document (or every word) of ``A`` against a query.

    The query vector is built by ``make_query_from_words`` and compared with
    the rows of the rank-``k`` document (or word) representation of ``A``.

    Parameters
    ----------
    A : array_like
        Matrix whose rows are treated as words and whose columns as documents.
    k : int
        Number of singular triplets kept by ``get_with_k``.
    query_words : sequence
        One flag per word row; truthy entries select the words of the query.
    scaling : bool
        If true, word and document coordinates are multiplied by the singular
        values. The query itself is always built from scaled word vectors.
    euclidean : bool
        If true, return Euclidean distances (smaller means closer); otherwise
        return cosine similarities (larger means closer).
    look_for_documents : bool
        Compare against documents if true, against words otherwise.

    Returns
    -------
    list
        One score per document (or word), in row order.
    """
    query = make_query_from_words(A, k, query_words)
    u, S, vh = get_with_k(A, k)

    if scaling:
        # weight each latent dimension by its singular value
        word_matrices = np.matmul(u, S)
        document_matrices = np.matmul(S, vh).T
    else:
        word_matrices = u
        document_matrices = vh.T

    candidates = document_matrices if look_for_documents else word_matrices

    def score(candidate):
        """Distance or cosine similarity between one candidate row and the query."""
        if euclidean:
            return np.linalg.norm(candidate - query)
        dot_product = np.matmul(candidate, query)
        return dot_product / (np.linalg.norm(candidate) * np.linalg.norm(query))

    return [score(candidates[row]) for row in range(len(candidates))]

def make_query_from_words(matrix, k, words):
    """Build a query vector as the mean of the selected words' scaled vectors.

    Parameters
    ----------
    matrix : array_like
        Matrix passed on to ``get_with_k``; its rows are the words.
    k : int
        Number of singular triplets to keep; also the length of the result.
    words : sequence
        One flag per word row; every truthy entry adds that word's vector.

    Returns
    -------
    ndarray, shape (k,)
        Average of the selected rows of ``u @ S``. If no entry of ``words``
        is truthy the final division is by zero and the result is all NaN.
    """
    u, S, _ = get_with_k(matrix, k)
    # word coordinates scaled by the singular values
    scaled_word_vectors = np.matmul(u, S)

    accumulated = np.zeros((k))
    n_selected = 0
    for position, flag in enumerate(words):
        if not flag:
            continue
        accumulated += scaled_word_vectors[position]
        n_selected += 1
    return accumulated / n_selected

# Word-by-document incidence matrix for the five toy documents:
# one row per word, one column per document, 1 = word occurs in the document.
# NOTE(review): the row labels below are inferred from `documents` - confirm.
A_local = [
    [1, 0, 1, 0, 0],  # romeo
    [1, 1, 0, 0, 0],  # juliet
    [0, 1, 0, 0, 0],  # happy
    [0, 1, 1, 0, 0],  # dagger
    [0, 0, 0, 1, 0],  # live
    [0, 0, 1, 1, 0],  # die
    [0, 0, 0, 1, 0],  # free
    [0, 0, 0, 1, 1],  # new-hampshire
]
    

In [9]:
# i)
# Reconstruct the old example with a rank-2 approximation.
k_local = 2
u, S, vh = get_with_k(A_local, k_local)
for factor in (u, S, vh):
    print(factor)

# Coordinates of words and documents, scaled by the singular values.
word_matrices = u @ S
print("word_matrices", word_matrices)
document_matrices = (S @ vh).T
print("document_matrices", document_matrices)

# The query selects word rows 3 and 5 of A_local.
query_words = [0, 0, 0, 1, 0, 1, 0, 0]
print("Using query q", query_words)
# With its default arguments run_query returns cosine similarities,
# so larger values mean a better match.
results = run_query(A_local, k_local, query_words)
print("the resulting distances to the documents from the query correspond to:\n", results)

[[-0.39615277  0.28005737]
 [-0.31426806  0.44953214]
 [-0.17823952  0.26899154]
 [-0.43836375  0.36850831]
 [-0.26388058 -0.34592143]
 [-0.52400482 -0.24640466]
 [-0.26388058 -0.34592143]
 [-0.32637322 -0.45966878]]
[[2.28529793 0.        ]
 [0.         2.01025824]]
[[-0.31086574 -0.40733041 -0.59446137 -0.60304575 -0.1428143 ]
 [ 0.36293322  0.54074246  0.20005441 -0.6953914  -0.22866156]]
word_matrices [[-0.90532712  0.56298763]
 [-0.71819615  0.90367568]
 [-0.40733041  0.54074246]
 [-1.00179178  0.74079687]
 [-0.60304575 -0.6953914 ]
 [-1.19750713 -0.49533699]
 [-0.60304575 -0.6953914 ]
 [-0.74586005 -0.92405295]]
document_matrices [[-0.71042084  0.7295895 ]
 [-0.93087134  1.08703198]
 [-1.35852135  0.40216102]
 [-1.37813921 -1.39791629]
 [-0.32637322 -0.45966878]]
Using query q [0, 0, 0, 1, 0, 1, 0, 0]
the resulting distances to the documents from the query correspond to:
 [0.7727964887537564, 0.7306768205359736, 0.9844359912676066, 0.6187306127613201, 0.4849183185073809]


Just as in the example, I find that the third text, "Romeo died by dagger.", fits the query best, followed by the first two texts, which have similar values.

In [10]:
# ii) a)
# Modify k: repeat the same query for every rank from 2 to 5.

for k_a in (2, 3, 4, 5):
    results = run_query(A_local, k_a, query_words)
    print("going with k", k_a)
    print(results)

going with k 2
[0.7727964887537564, 0.7306768205359736, 0.9844359912676066, 0.6187306127613201, 0.4849183185073809]
going with k 3
[0.7790887632803176, 0.4962370290822853, 0.9206238960480264, 0.5742675138280299, 0.31817696222835556]
going with k 4
[0.2598311922704977, 0.5214034484031386, 0.9153977388771566, 0.5173749167851798, 0.14161281019977673]
going with k 5
[0.25757392536334167, 0.5208900313918557, 0.9129757621476984, 0.5099090479804408, 0.11282240858457236]


Larger values of k seem to confuse the system. At k >= 4, only the literal occurrences of "die" or "dagger" seem to remain good fits. I expect this behavior to be due to k describing the number of concepts understood. If k is small, then related words such as Romeo or Juliet are considered almost no different from die and dagger. Once more dimensions are allowed, the difference between them becomes more pronounced.

In [11]:
# ii) b)
# Compare the query results with and without the singular-value scaling step.
for label, use_scaling in (("with scaling   ", True), ("without scaling", False)):
    print(label, run_query(A_local, k_local, query_words, scaling=use_scaling))

with scaling    [0.7727964887537564, 0.7306768205359736, 0.9844359912676066, 0.6187306127613201, 0.4849183185073809]
without scaling [0.7307535536972931, 0.6865585886766306, 0.9773001857547076, 0.5673208751646639, 0.43238767985684207]


Both in this example and in the Wikipedia analysis below, scaling seems to have only a small impact.

In [12]:
# ii) c)
# Vary the vocabulary: omit words that occur in a single document, or add rows.

# A_local without its third, fifth and seventh rows
# (each of those words occurs in exactly one document).
A_no_single_apperances = [list(A_local[row]) for row in (0, 1, 3, 5, 7)]

# A_local plus three rows that each occur in exactly one document.
A_additional_single_words = [list(row) for row in A_local] + [
    [0,0,0,1,0],
    [0,0,1,0,0],
    [1,0,0,0,0],
]

# A_local plus three rows that each occur in several documents.
A_additional_words = [list(row) for row in A_local] + [
    [1,0,1,1,1],  # this is the word "."
    [0,0,0,1,1],  # this is the ","
    [1,1,0,0,0],  # this does not correspond to something
]

print("original            ", run_query(A_local, k_local, query_words))
# With rows removed, the query vector has to be re-indexed to the remaining rows.
print("no_single_apperances", run_query(A_no_single_apperances, k_local, [0,0,1,1,0]))
print("A_additional_single_words", run_query(A_additional_single_words, k_local, query_words))
print("A_additional_words", run_query(A_additional_words, k_local, query_words))

original             [0.7727964887537564, 0.7306768205359736, 0.9844359912676066, 0.6187306127613201, 0.4849183185073809]
no_single_apperances [0.6722877510014041, 0.6722877510014048, 0.9902753662967518, 0.6817842761697039, 0.48229661925975065]
A_additional_single_words [0.7436413533210913, 0.7436413533210912, 0.9696659529652373, 0.5975427749192898, 0.4692685032998632]
A_additional_words [0.8992251356798536, 0.6518396638922946, 0.9997255585156124, 0.6787131631456086, 0.7141359574633465]


Adding or removing words changes the exact values. However, the similarities between texts and keywords stay approximately the same, as long as no new word links texts. This also holds true for different k. (Note that upon removing words, the query-word vector must be changed to fit!)

In [13]:
# ii) d)
# Use the Euclidean metric instead of the cosine similarity.
for label, use_euclidean in (("with cosine    metric", False), ("with euclidean metric", True)):
    print(label, run_query(A_local, k_local, query_words, euclidean=use_euclidean))

with cosine    metric [0.7727964887537564, 0.7306768205359736, 0.9844359912676066, 0.6187306127613201, 0.4849183185073809]
with euclidean metric [0.7209559279938733, 0.9789609241466175, 0.38091519694220033, 1.5459370903491112, 0.9680621858502625]


The Euclidean metric involves direct distances, which has several downsides. The Euclidean distance may reach arbitrarily large numbers (this may be fixed by normalizing it), is less precisely resolved in the relevant range, and measures what one may intuitively describe as "absolute" distances. Since we are interested here in differences, not absolute distances, this will significantly confuse the results. (Note that for the Euclidean metric smaller values mean a better fit, whereas for the cosine metric larger values do.)
Angular distances, on the other hand, are well normalized to be below 1 and take into account/compensate for both the length of the texts and the query.

In [17]:
# iii) larger dataset

# Corpus for the search experiment: one string per text; the position in the
# list is the document index printed by wiki_search.
wiki = []
# indices 0-4: articles about cities (Bonn, Aachen, Duisburg, Münster, Dortmund)
wiki.append("The Federal City of Bonn is a city on the banks of the Rhine in the German state of North Rhine-Westphalia, with a population of over 300,000. About 24 km south-southeast of Cologne, Bonn is in the southernmost part of the Rhine-Ruhr region, Germany's largest metropolitan area, with over 11 million inhabitants. It is famously known as the birthplace of Ludwig van Beethoven in 1770. Beethoven spent his childhood and teenage years in Bonn.")
wiki.append("Aachen, also known as Bad Aachen (\"Aachen Spa\"), and in French and traditional English as Aix-la-Chapelle, is a spa and border city in North Rhine-Westphalia, Germany. Aachen developed from a Roman settlement and spa, subsequently becoming the preferred medieval Imperial residence of Charlemagne, and, from 936 to 1531, the place where 31 Holy Roman Emperors were crowned Kings of the Germans.")
wiki.append("Duisburg is a city of about 500,000 inhabitants in Germany’s Rhineland, at the confluence of the Rhine and the Ruhr. In medieval times, it was a member of the powerful Hanseatic League, and later became a major centre of iron, steel, and chemicals. For this reason, it was heavily bombed in World War II. Today it boasts the world's largest inland port, with 21 docks and 40 kilometres of wharf. The city supports a large Turkish community.")
wiki.append("Münster is an independent city in North Rhine-Westphalia, Germany. It is in the northern part of the state and is considered to be the cultural centre of the Westphalia region. It is also capital of the local government region Münsterland. Münster was the location of the Anabaptist rebellion during the Protestant Reformation and the site of the signing of the Treaty of Westphalia ending the Thirty Years' War in 1648. Today it is known as the bicycle capital of Germany.")
wiki.append("Dortmund is, with a population of 586,600 (2017), the third-largest city of Germany's most populous federal state of North Rhine-Westphalia after Cologne and Düsseldorf, and Germany's eighth-largest city. It is the largest city (by area and population) of the Ruhr, Germany's largest urban area with some 5.1 million (2011) inhabitants, as well as the largest city of Westphalia. On the Emscher and Ruhr rivers (tributaries of the Rhine), it lies in the Rhine-Ruhr Metropolitan Region and is considered the administrative, commercial, and cultural centre of the eastern Ruhr.")
# indices 5-6: composers (Beethoven, Haydn)
wiki.append("Ludwig van Beethoven was a German composer and pianist. A crucial figure in the transition between the classical and romantic eras in classical music, he remains one of the most recognized and influential musicians of this period, and is considered to be one of the greatest composers of all time. ")
wiki.append("Franz Joseph Haydn was an Austrian composer of the Classical period. He was instrumental in the development of chamber music such as the piano trio. His contributions to musical form have earned him the epithets \"Father of the Symphony\" and \"Father of the String Quartet\". He was a friend and mentor of Mozart, a tutor of Beethoven, and the older brother of composer Michael Haydn.")
# index 7: a company (Haribo)
wiki.append("Haribo is a German confectionery company founded in 1920 by Johannes \"Hans\" Riegel, Sr. It began in Kessenich, Bonn, North Rhine-Westphalia; the name is an acronym formed from Hans Riegel, Bonn. The company created the first gummy candy in 1922 in the form of little gummy bears called Gummibärchen. The current headquarters in is Grafschaft, Rhineland-Palatinate, Germany.")

# indices 8-9: two further texts, a classical-music article and a deliberately fake short one
wiki.append("Classical music is art music produced or rooted in the traditions of Western culture, including both liturgical (religious) and secular music. While a more precise term is also used to refer to the period from 1750 to 1820 (the Classical period), this article is about the broad span of time from before the 6th century AD to the present day, which includes the Classical period and various other periods.[1] The central norms of this tradition became codified between 1550 and 1900, which is known as the common-practice period.")
wiki.append("This is a very short fake article about rock music. Rock, metal and Münster.")

def text_to_list(text):
    """Split ``text`` into a list of normalised word tokens.

    The whole text is first passed through ``clean_word`` (lowercasing and
    punctuation removal), so a word compares equal whether it starts a
    sentence, ends one, or appears in the middle. The cleaned text is then
    split on any run of whitespace.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        Tokens in order of appearance; repeated words are kept.
    """
    # A possible alternative tokenisation: instead of whole words, emit all
    # character substrings of fixed lengths (e.g. 3 to 9), either per word or
    # over the full text.
    normalised = clean_word(text)
    return normalised.split()

def clean_word(word):
    """Lowercase ``word`` and delete a fixed set of punctuation characters.

    Removed characters: ``, . ; ! ? : " ( )``. Everything else (hyphens,
    apostrophes, brackets, digits, whitespace) is kept unchanged.

    Parameters
    ----------
    word : str
        A single word or a longer text.

    Returns
    -------
    str
        The lowercased text without the characters listed above.
    """
    removed_symbols = ',.;!?:"()'
    deletion_table = str.maketrans("", "", removed_symbols)
    return word.lower().translate(deletion_table)

# Build the word-by-document count matrix for the wiki texts.

# Very frequent function words that are left out of the vocabulary.
STOP_WORDS = frozenset(["in", "and", "or", "the", "a", "is", "of", "with", "as", "at", "this", "about"])

list_of_all_words = []  # vocabulary; a word's position is its row in wiki_matrix
wiki_matrix = []        # one count vector of length len(wiki) per vocabulary word
word_to_row = {}        # word -> row index, so each token costs one dict lookup
for wiki_index, text in enumerate(wiki):
    for word in text_to_list(text):
        if word in STOP_WORDS:
            continue
        word_index = word_to_row.get(word)
        if word_index is None:
            # first occurrence: register the word and open a new all-zero row
            word_index = len(list_of_all_words)
            word_to_row[word] = word_index
            list_of_all_words.append(word)
            wiki_matrix.append(np.zeros(len(wiki)))
        # count this occurrence of the word in the current text
        wiki_matrix[word_index][wiki_index] += 1
        # this gives A

def print_data():
    """Print the word-by-document matrix, one vocabulary word per line.

    Each line holds the row index, the word, and the word's count in every
    wiki text, separated by single spaces.
    """
    for word_index, counts in enumerate(wiki_matrix):
        fields = [str(word_index), list_of_all_words[word_index]]
        fields.extend(str(counts[wiki_index]) for wiki_index in range(len(wiki)))
        print(" ".join(fields))

def get_query_vector(search_string):
    """Turn a search string into a 0/1 vector over the vocabulary.

    Parameters
    ----------
    search_string : str
        Free-text query; it is normalised the same way as the wiki texts.

    Returns
    -------
    ndarray, shape (len(list_of_all_words),)
        Entry ``i`` is 1 if vocabulary word ``i`` occurs in the query and 0
        otherwise. Words repeated in the query still count only once.
    """
    search_string = clean_word(search_string)
    # Tokenise once, outside the loop, and keep the tokens in a set so that
    # each vocabulary word needs only a constant-time membership test.
    query_tokens = set(text_to_list(search_string))
    vector = np.zeros(len(list_of_all_words))
    for word_index, word in enumerate(list_of_all_words):
        if word in query_tokens:  # todo repeated words
            vector[word_index] += 1
    return vector

def wiki_search(search_string, amount_of_results, k=7, show_similar_words=False):
    """Print the wiki texts that best match a free-text query.

    Parameters
    ----------
    search_string : str
        The query; it is converted to a vocabulary vector by
        ``get_query_vector``.
    amount_of_results : int
        Maximum number of results to print.
    k : int
        Number of singular triplets used by ``run_query``.
    show_similar_words : bool
        If true, additionally print the vocabulary words most similar to the
        query. The word similarities are only computed when requested.

    Returns
    -------
    None
        The results are printed, best match first.
    """
    vector = get_query_vector(search_string)

    if show_similar_words:
        word_similarity_vector = np.array(run_query(wiki_matrix, k, vector, look_for_documents=False))
        print("Top similar words to the query >"+search_string+"< are:")
        # negate so that argsort lists the highest similarity first
        for word_index in (-word_similarity_vector).argsort()[:amount_of_results]:
            print(list_of_all_words[word_index])

    doc_similarity_vector = np.array(run_query(wiki_matrix, k, vector, look_for_documents=True, scaling=True))

    print("Top similar texts to the query >"+search_string+"< are:")
    # negate so that argsort lists the highest similarity first
    for wiki_index in (-doc_similarity_vector).argsort()[:amount_of_results]:
        print(wiki_index, ":", wiki[wiki_index])
    return

# print_data()  # uncomment to dump the full word-by-document matrix
# Show the 5 best-matching texts for the query, using k=5 singular triplets.
wiki_search("classical music", 5, k=5)


Top similar texts to the query >classical music< are:
8 : Classical music is art music produced or rooted in the traditions of Western culture, including both liturgical (religious) and secular music. While a more precise term is also used to refer to the period from 1750 to 1820 (the Classical period), this article is about the broad span of time from before the 6th century AD to the present day, which includes the Classical period and various other periods.[1] The central norms of this tradition became codified between 1550 and 1900, which is known as the common-practice period.
9 : This is a very short fake article about rock music. Rock, metal and Münster.
5 : Ludwig van Beethoven was a German composer and pianist. A crucial figure in the transition between the classical and romantic eras in classical music, he remains one of the most recognized and influential musicians of this period, and is considered to be one of the greatest composers of all time. 
6 : Franz Joseph Haydn was a

Insights:

Creating a good search engine is hard. The method described mostly links texts due to words like "the", "or", "and", etc., since they are frequent. A much larger database might reduce this issue, but the more promising approach is to exclude "common" words from the analysis. For example, before removing them, the entry for "Duisburg" was often the best fit for my queries regarding classical music, since it shares many words of the "is", "a", "of", etc. type with the texts referring to classical music. Once they are removed, the more appropriate hits of Beethoven and Bonn rank before Duisburg. Also, when taking sequences of words into account, instead of just sequences inside words (as I do here), sequences such as "in the" pollute the analysis.

Also, I find that, contrary to the example before, a k value of about 5 to 6 is useful for finding proper connections when dealing with only the 8 original Wikipedia texts.