# Metody Obliczeniowe w Nauce i Technice
## Laboratorium 4 - Singular Value Decomposition (Wyszukiwarka)
### Albert Gierlach

### 1. Przygotowanie danych
Dane przygotowano za pomocą wiki-crawlera. Wykorzystano skrypt w Pythonie (https://github.com/bornabesic/wikipedia-crawler), dostosowując go do potrzeb zadania (dodanie opcji, która pozwala pobrać N artykułów). Źródła (wikipedia.py oraz crawler.py) są dostępne w archiwum z zadaniem.

Użycie:
```
python crawler.py N subdomain
```
gdzie N to liczba dokumentów do pobrania, a 'subdomain' to subdomena (użyto wartości 'en').
Dla polepszenia rezultatów zapewniono, że długość artykułu będzie większa niż 1000 znaków.

Dane w formacie .txt pobierane są do folderu ./data

### 2., 3. Określenie bag-of-words 
Stworzono klasę, która będzie przechowywać dane jednego dokumentu oraz odpowiednie jej metody, które będą wykorzystane później. Odrzucono kilka słów, które powinny zostać zignorowane podczas wyszukiwania artykułów. Stworzono także klasę, która będzie odpowiadać za cache'owanie wyliczonych wektorów i macierzy, gdyż operacja ta trwa dość długo. Zastosowanie takiej klasy pozwala na jednokrotne wyliczenie wartości, a później wystarczy wczytać gotowe dane. Pierwsze uruchomienie trwa max 5 minut.

In [1]:
from collections import Counter
from typing import List, Any
from scipy import sparse
import os
import pickle
import re
import numpy as np
import operator

data_dir = "./data"

In [2]:
class CacheManager:
    """Pickle-backed on-disk cache for expensive intermediate results.

    Every entry is stored as one file inside ``cache_dir``. The manager
    remembers which entries were read from disk during this session so that
    ``save`` does not rewrite data that came from the cache in the first place.

    NOTE: entries are read with ``pickle.load``, which can execute arbitrary
    code. Only point ``cache_dir`` at files produced by this program.
    """

    cache_dir = "./cache"  # place for storing calculated matrices, etc

    def __init__(self):
        # Names of the entries successfully loaded from disk in this session.
        self.loaded = set()

        os.makedirs(CacheManager.cache_dir, exist_ok=True)

    @staticmethod
    def _path_for(filename):
        """Return the path of the cache file that backs ``filename``."""
        return '{}/{}'.format(CacheManager.cache_dir, filename)

    def was_loaded(self, filename):
        """Return True if ``filename`` was read from the cache by ``load``."""
        return filename in self.loaded

    def save(self, filename, object):
        """Pickle ``object`` into the cache under ``filename`` (best effort).

        Nothing is written when the entry was previously loaded from the
        cache. A failure to write is reported on stdout and otherwise ignored,
        because caching is only an optimisation.

        The parameter name ``object`` shadows the builtin; it is kept so that
        existing keyword callers keep working.
        """
        if self.was_loaded(filename):
            return

        try:
            with open(self._path_for(filename), "wb") as f:
                pickle.dump(object, f, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception as err:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            print("> could not cache {}: {}".format(filename, err))
            return
        print("> caching " + filename)

    def load(self, filename):
        """Return the cached object stored under ``filename``, or None.

        None is returned when the entry does not exist or cannot be read;
        an unreadable (e.g. truncated) entry is additionally reported on
        stdout. A successful load marks the entry as loaded.
        """
        try:
            with open(self._path_for(filename), "rb") as f:
                res = pickle.load(f)
        except FileNotFoundError:
            # Ordinary cache miss: the value has to be computed by the caller.
            return None
        except Exception as err:
            print("> ignoring unreadable cache {}: {}".format(filename, err))
            return None

        print("> using cached " + filename)
        self.loaded.add(filename)
        return res

class ArticleData:
    """Bag-of-words representation of a single article.

    Attributes:
        title: the article file name without its extension.
        bag_of_words: Counter mapping each kept word to its occurrence count.
        words_vec: dense vector of counts ordered like the global word list;
            None until ``create_full_bag_of_words`` is called.
        words_vec_norm: Euclidean norm of ``words_vec``; None until
            ``create_full_bag_of_words`` is called.
    """

    ignored_words = ["a", "the", "of", "is"]  # and probably more

    def __init__(self, title):
        # Strip only the extension: a title may itself contain dots
        # ("St. Louis.txt"), and print_contents() rebuilds the file name
        # from the title.
        self.title = os.path.splitext(title)[0]
        self.bag_of_words = Counter()
        self.words_vec = None
        self.words_vec_norm = None

    def load_bag_of_words(self, path):
        """Count the words of the UTF-8 text file at ``path``.

        The text is lower-cased, split on ``\\w+``, and words of one or two
        characters are dropped. Words listed in ``ignored_words`` are removed
        afterwards.
        """
        with open(path, "rt", encoding='utf-8') as f:
            words = re.findall(r'\w+', f.read().lower())
        self.bag_of_words.update(word for word in words if len(word) > 2)

        for ignore_token in ArticleData.ignored_words:
            self.bag_of_words.pop(ignore_token, None)

    def create_full_bag_of_words(self, keyset, size):
        """Build the dense count vector d_j and cache its norm.

        Args:
            keyset: iterable of words; position i of the vector holds the
                count of the i-th word (0 when the article lacks it).
            size: length of the vector to allocate.
        """
        self.words_vec = np.zeros(size)  # d_j
        for i, k in enumerate(keyset):
            self.words_vec[i] = self.bag_of_words[k]

        self.words_vec_norm = np.linalg.norm(self.words_vec)

    def print_contents(self):
        """Print the full text of the article file found in ``data_dir``."""
        with open('{}/{}.txt'.format(data_dir, self.title), "rt", encoding='utf-8') as f:
            print(f.read())

    def normalize_word_vec(self):
        """Scale ``words_vec`` to unit length and refresh ``words_vec_norm``.

        An all-zero vector is left untouched (dividing by its zero norm would
        fill it with NaN). The cached norm is updated in both cases so that
        code dividing by ``words_vec_norm`` does not apply the old length a
        second time.
        """
        norm = np.linalg.norm(self.words_vec)
        if norm != 0:
            self.words_vec = self.words_vec / norm
        self.words_vec_norm = np.linalg.norm(self.words_vec)

In [3]:
# Cache handle shared by the remaining cells.
cache = CacheManager()

# Per-article bags of words: reuse the cached list, otherwise parse every file in data_dir.
articles_data: List[ArticleData] = cache.load('articles_data.dump')
if articles_data is None:
    articles_data = []
    for file_name in os.listdir(data_dir):
        parsed = ArticleData(file_name)
        parsed.load_bag_of_words("{}/{}".format(data_dir, file_name))
        articles_data.append(parsed)
print("total number of articles {}".format(len(articles_data)))

# Word counts summed over all articles.
total_bag_of_words: Counter = cache.load('total_bag_of_words.dump')
if total_bag_of_words is None:
    total_bag_of_words = Counter()
    for doc in articles_data:
        total_bag_of_words += doc.bag_of_words

# Ordered word list; its order defines the coordinates of every article vector.
wordset: List[Any] = cache.load('wordset.dump')
if wordset is None:
    wordset = list(total_bag_of_words)
sizeof_total = len(total_bag_of_words)
print("total number of words: {}".format(sizeof_total))

# Articles restored from the cache already carry their vectors.
print("creating bag of words for every article")
if not cache.was_loaded('articles_data.dump'):
    for doc in articles_data:
        doc.create_full_bag_of_words(wordset, sizeof_total)
print("created {} bags, every has {} elements".format(len(articles_data), sizeof_total))

> using cached articles_data.dump
total number of articles 1500
> using cached total_bag_of_words.dump
> using cached wordset.dump
total number of words: 68581
creating bag of words for every article
created 1500 bags, every has 68581 elements


### 4., 5.  Rzadka macierz wektorów cech oraz IDF
Do budowy rzadkiej macierzy wykorzystano funkcję csr_matrix() z modułu scipy.sparse. Czas operacji 3-5 minut.

In [4]:
def getIDF(wordset, articles_data):
    """Compute the inverse document frequency of every word in ``wordset``.

    For a word that occurs in ``cnt`` of the ``N`` articles the value is
    ``log10(N / cnt)``.

    Args:
        wordset: iterable of words; the result follows its order.
        articles_data: sequence of objects exposing a ``bag_of_words``
            mapping from word to occurrence count.

    Returns:
        A list with one IDF value per word of ``wordset``. A word that occurs
        in no article gets 0.0 (it cannot contribute to any document vector)
        instead of triggering a division by zero.
    """
    articles_num = len(articles_data)

    # Document frequency gathered in a single pass over the articles, rather
    # than probing every article once per word.
    doc_freq = Counter()
    for article in articles_data:
        doc_freq.update(
            word for word, count in article.bag_of_words.items() if count != 0
        )

    idf = []
    for word in wordset:
        cnt = doc_freq[word]
        idf.append(np.log10(articles_num / cnt) if cnt else 0.0)

    return idf


def create_sparse(articles_data, sizeof_total, idf):
    """Build the IDF-weighted sparse term-by-document matrix.

    Args:
        articles_data: sequence of objects exposing ``words_vec``, a dense
            vector with at least ``sizeof_total`` entries.
        sizeof_total: number of words, i.e. the number of matrix rows.
        idf: sequence of per-word weights, indexed like ``words_vec``.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape
        ``(sizeof_total, len(articles_data))`` in which entry ``(j, i)`` is
        ``articles_data[i].words_vec[j] * idf[j]``; only entries whose count
        is non-zero are stored.
    """
    shape = (sizeof_total, len(articles_data))
    idf_weights = np.asarray(idf, dtype=float)

    row_parts = []
    column_parts = []
    data_parts = []
    for doc_index, article in enumerate(articles_data):
        counts = np.asarray(article.words_vec)
        # Locate the non-zero counts in one vectorised call instead of
        # testing every word of every article in a Python loop.
        term_index = np.flatnonzero(counts[:sizeof_total])
        row_parts.append(term_index)
        column_parts.append(np.full(term_index.shape, doc_index, dtype=term_index.dtype))
        data_parts.append(counts[term_index] * idf_weights[term_index])

    if not row_parts:
        # No articles at all: np.concatenate rejects an empty list.
        return sparse.csr_matrix(shape, dtype=float)

    term_by_document_matrix = sparse.csr_matrix(
        (
            np.concatenate(data_parts),
            (np.concatenate(row_parts), np.concatenate(column_parts)),
        ),
        shape=shape,
    )
    return term_by_document_matrix

In [5]:
# IDF weights, one per entry of wordset; computed only on a cache miss.
print('calculating idf')
idf: List[Any] = cache.load('idf.dump')
if idf is None:
    idf = getIDF(wordset, articles_data)

# IDF-weighted term-by-document matrix (rows: words, columns: articles).
print('creating sparse matrix')
term_by_document_matirx: sparse.csr_matrix = cache.load('term_by_document_sparse_matrix.dump')
if term_by_document_matirx is None:
    term_by_document_matirx = create_sparse(articles_data, sizeof_total, idf)
print("term by document matrix size: {}x{}".format(*term_by_document_matirx.shape))

calculating idf
> using cached idf.dump
creating sparse matrix
> using cached term_by_document_sparse_matrix.dump
term by document matrix size: 68581x1500


In [6]:
# Persist every intermediate result. CacheManager.save returns early for
# entries that were loaded from the cache, so only new data is written.
for _dump_name, _dump_value in (
    ('articles_data.dump', articles_data),
    ('wordset.dump', wordset),
    ('term_by_document_sparse_matrix.dump', term_by_document_matirx),
    ('total_bag_of_words.dump', total_bag_of_words),
    ('idf.dump', idf),
):
    cache.save(_dump_name, _dump_value)

### 6.  Program pozwalający na wyszukiwanie artykułów
Stworzono nowy plik lab4_search_engine.py, który będzie odpowiedzialny za przetwarzanie danych. Na początku wczytamy zcache'owane dane.

In [8]:
def do_query(query, k, word_list, articles):
    """Print the ``k`` articles most similar to ``query``.

    The query is lower-cased, split on ``\\w+`` and turned into a count vector
    over ``word_list``. Every article is scored with the cosine of the angle
    between that vector and its ``words_vec``. The titles of the best ``k``
    articles are printed first, followed by their full contents.

    Args:
        query: free-text search phrase.
        k: number of results to print.
        word_list: sequence of words defining the vector coordinates.
        articles: iterable of objects exposing ``title``, ``words_vec``,
            ``words_vec_norm`` and ``print_contents()``.

    Returns:
        None. Prints "No results" when no query word occurs in ``word_list``.
    """
    words_dict = {word: index for index, word in enumerate(word_list)}

    vec_query = np.zeros(len(word_list), dtype=int)
    for w in re.findall(r'\w+', query.lower()):
        index = words_dict.get(w)
        if index is not None:
            vec_query[index] += 1

    if not np.any(vec_query):
        print("No results")
        return

    q_norm = np.linalg.norm(vec_query)
    res = []
    for a in articles:
        divider = q_norm * a.words_vec_norm
        # An article with a zero-length vector has no direction; score it 0
        # instead of producing NaN, which would corrupt the ordering below.
        cos_theta = (vec_query @ a.words_vec) / divider if divider else 0.0
        res.append((cos_theta, a))

    res.sort(key=operator.itemgetter(0), reverse=True)
    best = res[:k]

    print("Found articles:")
    for _, article in best:
        print('> ' + article.title)

    print("\n\nFull articles:")
    for _, article in best:
        # print_contents() prints by itself and returns None, so it must not
        # be wrapped in print().
        article.print_contents()
        print('\n')
        print('*' * 40)
        
        
        
# Short aliases used by the search code, introduced only for readability:
# articles  - list of all documents (word vectors + bags of words)
# word_list - ordered list of words (the keys of the total bag of words)
# A         - sparse matrix whose columns are the IDF-weighted word vectors of articles_data
articles = articles_data
word_list = wordset
A = term_by_document_matirx

### Przykładowe wyszukania

In [None]:
# Example: print the 3 best-matching articles for a two-word query.
do_query("Action film", 3, word_list, articles)

In [None]:
# Example: print the 3 best-matching articles for a person's name.
do_query("Winston Churchill", 3, word_list, articles)

In [None]:
# Example: print the 3 best-matching articles for a single-word query.
do_query("Poland", 3, word_list, articles)

### Interaktywna wyszukiwarka

In [12]:
from ipywidgets import Layout, Button, Box, FloatText, Textarea, Text, Label, IntSlider, Output
from IPython.display import display, clear_output

def btn(b):
    """Click handler of the search button: run the query typed into the form.

    The previous results are cleared, then the slider value (number of
    results) and the text box value (query) are read from the form by
    position and passed to ``do_query``. Queries shorter than two characters
    are rejected with a message. Everything is written to ``output``.

    Args:
        b: argument supplied by the button's ``on_click`` callback; unused.
    """
    output.clear_output()
    with output:
        slider_row, query_row = form.children[0], form.children[1]
        result_count = slider_row.children[1].value
        query_text = query_row.children[1].value
        if len(query_text) <= 1:
            output.append_stdout("Text is too short")
            return
        do_query(query_text, result_count, word_list, articles)

# Layout shared by every row of the form: a horizontal flex row whose
# children are pushed to the opposite edges.
form_item_layout = Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between'
)

# NOTE: btn() reads the slider and the text box by position
# (form.children[0].children[1] and form.children[1].children[1]), so the
# order of the rows and of the widgets inside each row must not change.
#
# The slider and the text box used to receive a misspelled `descritpion=`
# keyword. That is not a widget trait, so it never had any effect and
# traitlets warns about (or rejects) unrecognised keyword arguments. It is
# dropped rather than corrected because each control is already captioned by
# the Label placed next to it.
form_items = [
    Box([Label(value='Results num'), IntSlider(min=1, max=30, value=10)], layout=form_item_layout),
    Box([Label(value='Query'), Text(placeholder="Wpisz zapytanie")], layout=form_item_layout),
    Box([Label(), Button(description="Search!")], layout=form_item_layout)
]

# The rows are stacked vertically and the form takes half of the cell width.
form = Box(form_items, layout=Layout(
    display='flex',
    flex_flow='column',
    align_items='stretch',
    width='50%'
))

# Area that btn() clears and fills with the results of each search.
output = Output()
form.children[2].children[1].on_click(btn)

In [13]:
# Last expression of the cell: makes the notebook render the search form.
form

Box(children=(Box(children=(Label(value='Results num'), IntSlider(value=10, max=30, min=1)), layout=Layout(dis…

In [14]:
# Show the Output widget that btn() clears and writes the search results into.
display(output) # place for results

Output()

### 7. Normalizacja wektorów

In [None]:
def normalize_vectors(articles):
    """Normalize the word vector of every article, in iteration order.

    The work is delegated to each element's ``normalize_word_vec()`` method,
    so the articles are modified in place and nothing is returned.
    """
    for entry in articles:
        entry.normalize_word_vec()
        
# Normalize the word vector of every loaded article in place.
normalize_vectors(articles)