In [1]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Constants used in the code *-

: 

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [2]:
import nltk

In [3]:
#import libraries
import pandas as pd
import os
from nltk.corpus import stopwords
import pickle as pkl
import nltk
from nltk.stem import PorterStemmer
import string
from tqdm import tqdm
import math
from collections import Counter


# automatically reload the module
%load_ext autoreload
%autoreload 2

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goali\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
import os
from nltk.corpus import stopwords

# Check available stopwords
print(stopwords.fileids())

# Check if Korean stopwords file exists
korean_stopwords_path = os.path.join(nltk.data.find("corpora/stopwords"), 'korean')
print(os.path.exists(korean_stopwords_path))


['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'korean', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']
True


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\goali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# 1. Declaring constants

In [7]:
# Path to the data folder
if os.path.exists("../data"):
    DATA_FOLDER = "../data"
else:
    # create the data folder
    os.mkdir("../data")

CORPUS = os.path.join(DATA_FOLDER, "corpus", "corpus.json")
CORPUS_PKL = os.path.join(DATA_FOLDER, "pickles", "corpus.pkl")


LANG = {
    "en": "english",
    "fr": "french",
    "de": "german",
    "es": "spanish",
    "it": "italian",
    "ko": "korean",
    "ar": "arabic",
}

# 2. Loading Data

In [8]:
def save_data(data: any, file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")):
    """
    Save the data to a file
    Args:

    * data (any): the data to save

    * file_name (str): the name of the file

    * folder (str): the folder where to save the file
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

    file_path = os.path.join(folder, file_name)

    with open(file_path, 'wb') as handle:
        pkl.dump(data, handle)

def load_data(file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")) -> any:
    """
    Load the data from a file
    Args:

    * file_name (str): the name of the file

    * folder (str): the folder where to save the file

    Returns:

    * any: the data
    """
    file_path = os.path.join(folder, file_name)

    with open(file_path, 'rb') as handle:
        data = pkl.load(handle)

    return data

In [35]:
# load the corpus
if os.path.exists(CORPUS_PKL):
    print("Loading the corpus from the pickle file")
    corpus = load_data("corpus.pkl")
else:
    print("Loading the corpus from the json file")
    corpus = pd.read_json(CORPUS)

Loading the corpus from the pickle file


# 3. Preprocessing

In [14]:
corpus = corpus[:200] 

In [None]:
corpus

In [16]:
stemmer = PorterStemmer() 
def tokenize(text, lang="english"):
    """
    It tokenizes and stems an input text.
    
    :param text: str, with the input text
    :return: list, of the tokenized and stemmed tokens.
    """
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    return [stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words(lang)]

In [None]:
tqdm.pandas()

# Apply the tokenization
corpus['tokenized'] = corpus.progress_apply(lambda row: tokenize(row['text'], lang=LANG[row['lang']]), axis=1)

In [19]:
corpus_list = corpus['tokenized'].tolist()

In [23]:
save_data(corpus_list, "corpus_list.pkl")
save_data(corpus, "corpus.pkl")

In [24]:
corpus_Fabio_list = load_data("corpus_list.pkl")
corpus_Fabio = load_data("corpus.pkl")

In [None]:
import spacy
from tqdm import tqdm

# Load SpaCy models for each language
SPACY_MODELS = {
    'english': spacy.load('en_core_web_sm'),
    'french': spacy.load('fr_core_news_sm'),
    'german': spacy.load('de_core_news_sm'),
    'spanish': spacy.load('es_core_news_sm'),
    'italian': spacy.load('it_core_news_sm'),
    'korean': spacy.load('ko_core_news_sm'),
    
}

# Tokenization function using SpaCy
def tokenize_spacy(text, lang="english"):
    """
    Tokenizes an input text using SpaCy model for the specified language.
    
    :param text: str, input text
    :param lang: str, language code for SpaCy model
    :return: list of tokens
    """
    nlp = SPACY_MODELS[lang]
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

tqdm.pandas()
corpus['tokenized'] = corpus.progress_apply(
    lambda row: tokenize_spacy(row['text'], lang=LANG[row['lang']]), axis=1
)



# 4. Create the TF-IDF matrix

In [27]:
# Functions for the TF/IDF implementation

def idf_values(vocabulary, documents):
    """
    It computes IDF scores, storing idf values in a dictionary.
    
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param documents: list of lists of str, with tokenized sentences.
    :return: dict with the idf values for each vocabulary word.
    """
    idf = {}
    num_documents = len(documents)
    
    # Adding tqdm progress bar for the loop
    for term in tqdm(vocabulary, desc="Calculating IDF values"):
        idf[term] = math.log(num_documents / sum([1 for d in documents if term in d]), math.e)
    
    return idf

def vectorize(document, vocabulary, idf):
    """
    It generates the vector for an input document (with normalization).
    
    :param document: list of str with the tokenized documents.
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param idf: dict with the idf values for each vocabulary word.
    :return: list of floats
    """
    vector = [0]*len(vocabulary)
    counts = Counter(document)
    max_count = counts.most_common(1)[0][1]
    for i,term in enumerate(vocabulary):
        vector[i] = idf[term] * counts[term] / max_count
    return vector

def cosine_similarity(v1,v2):
    """
    It computes cosine similarity.
    
    :param v1: list of floats, with the vector of a document.
    :param v2: list of floats, with the vector of a document.
    :return: float
    """
    
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
            result = 0
    else:
            result =  sumxy / math.sqrt(sumxx*sumyy)
    return result

def search_vec(query, topk, corpus, idf, vocabulary, document_vectors):
    """
    It computes the search result (get the topk documents).
    
    :param query: str
    :param topk: int
    """
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = vectorize(q, vocabulary, idf)
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(corpus))]
    scores.sort(key=lambda x: -x[0])
    doc_ids = []
    for i in range(topk):
        doc_ids.append(scores[i][1])

    return doc_ids

In [24]:
#create the vocabulary
vocabulary = list(set([token for doc in corpus_list for token in doc]))
vocabulary.sort()

In [None]:
idf = idf_values(vocabulary, corpus_list)

In [None]:
document_vectors = [vectorize(s, vocabulary, idf) for s in corpus_list]

## Tokenize the Query

In [9]:
# Load the test.csv file in order to tokenize the query 
test = pd.read_csv(os.path.join(DATA_FOLDER, "test.csv"))

In [10]:
test

Unnamed: 0,id,query_id,query,lang
0,0,q-en-0,What organization proposed listing PFOA under ...,en
1,1,q-en-2,What type of coating do ZM1130 - ZM1132 have?,en
2,2,q-en-4,What year did Deutsche Bank sell its stake in ...,en
3,3,q-en-5,Who expressed exasperation when Raphael and Mo...,en
4,4,q-en-7,Who commissioned Amy Beach to compose a choral...,en
...,...,...,...,...
1995,1995,q-ar-1166,ما هو العامل الأكثر فعالية في تعلم اللغة الثان...,ar
1996,1996,q-ar-1169,ما هو موضوع الأبحاث التي أجريت على الطيور المع...,ar
1997,1997,q-ar-1174,ما هي خصائص السلالة الفاقعة (Melospiza melodia...,ar
1998,1998,q-ar-1176,ما هو سلوك الحلول المعادلة الخطية عند المركز ف...,ar


In [12]:
train = pd.read_csv(os.path.join(DATA_FOLDER, "train.csv"))
train

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang
0,q-en-425512,What is the connection between AAA and Lucha U...,doc-en-798457,"['doc-en-810925', 'doc-en-634020', 'doc-en-143...",en
1,q-en-16636,What is the medical use of iloperidone?,doc-en-121692,"['doc-en-177976', 'doc-en-700330', 'doc-en-567...",en
2,q-en-282671,Who was the provisional administrator in 1940?,doc-en-750259,"['doc-en-805362', 'doc-en-413387', 'doc-en-827...",en
3,q-en-216614,What was the critical reception of the film se...,doc-en-703883,"['doc-en-685958', 'doc-en-84060', 'doc-en-2046...",en
4,q-en-156120,What was the main Spanish record of the year i...,doc-en-648393,"['doc-en-4307', 'doc-en-761696', 'doc-en-79426...",en
...,...,...,...,...,...
21870,q-ar-1187,احتفالية تلعب دورًا كبيرًا في تعزيز الترابط ال...,doc-ar-8463,"['doc-ar-5304', 'doc-ar-1977', 'doc-ar-5843', ...",ar
21871,q-ar-1188,ما هو عدد أتباع كنيسة الأدفنتست في جزيرة سان ا...,doc-ar-8469,"['doc-ar-6798', 'doc-ar-1489', 'doc-ar-3100', ...",ar
21872,q-ar-1189,من هو أنتاناس سمتا؟,doc-ar-8476,"['doc-ar-2898', 'doc-ar-6787', 'doc-ar-3235', ...",ar
21873,q-ar-1191,سؤالي هو: ما هي الميزة التي كانت للإيرلنديين ف...,doc-ar-8491,"['doc-ar-786', 'doc-ar-8084', 'doc-ar-3208', '...",ar


In [15]:
stemmer = PorterStemmer() 
def tokenize(text, lang="english"):
    """
    It tokenizes and stems an input text.
    
    :param text: str, with the input text
    :return: list, of the tokenized and stemmed tokens.
    """
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    return [stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words(lang)]

In [30]:
tqdm.pandas()

# Apply the tokenization
test['tokenized'] = test.progress_apply(lambda row: tokenize(row['query'], lang=LANG[row['lang']]), axis=1)

100%|██████████| 2000/2000 [00:08<00:00, 223.69it/s]


In [31]:
test

Unnamed: 0,id,query_id,query,lang,tokenized
0,0,q-en-0,What organization proposed listing PFOA under ...,en,"[what, organ, propos, list, pfoa, stockholm, c..."
1,1,q-en-2,What type of coating do ZM1130 - ZM1132 have?,en,"[what, type, coat, zm1130, zm1132]"
2,2,q-en-4,What year did Deutsche Bank sell its stake in ...,en,"[what, year, deutsch, bank, sell, stake, joint..."
3,3,q-en-5,Who expressed exasperation when Raphael and Mo...,en,"[who, express, exasper, raphael, mona, lisa, s..."
4,4,q-en-7,Who commissioned Amy Beach to compose a choral...,en,"[who, commiss, ami, beach, compos, choral, wor..."
...,...,...,...,...,...
1995,1995,q-ar-1166,ما هو العامل الأكثر فعالية في تعلم اللغة الثان...,ar,"[العامل, الأكثر, فعالية, تعلم, اللغة, الثانية,..."
1996,1996,q-ar-1169,ما هو موضوع الأبحاث التي أجريت على الطيور المع...,ar,"[موضوع, الأبحاث, أجريت, الطيور, المعروفة, بـ, ..."
1997,1997,q-ar-1174,ما هي خصائص السلالة الفاقعة (Melospiza melodia...,ar,"[خصائص, السلالة, الفاقعة, melospiza, melodia, ..."
1998,1998,q-ar-1176,ما هو سلوك الحلول المعادلة الخطية عند المركز ف...,ar,"[سلوك, الحلول, المعادلة, الخطية, المركز, حالة,..."


In [32]:
train['tokenized'] = train.progress_apply(lambda row: tokenize(row['query'], lang=LANG[row['lang']]), axis=1)

100%|██████████| 21875/21875 [00:50<00:00, 430.96it/s]


In [33]:
train

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang,tokenized
0,q-en-425512,What is the connection between AAA and Lucha U...,doc-en-798457,"['doc-en-810925', 'doc-en-634020', 'doc-en-143...",en,"[what, connect, aaa, lucha, underground]"
1,q-en-16636,What is the medical use of iloperidone?,doc-en-121692,"['doc-en-177976', 'doc-en-700330', 'doc-en-567...",en,"[what, medic, use, iloperidon]"
2,q-en-282671,Who was the provisional administrator in 1940?,doc-en-750259,"['doc-en-805362', 'doc-en-413387', 'doc-en-827...",en,"[who, provision, administr, 1940]"
3,q-en-216614,What was the critical reception of the film se...,doc-en-703883,"['doc-en-685958', 'doc-en-84060', 'doc-en-2046...",en,"[what, critic, recept, film, set, brooklyn]"
4,q-en-156120,What was the main Spanish record of the year i...,doc-en-648393,"['doc-en-4307', 'doc-en-761696', 'doc-en-79426...",en,"[what, main, spanish, record, year, 2002]"
...,...,...,...,...,...,...
21870,q-ar-1187,احتفالية تلعب دورًا كبيرًا في تعزيز الترابط ال...,doc-ar-8463,"['doc-ar-5304', 'doc-ar-1977', 'doc-ar-5843', ...",ar,"[احتفالية, تلعب, دورًا, كبيرًا, تعزيز, الترابط..."
21871,q-ar-1188,ما هو عدد أتباع كنيسة الأدفنتست في جزيرة سان ا...,doc-ar-8469,"['doc-ar-6798', 'doc-ar-1489', 'doc-ar-3100', ...",ar,"[عدد, أتباع, كنيسة, الأدفنتست, جزيرة, سان, اند..."
21872,q-ar-1189,من هو أنتاناس سمتا؟,doc-ar-8476,"['doc-ar-2898', 'doc-ar-6787', 'doc-ar-3235', ...",ar,"[أنتاناس, سمتا؟]"
21873,q-ar-1191,سؤالي هو: ما هي الميزة التي كانت للإيرلنديين ف...,doc-ar-8491,"['doc-ar-786', 'doc-ar-8084', 'doc-ar-3208', '...",ar,"[سؤالي, الميزة, كانت, للإيرلنديين, الأوساط, ال..."


## Work On LSI

In [34]:
# Load the Pickles files for corpus tokenized 
corpus_tokenized = load_data("corpus_tokenized.pkl")

MemoryError: 

In [9]:
corpus = pd.read_json(CORPUS)

In [10]:
corpus

Unnamed: 0,docid,text,lang
0,doc-en-9633,"Mars Hill Church was a Christian megachurch, f...",en
1,doc-en-11447,"Joel Chandler Harris (December 9, 1848 – July ...",en
2,doc-en-9696,"Surabaya Dock of 14,000 tons was a floating dr...",en
3,doc-en-4033,This is a list of the cicadas found in Austral...,en
4,doc-en-10997,"John Aaron Rawlins (February 13, 1831 Septemb...",en
...,...,...,...
268017,doc-ar-8463,المسيحية في بيرو هي الديانة السائدة والمهيمنة،...,ar
268018,doc-ar-8469,المسيحية في كولومبيا هي أكبر الديانات في البلا...,ar
268019,doc-ar-8476,تُشكل المسيحية في ليتوانيا أكثر الديانات انتشا...,ar
268020,doc-ar-8491,المسيحية في كندا هي الديانة السائدة إذ وفقاً ل...,ar


In [11]:
corpus['lang'].unique()

array(['en', 'fr', 'de', 'es', 'it', 'ko', 'ar'], dtype=object)

In [12]:
# take only the french language
corpus_french = corpus[corpus['lang'] == 'fr']


In [21]:
corpus_french = corpus_french[:400]

In [19]:
tqdm.pandas()

# Apply the tokenization
corpus_french['tokenized'] = corpus_french.progress_apply(lambda row: tokenize(row['text'], lang=LANG[row['lang']]), axis=1)

100%|██████████| 200/200 [00:56<00:00,  3.52it/s]


In [20]:
corpus_french

Unnamed: 0,docid,text,lang,tokenized
200000,doc-fr-1447,La production de café au Costa Rica représente...,fr,"[la, product, café, costa, rica, représent, 20..."
200001,doc-fr-4878,La continuité du gouvernement (COG) est le pri...,fr,"[la, continuité, gouvern, cog, princip, létabl..."
200002,doc-fr-801,"Juan Manuel Fangio, né le à Balcarce et mort ...",fr,"[juan, manuel, fangio, né, balcarc, mort, buen..."
200003,doc-fr-1750,"Louis Auguste Mathieu Legrand est un peintre, ...",fr,"[loui, august, mathieu, legrand, peintr, dessi..."
200004,doc-fr-5810,"La gare de Lille-Saint-Sauveur, ou simplement ...",fr,"[la, gare, lillesaintsauveur, simplement, «, g..."
...,...,...,...,...
200195,doc-fr-4955,"Le ( / ) a été, à plusieurs reprises, la cham...",fr,"[le, a, plusieur, repris, chambr, haut, parlem..."
200196,doc-fr-4049,"Jean-Paul Agosti, né à Paris en 1948, est un a...",fr,"[jeanpaul, agosti, né, pari, 1948, artist, pei..."
200197,doc-fr-1223,Les Jaguars de Jacksonville sont une équipe pr...,fr,"[le, jaguar, jacksonvil, équip, professionnel,..."
200198,doc-fr-4618,Les filtres à manches sont des filtres employé...,fr,"[le, filtr, manch, filtr, employé, filtrat, in..."


In [23]:
# Assume df is your dataframe and 'text' column contains the document texts.

# Extract the 'text' column as a list of documents
documents = corpus_french['text'].tolist()

documents

['La production de café au Costa Rica représente en 2016 environ 1,2 % de la production mondiale de café, ce qui fait le  grand producteur du monde derrière la Côte d\'Ivoire.\n\nParticularités \nLes grains de café du Costa Rica, sont considérés comme parmi les meilleurs dans le monde. Tarrazú est pensé pour produire la plus désirable des grains de café au Costa Rica. En 2012, le café de Tarrazú est devenu le plus cher des cafés vendus par Starbucks dans 48 de leurs magasins aux États-Unis.\n\nHistoire \n\nL\'histoire de la caféiculture au Costa Rica débute dès la période coloniale puis prend son essor dès les années 1820-1830, quand les autorités coloniales prirent des premières mesures fiscales visant à développer, entre autres, la culture caféière. À l\'indépendance, il avait 17.000 caféiers, permettant l\'exportation de 2 quintaux de café au Panama. Le nouveau gouvernement distribua gratuitement des terres aux personnes qui s\'engageaient à cultiver du café, tandis que les autorité

In [24]:
def create_vocabulary_frequency(corpus, vocab_len):
    '''Select top-k (k = vocab_len) words in term of frequencies as vocabulary'''
    
    count = {} # dictionary that contains the frequency of each word count[word] = freq
    for document in corpus:
        for word in document.split():
            if word in count:
                count[word] += 1
            else:
                count[word] = 1
    
    sorted_count_by_freq = sorted(count.items(), key=lambda kv: kv[1], reverse=True)
    vocabulary = [x[0] for x in sorted_count_by_freq[:vocab_len]]
    
    return vocabulary

In [28]:
vocab_freq = create_vocabulary_frequency(documents, 2000)

In [29]:
vocab_freq

['de',
 'la',
 'et',
 'le',
 'à',
 'en',
 'du',
 'des',
 'les',
 'est',
 ':',
 'un',
 'par',
 'dans',
 'au',
 'une',
 'pour',
 'qui',
 'Le',
 'sur',
 'il',
 'avec',
 'que',
 '||',
 'La',
 'son',
 'a',
 'Il',
 'se',
 'Les',
 ',',
 'plus',
 '«',
 'sont',
 'En',
 'aux',
 'sa',
 '.',
 'ou',
 'deux',
 'comme',
 '-',
 'ce',
 'ses',
 ';',
 "d'un",
 'pas',
 '»',
 'ne',
 'été',
 'mais',
 "d'une",
 '|',
 'cette',
 'fait',
 'aussi',
 'entre',
 'dont',
 'ont',
 'lui',
 'où',
 'premier',
 'leur',
 'alors',
 'même',
 'France',
 'elle',
 '|-',
 'ainsi',
 'sous',
 'première',
 'contre',
 "qu'il",
 'était',
 'également',
 'puis',
 'Grand',
 'y',
 'Au',
 'années',
 'ville',
 'après',
 'trois',
 'Prix',
 'À',
 'fut',
 'depuis',
 'être',
 '».',
 '()',
 'lors',
 'nom',
 'avant',
 'plusieurs',
 'sans',
 'français',
 'Elle',
 'très',
 'tout',
 'ces',
 'partie',
 'vers',
 '»,',
 'peut',
 'of',
 'Dans',
 'place',
 'titre',
 'on',
 'nouveau',
 'The',
 'Notes',
 'fin',
 'Ce',
 'monde',
 'groupe',
 'fois',
 'Jean

In [30]:
len(vocab_freq)

2000

In [33]:
import numpy as np
def construct_term_document_matrix(vocabulary, documents):
    matrix = np.zeros((len(vocabulary), len(documents)))
    for j, document in enumerate(documents):
        counter = Counter(document.split())
        for i, word in enumerate(vocabulary):
                matrix[i,j] = counter[word]
            
    return matrix

In [34]:
term_doc_matrix_freq = construct_term_document_matrix(vocab_freq, documents)

In [35]:
term_doc_matrix_freq


array([[ 48., 287., 291., ..., 273.,  43.,  70.],
       [ 32., 127., 114., ..., 154.,  17.,  58.],
       [ 11.,  92.,  75., ...,  98.,  17.,  37.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   1.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [36]:
term_doc_matrix_freq.shape

(2000, 200)

In [37]:
def truncated_svd(term_doc_matrix, num_val):
    K, S, Dt = np.linalg.svd(term_doc_matrix, full_matrices=False)
    S_matrix = np.diag(S)
    K_sel = K[:, :num_val]
    S_sel = S_matrix[:num_val, :num_val]
    Dt_sel = Dt[:num_val, :]
    
    return K_sel, S_sel, Dt_sel

In [38]:
K_freq, S_freq, Dt_freq = truncated_svd(term_doc_matrix_freq, 100)

In [39]:
K_freq.shape, S_freq.shape, Dt_freq.shape

((2000, 100), (100, 100), (100, 200))

In [40]:
K_freq

array([[-7.04170933e-01,  7.34253435e-03,  1.30854010e-01, ...,
         5.96422228e-03, -1.36285528e-02, -2.18724730e-02],
       [-3.44668867e-01,  2.43117123e-02,  2.41419202e-01, ...,
        -5.76272723e-04, -1.22170381e-02,  3.93317090e-02],
       [-2.51238369e-01,  1.44767769e-02, -8.38934028e-02, ...,
         2.38314862e-02,  3.47301946e-02,  2.37096020e-03],
       ...,
       [-2.39426689e-04,  4.49947768e-05, -1.25326575e-04, ...,
        -6.58745026e-02, -8.25602847e-03, -6.86447770e-02],
       [-5.38783698e-04,  1.48743133e-04,  2.51517027e-04, ...,
         3.71273625e-03,  1.10583385e-03, -5.01403682e-03],
       [-4.01898914e-04,  9.67993059e-05,  9.54770627e-05, ...,
         6.79844727e-03,  8.61844334e-03, -1.10805760e-02]])

In [41]:
query = ['What organization proposed listing PFOA under the Stockholm Convention?']

In [42]:
def query_to_document_vector(query, vocabulary):
    vector = np.zeros(len(vocabulary))
    for word in query:
        try:
            vector[vocabulary.index(word)] += 1
        except:
            # if query word is not in vocabulary, ignore it
            pass
    return vector

In [43]:
def construct_query_vector(query, vocabulary, K_s, S_s, Dt_s):
    q = query_to_document_vector(query, vocabulary)
    S_inv = np.linalg.inv(S_s)

    q_trans = np.dot(np.dot(q, K_s), S_inv)
    
    return q_trans

In [44]:
query_vector_freq = construct_query_vector(query, vocab_freq, K_freq, S_freq, Dt_freq)

In [45]:
query_vector_freq

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [46]:
def cosine_similarity(v1, v2):
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy*1.0/math.sqrt(sumxx*sumyy)

In [47]:
def retrieve_documents(query_vector, top_k, Dt_sel):
    scores = [[cosine_similarity(query_vector, Dt_sel[:,d]), d] for d in range(len(documents))]
    scores.sort(key=lambda x: -x[0])
    doc_ids = []
    retrieved = []
    for i in range(top_k):
        doc_ids.append(scores[i][1])
        retrieved.append(orig_docs[scores[i][1]])
    return doc_ids, retrieved

In [48]:
retrieved_ids_freq, retrieved_docs_freq = retrieve_documents(query_vector_freq, 10, Dt_freq)
print(retrieved_docs_freq)

  return sumxy*1.0/math.sqrt(sumxx*sumyy)


NameError: name 'orig_docs' is not defined