## Text

In [2]:
import os
import numpy as np
import re

In [11]:
class BOW:
    """Bag-of-words index over tagged annotation files with tf/idf scoring.

    Typical use: ``create()`` reads a corpus, ``form_matrix()`` derives the
    tf and idf matrices, and ``get_query_score()`` compares two indexed
    documents.

    ``k_val`` scales the term-frequency formula and ``b_val`` weights the
    document-length normalisation inside it (the formula resembles the
    BM25 term-frequency component).
    """

    # Tags whose enclosed text contributes words, in scanning order.
    _TEXT_TAGS = ('TITLE', 'DESCRIPTION', 'NOTES', 'LOCATION')
    # Characters trimmed from both ends of every whitespace-separated token.
    _STRIP_CHARS = '.,;:!?()[]\\{}\'"'

    def __init__(self, k_val: float, b_val: float) -> None:
        # Constants
        self.k_val = k_val
        self.b_val = b_val

        # Vocabulary maps each word in the corpus to the number of
        # documents it appears in.
        self.vocabulary = {}
        self.vocabulary_arr = None
        self.vocab_size = None
        # Maps each word to its column in the document/query vectors.
        self._word_index = {}

        # Documents maps a document number to {word: count in that document}.
        self.documents = {}
        self.document_vectors = None
        self.doc_count = None
        self.max_docs = None
        self.avg_doc_length = None

        # score matrices
        self.tf_matrix = None
        self.idf_matrix = None

    def create(self, path_to_annotations: str) -> None:
        """Index every annotation file found under ``path_to_annotations``.

        Files are expected to be named ``<number>.<ext>``; the number becomes
        the document's row in ``document_vectors``.  Files whose name does
        not start with an integer are skipped.  Each visited directory is
        printed.

        Raises:
            ValueError: if no annotation file was found.
        """
        # Start from a clean index so that calling create() again does not
        # double-count document frequencies.
        self.vocabulary = {}
        self.documents = {}
        self.doc_count = 0

        for root, _dirs, files in os.walk(path_to_annotations, topdown=True):
            print(root)
            for file in files:
                try:
                    file_num = int(file.split('.')[0])
                except ValueError:
                    # Not a numbered annotation file; ignore it.
                    continue
                # parses the annotation file and adds the words to the vocabulary
                self.parse_annotation_file(os.path.join(root, file), file_num)
                self.doc_count += 1

        if not self.documents:
            raise ValueError(
                "no annotation files found under {!r}".format(path_to_annotations)
            )

        self.vocab_size = len(self.vocabulary)
        self.vocabulary_arr = np.array(list(self.vocabulary))
        # Column lookup by dict instead of scanning vocabulary_arr per word.
        self._word_index = {
            word: index for index, word in enumerate(self.vocabulary)
        }

        # make document vectors; rows are addressed by document number, so
        # numbers without a file stay as all-zero rows.
        self.max_docs = max(self.documents) + 1
        self.document_vectors = np.zeros((self.max_docs, self.vocab_size))

        for doc_num, counts in self.documents.items():
            for word, count in counts.items():
                self.document_vectors[doc_num, self._word_index[word]] = count

        self.avg_doc_length = np.sum(self.document_vectors) / self.doc_count

    def form_matrix(self):
        """Compute and return ``(tf_matrix, idf_matrix)`` for the corpus.

        idf is ``log((doc_count + 1) / (doc_freq + 0.5))`` per word; tf is
        ``k * f / (f + k * (1 - b + b * doc_len / avg_doc_length))`` per
        document/word pair, where ``f`` is the raw count.

        Raises:
            RuntimeError: if ``create()`` has not been run.
        """
        if self.vocabulary_arr is None or self.document_vectors is None:
            raise RuntimeError("vocabulary is none; run create() first")

        # calculate idf
        doc_freq = np.array(list(self.vocabulary.values()))
        self.idf_matrix = np.log((self.doc_count + 1) / (doc_freq + 0.5))

        # calculate tf; keepdims makes the per-document length a column so
        # it broadcasts across every word of that document.
        doc_lengths = np.sum(self.document_vectors, axis=1, keepdims=True)
        length_norm = self.k_val * (
            1 - self.b_val + self.b_val * (doc_lengths / self.avg_doc_length)
        )
        self.tf_matrix = (
            self.k_val * self.document_vectors
            / (length_norm + self.document_vectors)
        )

        return self.tf_matrix, self.idf_matrix

    def get_query_score(self, query_num: int, doc_num: int):
        """Return the similarity score between two indexed documents.

        The score is ``sum(tf[query_num] * tf[doc_num] * idf ** 2)``.

        Raises:
            RuntimeError: if ``form_matrix()`` has not been run.
        """
        if self.idf_matrix is None or self.tf_matrix is None:
            raise RuntimeError(
                "tf/idf matrices are none; run form_matrix() first"
            )

        tf_q = self.tf_matrix[query_num]
        tf_d = self.tf_matrix[doc_num]
        idf = self.idf_matrix
        return np.sum(tf_q * tf_d * idf * idf)

    def get_query_tf(self, query_vector):
        """Apply the corpus tf formula to a raw query count vector.

        Raises:
            RuntimeError: if ``create()`` has not been run.
        """
        if self.avg_doc_length is None:
            raise RuntimeError("index is empty; run create() first")

        length_norm = self.k_val * (
            1 - self.b_val
            + self.b_val * (np.sum(query_vector) / self.avg_doc_length)
        )
        return self.k_val * query_vector / (length_norm + query_vector)

    def get_query_vector(self, path_to_query: str):
        """Read a query file and return its word counts as a vocabulary vector.

        Words that are not part of the indexed vocabulary are ignored, since
        they have no column in the vector.

        Raises:
            RuntimeError: if ``create()`` has not been run.
        """
        if self.vocabulary_arr is None:
            raise RuntimeError("vocabulary is none; run create() first")

        with open(path_to_query, 'r', encoding='latin-1') as f:
            doc = f.read()

        query_tf_vector = np.zeros(self.vocab_size)
        for word, count in self.get_words_in_doc(doc).items():
            word_index = self._word_index.get(word)
            if word_index is not None:
                query_tf_vector[word_index] = count

        return query_tf_vector

    def parse_annotation_file(self, path_to_annotation_file: str, file_number: int) -> None:
        """Read one annotation file and register its words.

        Stores the file's word counts under ``file_number`` in
        ``documents`` and increments the document frequency of every
        distinct word in ``vocabulary``.
        """
        # open in latin-1 encoding to avoid UnicodeDecodeError
        with open(path_to_annotation_file, 'r', encoding='latin-1') as f:
            doc = f.read()

        words_in_doc = self.get_words_in_doc(doc)
        self.documents[file_number] = words_in_doc

        for word in words_in_doc:
            self.vocabulary[word] = self.vocabulary.get(word, 0) + 1

    def get_words_in_doc(self, doc: str) -> dict:
        """Return ``{word: count}`` for the text inside the indexed tags.

        Text between ``<TITLE>``, ``<DESCRIPTION>``, ``<NOTES>`` and
        ``<LOCATION>`` tags is split on whitespace; each token has
        surrounding punctuation stripped and is lower-cased.  Tokens that
        consist only of punctuation are dropped.
        """
        # replace \n with space so that '.' in the patterns can span lines
        doc = doc.replace('\n', ' ')

        words_in_doc = {}
        for tag in self._TEXT_TAGS:
            pattern = r'<{0}>(.*?)</{0}>'.format(tag)
            for section in re.findall(pattern, doc):
                for token in section.split():
                    word = token.strip(self._STRIP_CHARS).lower()
                    if not word:
                        # e.g. a standalone ';' leaves nothing to count
                        continue
                    words_in_doc[word] = words_in_doc.get(word, 0) + 1

        return words_in_doc

In [12]:
# Build the bag-of-words index over the annotation corpus.
k_val, b_val = 1.5, 0.75
path_to_annotations = './Smol_set/annotations_complete_eng/'

bow = BOW(k_val=k_val, b_val=b_val)
bow.create(path_to_annotations=path_to_annotations)

./Smol_set/annotations_complete_eng/
./Smol_set/annotations_complete_eng/00
./Smol_set/annotations_complete_eng/01
./Smol_set/annotations_complete_eng/02
./Smol_set/annotations_complete_eng/03
./Smol_set/annotations_complete_eng/04
./Smol_set/annotations_complete_eng/05


In [13]:
# Derive the tf and idf matrices from the indexed corpus; they are also
# stored on `bow` and are required before get_query_score() can be used.
tf_matrix, idf_matrix = bow.form_matrix()

In [14]:
# Score one pair of indexed documents as a quick sanity check.
sample = bow.get_query_score(query_num=2095, doc_num=4323)
print(sample)

4.9214412247644646


In [20]:
# Compare two annotation files: compute their similarity score, then print
# the raw contents of each file followed by the score.
file_1 = 4323
file_2 = 2095

out = bow.get_query_score(file_1, file_2)

# The sub-directory is the file number's thousands, zero-padded to two digits
# (4323 -> "04").  Taking the first digit instead only works for four-digit
# numbers and can never address the "00" directory.
# NOTE(review): layout inferred from the DOCNO paths in the annotations — confirm.
path_1 = f'./Smol_set/annotations_complete_eng/{file_1 // 1000:02d}/{file_1}.eng'
path_2 = f'./Smol_set/annotations_complete_eng/{file_2 // 1000:02d}/{file_2}.eng'

print("-------------------------------")
print("file 1")
with open(path_1, 'r', encoding='latin-1') as f:
    doc_1 = f.read()
print(doc_1)

print("-------------------------------")
print("file 2")
with open(path_2, 'r', encoding='latin-1') as f:
    doc_2 = f.read()
print(doc_2)

print("-------------------------------")
print("score: ", out)

-------------------------------
file 1
<DOC>
<DOCNO>annotations/04/4323.eng</DOCNO>
<TITLE>Photo of the host family Schuler</TITLE>
<DESCRIPTION>two men and a woman are sitting at a table and a dog is sitting next to it; there are flower pots and a black book on the table, and white flowers and a wall behind it;</DESCRIPTION>
<NOTES></NOTES>
<LOCATION>Buenos Aires, Argentina</LOCATION>
<DATE>May 2002</DATE>
<IMAGE>images/04/4323.jpg</IMAGE>
<THUMBNAIL>thumbnails/04/4323.jpg</THUMBNAIL>
</DOC>
-------------------------------
file 2
<DOC>
<DOCNO>annotations/02/2095.eng</DOCNO>
<TITLE>The train from Cusco to Puno in the Altiplano</TITLE>
<DESCRIPTION>an orange and yellow train on a pass; there are snow covered mountains on the left and in the background, and some tourists stretching their legs on the right;</DESCRIPTION>
<NOTES>The train from Cuzco to Puno stops at the La Raya Pass; </NOTES>
<LOCATION>Cuzco, Peru</LOCATION>
<DATE>August 2002</DATE>
<IMAGE>images/02/2095.jpg</IMAGE>
<THUMB

In [21]:
# Compare two annotation files: compute their similarity score, then print
# the raw contents of each file followed by the score.
file_1 = 3191
file_2 = 3193

out = bow.get_query_score(file_1, file_2)

# The sub-directory is the file number's thousands, zero-padded to two digits
# (3191 -> "03").  Taking the first digit instead only works for four-digit
# numbers and can never address the "00" directory.
# NOTE(review): layout inferred from the DOCNO paths in the annotations — confirm.
path_1 = f'./Smol_set/annotations_complete_eng/{file_1 // 1000:02d}/{file_1}.eng'
path_2 = f'./Smol_set/annotations_complete_eng/{file_2 // 1000:02d}/{file_2}.eng'

print("-------------------------------")
print("file 1")
with open(path_1, 'r', encoding='latin-1') as f:
    doc_1 = f.read()
print(doc_1)

print("-------------------------------")
print("file 2")
with open(path_2, 'r', encoding='latin-1') as f:
    doc_2 = f.read()
print(doc_2)

print("-------------------------------")
print("score: ", out)

-------------------------------
file 1
<DOC>
<DOCNO>annotations/03/3191.eng</DOCNO>
<TITLE>Anaconda</TITLE>
<DESCRIPTION>an anaconda on a tree trunk in an aquarium;</DESCRIPTION>
<NOTES>Scientific name: Eunectes</NOTES>
<LOCATION>São Paulo, Brazil</LOCATION>
<DATE>February 2002</DATE>
<IMAGE>images/03/3191.jpg</IMAGE>
<THUMBNAIL>thumbnails/03/3191.jpg</THUMBNAIL>
</DOC>
-------------------------------
file 2
<DOC>
<DOCNO>annotations/03/3193.eng</DOCNO>
<TITLE>Anacondas</TITLE>
<DESCRIPTION>close-up photo of two anacondas in an aquarium;</DESCRIPTION>
<NOTES>Scientific name: Eunectes;</NOTES>
<LOCATION>São Paulo, Brazil</LOCATION>
<DATE>February 2002</DATE>
<IMAGE>images/03/3193.jpg</IMAGE>
<THUMBNAIL>thumbnails/03/3193.jpg</THUMBNAIL>
</DOC>
-------------------------------
score:  97.8458485051738


In [22]:
# Compare two annotation files: compute their similarity score, then print
# the raw contents of each file followed by the score.
file_1 = 5152
file_2 = 5155

out = bow.get_query_score(file_1, file_2)

# The sub-directory is the file number's thousands, zero-padded to two digits
# (5152 -> "05").  Taking the first digit instead only works for four-digit
# numbers and can never address the "00" directory.
# NOTE(review): layout inferred from the DOCNO paths in the annotations — confirm.
path_1 = f'./Smol_set/annotations_complete_eng/{file_1 // 1000:02d}/{file_1}.eng'
path_2 = f'./Smol_set/annotations_complete_eng/{file_2 // 1000:02d}/{file_2}.eng'

print("-------------------------------")
print("file 1")
with open(path_1, 'r', encoding='latin-1') as f:
    doc_1 = f.read()
print(doc_1)

print("-------------------------------")
print("file 2")
with open(path_2, 'r', encoding='latin-1') as f:
    doc_2 = f.read()
print(doc_2)

print("-------------------------------")
print("score: ", out)

-------------------------------
file 1
<DOC>
<DOCNO>annotations/05/5152.eng</DOCNO>
<TITLE>The mountains around Chivay</TITLE>
<DESCRIPTION>a mountain landscape with high, steep, bald mountains and a snow covered peak in the background;</DESCRIPTION>
<NOTES></NOTES>
<LOCATION>Chivay, Peru</LOCATION>
<DATE>September 2002</DATE>
<IMAGE>images/05/5152.jpg</IMAGE>
<THUMBNAIL>thumbnails/05/5152.jpg</THUMBNAIL>
</DOC>
-------------------------------
file 2
<DOC>
<DOCNO>annotations/05/5155.eng</DOCNO>
<TITLE>A condor flying</TITLE>
<DESCRIPTION>a flying condor with a bald mountain range in the background;</DESCRIPTION>
<NOTES></NOTES>
<LOCATION>Cabanaconde, Peru</LOCATION>
<DATE>September 2002</DATE>
<IMAGE>images/05/5155.jpg</IMAGE>
<THUMBNAIL>thumbnails/05/5155.jpg</THUMBNAIL>
</DOC>
-------------------------------
score:  8.419743058659392


In [23]:
# Compare two annotation files: compute their similarity score, then print
# the raw contents of each file followed by the score.
file_1 = 4002
file_2 = 4005

out = bow.get_query_score(file_1, file_2)

# The sub-directory is the file number's thousands, zero-padded to two digits
# (4002 -> "04").  Taking the first digit instead only works for four-digit
# numbers and can never address the "00" directory.
# NOTE(review): layout inferred from the DOCNO paths in the annotations — confirm.
path_1 = f'./Smol_set/annotations_complete_eng/{file_1 // 1000:02d}/{file_1}.eng'
path_2 = f'./Smol_set/annotations_complete_eng/{file_2 // 1000:02d}/{file_2}.eng'

print("-------------------------------")
print("file 1")
with open(path_1, 'r', encoding='latin-1') as f:
    doc_1 = f.read()
print(doc_1)

print("-------------------------------")
print("file 2")
with open(path_2, 'r', encoding='latin-1') as f:
    doc_2 = f.read()
print(doc_2)

print("-------------------------------")
print("score: ", out)

-------------------------------
file 1
<DOC>
<DOCNO>annotations/04/4002.eng</DOCNO>
<TITLE>At the beach</TITLE>
<DESCRIPTION>a sandy beach with a few people sitting in the shade of a palm tree; a wooded slope with houses in the background;</DESCRIPTION>
<NOTES></NOTES>
<LOCATION>Tobago, Trinidad</LOCATION>
<DATE>February 2002</DATE>
<IMAGE>images/04/4002.jpg</IMAGE>
<THUMBNAIL>thumbnails/04/4002.jpg</THUMBNAIL>
</DOC>
-------------------------------
file 2
<DOC>
<DOCNO>annotations/04/4005.eng</DOCNO>
<TITLE>A wave breaking at the beach</TITLE>
<DESCRIPTION>a boy in front of a breaking wave at a sandy beach; two boats at sea in the background;</DESCRIPTION>
<NOTES></NOTES>
<LOCATION>Tobago, Trinidad</LOCATION>
<DATE>February 2002</DATE>
<IMAGE>images/04/4005.jpg</IMAGE>
<THUMBNAIL>thumbnails/04/4005.jpg</THUMBNAIL>
</DOC>
-------------------------------
score:  28.095881694416732
