In [7]:
import os
import numpy as np
import re

In [8]:


def parse_annotation_file(path_to_annotation_file, file_number, vocabulary, documents):
    # open in latin-1 encoding to avoid UnicodeDecodeError
    with open(path_to_annotation_file, 'r', encoding='latin-1') as f:
        # read the doc as a string
        doc = f.read()
        # replace \n with space
        doc = doc.replace('\n', ' ')
        # get parts between all <TITLE> and </TITLE> tags
        titles = re.findall(r'<TITLE>(.*?)</TITLE>', doc)
        # get parts between all <DESCRIPTION> and </DESCRIPTION> tags
        descriptions = re.findall(r'<DESCRIPTION>(.*?)</DESCRIPTION>', doc)
        # get parts between all <NOTES> and </NOTES> tags
        notes = re.findall(r'<NOTES>(.*?)</NOTES>', doc)
        # get parts between all <LOCATION> and </LOCATION> tags
        locations = re.findall(r'<LOCATION>(.*?)</LOCATION>', doc)

        words_in_doc = {}

        # split into words and add to vocabulary
        for title in titles:
            for word in title.split():
                word = word.strip(' .,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for description in descriptions:
            for word in description.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for note in notes:
            for word in note.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for location in locations:
            for word in location.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)

        # add the words in the document to the vocabulary
        # for word in words_in_doc:
        #     if word in vocabulary:
        #         vocabulary[word].add(file_number)
        #     else:
        #         vocabulary[word] = set([file_number])
        documents[file_number] = words_in_doc

        for word in words_in_doc:
            if word in vocabulary:
                vocabulary[word] += 1
            else:
                vocabulary[word] = 1
    return


def form_vocab(path_to_annotations):
    vocabulary = {}
    documents = {}
    doc_count = 0

    for (root, dirs, files) in os.walk(path_to_annotations, topdown=True):
        print(root)
        # print(dirs)
        # print(files)
        for file in files:
            file_num = file.split('.')[0]
            # parses the annotation file and adds the words to the vocabulary
            parse_annotation_file(os.path.join(root, file), file_num, vocabulary, documents)
            doc_count += 1
            # print(file)

    # # print('vocabulary: ', vocabulary)
    # with open('vocabulary.txt', 'w') as f:
    #     for word in vocabulary:
    #         f.write(word + '\t:\t' + str(vocabulary[word]) + '\n')

    vocab_size = len(vocabulary)
    # print('vocab_size: ', vocab_size)

    # convert the vocabulary to a numpy array
    vocabulary_arr = np.array(list(vocabulary))
    # print('vocabulary: ', vocabulary_arr)

    # print('number of documents: ', doc_count)
    return vocabulary, vocabulary_arr, vocab_size, doc_count, documents
# print(documents['4323'])


def form_matrix(vocabulary, vocabulary_arr, vocab_size, doc_count, documents, max_docs, k_val, b_val):
    # make document vectors
    document_vectors = np.zeros((max_docs, vocab_size))

    for doc_num in documents:
        for word in documents[doc_num]:
            word_index = np.where(vocabulary_arr == word)[0][0]
            document_vectors[int(
                doc_num)][word_index] = documents[doc_num][word]

    # calculate average document length
    avg_doc_length = np.sum(document_vectors) / doc_count
    print('avg_doc_length: ', avg_doc_length)

    with open('document_vectors.txt', 'w') as f:
        for i in range(doc_count):
            f.write(str(i) + '\t:\t' + str(document_vectors[i]) + '\n')

    # print the 4323th document vector
    for i in range(vocab_size):
        if document_vectors[4323][i] != 0:
            print(vocabulary_arr[i] + " : " + str(document_vectors[4323][i]))
    
    print("check : " + str(np.sum(document_vectors[4323])))

    # calculate idf matrix
    idf_matrix = np.zeros((vocab_size, 1))
    for i in range(vocab_size):
        idf_matrix[i] = np.log(
            (doc_count + 1) / (vocabulary[vocabulary_arr[i]] + 0.5))

    # print('idf_matrix: ', idf_matrix)

    # tf matrix
    # tf_matrix = document_vectors     # temporary, will replace with okapi tf
    # k_val = 1.5
    # b_val = 0.75
    tf_matrix = np.zeros((max_docs, vocab_size))
    for i in range(max_docs):
        d_i = np.sum(document_vectors[i])
        for j in range(vocab_size):
            tf_matrix[i][j] = (k_val) * document_vectors[i][j] / (k_val * (1 - b_val + (b_val * ((d_i) / avg_doc_length))) + document_vectors[i][j])
    return tf_matrix, idf_matrix


def query_tf(path_to_query, vocabulary_arr, vocab_size):
    query_tf_vector = np.zeros(vocab_size)
    with open(path_to_query, 'r', encoding='latin-1') as f:
        # read the doc as a string
        doc = f.read()
        # replace \n with space
        doc = doc.replace('\n', ' ')
        # get parts between all <TITLE> and </TITLE> tags
        titles = re.findall(r'<TITLE>(.*?)</TITLE>', doc)
        # get parts between all <DESCRIPTION> and </DESCRIPTION> tags
        descriptions = re.findall(r'<DESCRIPTION>(.*?)</DESCRIPTION>', doc)
        # get parts between all <NOTES> and </NOTES> tags
        notes = re.findall(r'<NOTES>(.*?)</NOTES>', doc)
        # get parts between all <LOCATION> and </LOCATION> tags
        locations = re.findall(r'<LOCATION>(.*?)</LOCATION>', doc)

        words_in_doc = {}    
        for title in titles:
            for word in title.split():
                word = word.strip(' .,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for description in descriptions:
            for word in description.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for note in notes:
            for word in note.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1
                # words_in_doc.add(word)
                # vocabulary.add(word)
        for location in locations:
            for word in location.split():
                word = word.strip('.,;:!?()[]\{\}\'\"')
                word = word.lower()
                if word not in words_in_doc:
                    words_in_doc[word] = 0
                words_in_doc[word] += 1

        for word in words_in_doc:
            word_index = np.where(vocabulary_arr == word)[0][0]
            query_tf_vector[word_index] = words_in_doc[word]
        
    return query_tf_vector
            




In [9]:
path_to_annotations = '../Smol_set/annotations_complete_eng/'
max_docs = 6000
k_val = 1.5
b_val = 0.75

vocabulary, vocabulary_arr, vocab_size, doc_count, documents = form_vocab(path_to_annotations)
tf_matrix, idf_matrix = form_matrix(vocabulary, vocabulary_arr, vocab_size, doc_count, documents, max_docs, k_val, b_val)

with open('vocabulary.txt', 'w') as f:
    for word in vocabulary:
        f.write(word + '\t:\t' + str(vocabulary[word]) + '\n')

print('vocab_size: ', vocab_size)
print('vocabulary: ', vocabulary_arr)
print('number of documents: ', doc_count)
print(idf_matrix)
print(tf_matrix)
print(tf_matrix.shape)
print(np.sum(tf_matrix))

sample = query_tf('../Smol_set/annotations_complete_eng/04/4323.eng', vocabulary_arr, vocab_size)

print(np.sum(sample))



../Smol_set/annotations_complete_eng/
../Smol_set/annotations_complete_eng/01
../Smol_set/annotations_complete_eng/05
../Smol_set/annotations_complete_eng/04
../Smol_set/annotations_complete_eng/00
../Smol_set/annotations_complete_eng/03
../Smol_set/annotations_complete_eng/02
avg_doc_length:  29.707408800300865
a : 5.0
two : 1.0
and : 5.0
the : 2.0
there : 1.0
are : 2.0
on : 1.0
of : 1.0
table : 2.0
at : 1.0
photo : 1.0
white : 1.0
is : 1.0
behind : 1.0
it : 2.0
wall : 1.0
argentina : 1.0
to : 1.0
black : 1.0
flowers : 1.0
next : 1.0
flower : 1.0
pots : 1.0
buenos : 1.0
aires : 1.0
sitting : 2.0
woman : 1.0
men : 1.0
host : 1.0
family : 1.0
book : 1.0
schuler : 1.0
dog : 1.0
check : 46.0
vocab_size:  4155
vocabulary:  ['sheraton' '-' 'exterior' ... 'self-built' 'steeply' 'orange-brown']
number of documents:  2659
[[7.48061629]
 [1.93935275]
 [2.46776124]
 ...
 [7.48061629]
 [7.48061629]
 [7.48061629]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0.

In [10]:
# path_to_annotations = '../Smol_set/annotations_complete_eng/'
# vocabulary = set()

# def parse_annotation_file(path_to_annotation_file):
#     # open in latin-1 encoding to avoid UnicodeDecodeError
#     with open(path_to_annotation_file, 'r', encoding='latin-1') as f:
#         # read the doc as a string
#         doc = f.read()
#         # replace \n with space
#         doc = doc.replace('\n', ' ')
#         # get parts between all <TITLE> and </TITLE> tags
#         titles = re.findall(r'<TITLE>(.*?)</TITLE>', doc)
#         # get parts between all <DESCRIPTION> and </DESCRIPTION> tags
#         descriptions = re.findall(r'<DESCRIPTION>(.*?)</DESCRIPTION>', doc)
#         # get parts between all <NOTES> and </NOTES> tags
#         notes = re.findall(r'<NOTES>(.*?)</NOTES>', doc)
#         # get parts between all <LOCATION> and </LOCATION> tags
#         locations = re.findall(r'<LOCATION>(.*?)</LOCATION>', doc)

#         # split into words and add to vocabulary
#         for title in titles:
#             for word in title.split():
#                 word = word.strip(' .,;:!?()[]\{\}\'\"')
#                 word = word.lower()
#                 vocabulary.add(word)
#         for description in descriptions:
#             for word in description.split():
#                 word = word.strip('.,;:!?()[]\{\}\'\"')
#                 word = word.lower()
#                 vocabulary.add(word)
#         for note in notes:
#             for word in note.split():
#                 word = word.strip('.,;:!?()[]\{\}\'\"')
#                 word = word.lower()
#                 vocabulary.add(word)
#         for location in locations:
#             for word in location.split():
#                 word = word.strip('.,;:!?()[]\{\}\'\"')
#                 word = word.lower()
#                 vocabulary.add(word)

#         # print('titles: ', titles)
#         # print('descriptions: ', descriptions)
#         # print('notes: ', notes)
#         # print('locations: ', locations)

#     return 

# for (root, dirs, files) in os.walk(path_to_annotations, topdown=True):
#     print(root)
#     # print(dirs)
#     # print(files)
#     for file in files:
#         # parses the annotation file and adds the words to the vocabulary
#         parse_annotation_file(os.path.join(root, file))

# # print('vocabulary: ', vocabulary)
# with open('vocabulary.txt', 'w') as f:
#     for word in vocabulary:
#         f.write(word + '\n')

# vocab_size = len(vocabulary)
# print('vocab_size: ', vocab_size)

# # convert the vocabulary to a numpy array
# vocabulary = np.array(list(vocabulary))
# print('vocabulary: ', vocabulary)

# # check if we can find the word 'the' in the vocabulary
# # print('the' in vocabulary)


In [11]:
# test = {'a': 1, 'b': 2, 'c': 3}

# if (test.get('d')):
#     print("yes")
# else:
#     print("no")

# print(test.get('d'))