# Tokenizing

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
import re

In [2]:
def stem_text(text, ps):
    stemmed = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed

In [3]:
text_map = {}
ps = PorterStemmer()

def parse_file(file_path):
    current_doc_no = None
    reading_text = False
    doc_text = ""
    
    # parse the doc to get doc no and corresponding text
    with open(file_path, 'r', encoding='ISO-8859-1', errors='ignore') as file:
        lines = file.readlines()

    for line in lines:
        if "<DOCNO>" in line:
            current_doc_no = line.strip().replace('<DOCNO>', '').replace('</DOCNO>', '')
        elif "<TEXT>" in line:
            reading_text = True
        elif "</TEXT>" in line:
            reading_text = False
        elif reading_text:
            doc_text += line.strip() + ' '
        elif "</DOC>" in line:
            if current_doc_no is not None:
                text_map[current_doc_no.strip()] = doc_text.strip()
                doc_text = "" 

In [4]:
folder = "../IR_data/AP_DATA/ap89_collection"

for filename in os.listdir(folder):
    if filename != 'readme':
        file_path = os.path.join(folder, filename)
        parse_file(file_path)
        
print("Parsing completed")

Parsing completed


In [5]:
#total no of docs
print(len(text_map))

84678


In [6]:
print(text_map["AP890109-0262"])

Today is Friday, Jan. 20, the 20th day of 1989. There are 345 days left in the year. Today's highlight in history: On Jan. 20, 1981, Iran released the 52 Americans it had held hostage for 444 days, minutes after the presidency passed from Jimmy Carter to Ronald Reagan. On this date: In 1265, England's Parliament, representing districts, cities and boroughs, met for the first time. In 1801, John Marshall was appointed U.S. chief justice. In 1887, the U.S. Senate approved an agreement to lease Pearl Harbor in Hawaii as a naval base. In 1936, Britain's King George V died. He was succeeded by Edward VIII. In 1937, President Franklin D. Roosevelt became the first chief executive to be inaugurated on Jan. 20 instead of March 4, because of the 20th Amendment to the Constitution. In 1942, Nazi officials held the notorious Wannsee conference in Berlin, at which they decided on their ``final solution'' calling for the extermination of Europe's Jews. In 1945, President Franklin D. Roosevelt was s

In [7]:
from collections import defaultdict

TOKEN_REGEX = r'\b[a-zA-Z0-9.]+\b'

def tokenize_text_map(text_map):
    tokenized_dict_map = {}

    for doc_id, document in text_map.items():
        tokens = re.findall(TOKEN_REGEX, document.lower())
        tokenized_text = [token for token in tokens]
        tokenized_dict_map[doc_id] = tokenized_text

    return tokenized_dict_map

In [8]:
tokenized_dict_map=tokenize_text_map(text_map)

In [9]:
print("Tokenized Text Map:")
print(tokenized_dict_map["AP890109-0262"])

Tokenized Text Map:
['today', 'is', 'friday', 'jan', '20', 'the', '20th', 'day', 'of', '1989', 'there', 'are', '345', 'days', 'left', 'in', 'the', 'year', 'today', 's', 'highlight', 'in', 'history', 'on', 'jan', '20', '1981', 'iran', 'released', 'the', '52', 'americans', 'it', 'had', 'held', 'hostage', 'for', '444', 'days', 'minutes', 'after', 'the', 'presidency', 'passed', 'from', 'jimmy', 'carter', 'to', 'ronald', 'reagan', 'on', 'this', 'date', 'in', '1265', 'england', 's', 'parliament', 'representing', 'districts', 'cities', 'and', 'boroughs', 'met', 'for', 'the', 'first', 'time', 'in', '1801', 'john', 'marshall', 'was', 'appointed', 'u.s', 'chief', 'justice', 'in', '1887', 'the', 'u.s', 'senate', 'approved', 'an', 'agreement', 'to', 'lease', 'pearl', 'harbor', 'in', 'hawaii', 'as', 'a', 'naval', 'base', 'in', '1936', 'britain', 's', 'king', 'george', 'v', 'died', 'he', 'was', 'succeeded', 'by', 'edward', 'viii', 'in', '1937', 'president', 'franklin', 'd', 'roosevelt', 'became', 't

In [10]:
sw_path = "../config/stoplistnltk.txt"

with open(sw_path) as file:
    stopwords = file.read().splitlines()

print(len(stopwords))

418


In [11]:
import string

def process_content(text):
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in stopwords])
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

In [12]:
def stem_text(text, ps):
    stemmed = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed

In [13]:
tokenized_dict_nonstem_map = {}

for doc_id, document in text_map.items():
    processed_text = process_content(document)
    tokens = processed_text.split()
    tokenized_dict_nonstem_map[doc_id] = tokens

In [14]:
print("Tokenized non-stem map:")
print(tokenized_dict_nonstem_map["AP890109-0262"])

Tokenized non-stem map:
['today', 'friday', 'jan', '20', '20th', '1989', '345', 'days', 'left', 'year', 'todays', 'highlight', 'history', 'jan', '20', '1981', 'iran', 'released', '52', 'americans', 'held', 'hostage', '444', 'days', 'minutes', 'presidency', 'passed', 'jimmy', 'carter', 'ronald', 'reagan', 'date', '1265', 'englands', 'parliament', 'representing', 'districts', 'cities', 'boroughs', 'met', 'time', '1801', 'john', 'marshall', 'appointed', 'us', 'chief', 'justice', '1887', 'us', 'senate', 'approved', 'agreement', 'lease', 'pearl', 'harbor', 'hawaii', 'naval', 'base', '1936', 'britains', 'king', 'george', 'v', 'died', 'succeeded', 'edward', 'viii', '1937', 'president', 'franklin', 'd', 'roosevelt', 'chief', 'executive', 'inaugurated', 'jan', '20', 'march', '4', '20th', 'amendment', 'constitution', '1942', 'nazi', 'officials', 'held', 'notorious', 'wannsee', 'conference', 'berlin', 'decided', 'final', 'solution', 'calling', 'extermination', 'europes', 'jews', '1945', 'presiden

In [15]:
tokenized_dict_stemmed_map = {}

for doc_id, nonstemmed_tokens in tokenized_dict_nonstem_map.items():
    stemmed_tokens = [stem_text(token, ps) for token in nonstemmed_tokens]
    tokenized_dict_stemmed_map[doc_id] = stemmed_tokens

In [16]:
print("Tokenized stemmed map:")
print(tokenized_dict_stemmed_map["AP890109-0262"])

Tokenized stemmed map:
['today', 'friday', 'jan', '20', '20th', '1989', '345', 'day', 'left', 'year', 'today', 'highlight', 'histori', 'jan', '20', '1981', 'iran', 'releas', '52', 'american', 'held', 'hostag', '444', 'day', 'minut', 'presid', 'pass', 'jimmi', 'carter', 'ronald', 'reagan', 'date', '1265', 'england', 'parliament', 'repres', 'district', 'citi', 'borough', 'met', 'time', '1801', 'john', 'marshal', 'appoint', 'us', 'chief', 'justic', '1887', 'us', 'senat', 'approv', 'agreement', 'leas', 'pearl', 'harbor', 'hawaii', 'naval', 'base', '1936', 'britain', 'king', 'georg', 'v', 'die', 'succeed', 'edward', 'viii', '1937', 'presid', 'franklin', 'd', 'roosevelt', 'chief', 'execut', 'inaugur', 'jan', '20', 'march', '4', '20th', 'amend', 'constitut', '1942', 'nazi', 'offici', 'held', 'notori', 'wannse', 'confer', 'berlin', 'decid', 'final', 'solut', 'call', 'extermin', 'europ', 'jew', '1945', 'presid', 'franklin', 'd', 'roosevelt', 'sworn', 'offic', 'unpreced', 'fourth', 'term', '1977

In [17]:
tokenized_dict_stemmed_map_data = {}

for doc_id, stemmed_tokens in tokenized_dict_stemmed_map.items():
    token_positions = {}
    for position, token in enumerate(stemmed_tokens, start=1):
        if token not in token_positions:
            token_positions[token] = []
        token_positions[token].append(position)
    tokenized_dict_stemmed_map_data[doc_id] = token_positions

In [18]:
print("Tokenized stemmed map with term frequency:")
print(tokenized_dict_stemmed_map_data["AP890109-0262"])

Tokenized stemmed map with term frequency:
{'today': [1, 11, 221, 263], 'friday': [2], 'jan': [3, 14, 77], '20': [4, 15, 78], '20th': [5, 81], '1989': [6], '345': [7], 'day': [8, 24], 'left': [9], 'year': [10, 159, 179, 268], 'highlight': [12], 'histori': [13], '1981': [16], 'iran': [17, 173], 'releas': [18, 155], '52': [19, 257], 'american': [20], 'held': [21, 87], 'hostag': [22, 157], '444': [23], 'minut': [25], 'presid': [26, 70, 100, 115], 'pass': [27], 'jimmi': [28, 110], 'carter': [29, 111], 'ronald': [30], 'reagan': [31], 'date': [32], '1265': [33], 'england': [34], 'parliament': [35], 'repres': [36], 'district': [37], 'citi': [38], 'borough': [39], 'met': [40], 'time': [41, 265], '1801': [42], 'john': [43], 'marshal': [44], 'appoint': [45], 'us': [46, 50, 114], 'chief': [47, 74], 'justic': [48], '1887': [49], 'senat': [51], 'approv': [52], 'agreement': [53], 'leas': [54], 'pearl': [55], 'harbor': [56], 'hawaii': [57], 'naval': [58], 'base': [59], '1936': [60], 'britain': [61], 

In [19]:
tokenized_dict_nonstem_map_data = {}

for doc_id, stemmed_tokens in tokenized_dict_nonstem_map.items():
    token_positions = {}
    for position, token in enumerate(stemmed_tokens, start=1):
        if token not in token_positions:
            token_positions[token] = []
        token_positions[token].append(position)
    tokenized_dict_nonstem_map_data[doc_id] = token_positions

In [20]:
print("Tokenized non-stem map with term frequency:")
print(tokenized_dict_nonstem_map_data["AP890109-0262"])

Tokenized non-stem map with term frequency:
{'today': [1, 263], 'friday': [2], 'jan': [3, 14, 77], '20': [4, 15, 78], '20th': [5, 81], '1989': [6], '345': [7], 'days': [8, 24], 'left': [9], 'year': [10], 'todays': [11, 221], 'highlight': [12], 'history': [13], '1981': [16], 'iran': [17, 173], 'released': [18], '52': [19, 257], 'americans': [20], 'held': [21, 87], 'hostage': [22], '444': [23], 'minutes': [25], 'presidency': [26], 'passed': [27], 'jimmy': [28, 110], 'carter': [29, 111], 'ronald': [30], 'reagan': [31], 'date': [32], '1265': [33], 'englands': [34], 'parliament': [35], 'representing': [36], 'districts': [37], 'cities': [38], 'boroughs': [39], 'met': [40], 'time': [41, 265], '1801': [42], 'john': [43], 'marshall': [44], 'appointed': [45], 'us': [46, 50, 114], 'chief': [47, 74], 'justice': [48], '1887': [49], 'senate': [51], 'approved': [52], 'agreement': [53], 'lease': [54], 'pearl': [55], 'harbor': [56], 'hawaii': [57], 'naval': [58], 'base': [59], '1936': [60], 'britains':

# Creating index and catalog files in chunks of 1000 for stemmed docs

In [21]:
import os

def write_to_index_file(index_file, index_entries, doc_map):
    for term, postings in index_entries.items():
        postings_str = ','.join([f"{doc_map[doc_id]}:[{','.join(map(str, positions))}]" for doc_id, positions in postings.items()])
        index_file.write(f"{postings_str},")

def write_to_catalog_file(catalog_file, term_offsets):
    for term, (offset, size) in sorted(term_offsets.items()): 
        catalog_file.write(f"{term} {offset} {size}\n")

folder_path_index = "./stemmed_index_files/index/"
folder_path_catalog = "./stemmed_index_files/catalog/"
os.makedirs(folder_path_index, exist_ok=True)
os.makedirs(folder_path_catalog, exist_ok=True)

chunk_size = 1000

current_index_chunk = 1
index_entries = {}
index_offset = 0
doc_map = {}
term_offsets = {}

for i, (doc_id, terms) in enumerate(tokenized_dict_stemmed_map_data.items(), start=1):
    doc_map[doc_id] = i
    for term, positions in terms.items():
        if term not in index_entries:
            index_entries[term] = {}
        index_entries[term][doc_id] = positions

    if i % chunk_size == 0:
        index_file_path = os.path.join(folder_path_index, f"index_chunk_{current_index_chunk}.txt")
        catalog_file_path = os.path.join(folder_path_catalog, f"catalog_chunk_{current_index_chunk}.txt")
        index_offset = 0
        with open(index_file_path, "a") as index_file, open(catalog_file_path, "a") as catalog_file:
            for term, postings in index_entries.items():
                write_to_index_file(index_file, {term: postings}, doc_map)
                term_offsets[term] = (index_offset, index_file.tell() - index_offset)
                index_offset = index_file.tell()
            
            write_to_catalog_file(catalog_file, term_offsets)

        index_entries = {}
        term_offsets = {}
        current_index_chunk += 1

if index_entries:
    index_file_path = os.path.join(folder_path_index, f"index_chunk_{current_index_chunk}.txt")
    catalog_file_path = os.path.join(folder_path_catalog, f"catalog_chunk_{current_index_chunk}.txt")
    index_offset = 0
    with open(index_file_path, "a") as index_file, open(catalog_file_path, "a") as catalog_file:
        for term, postings in index_entries.items():
            write_to_index_file(index_file, {term: postings}, doc_map)
            term_offsets[term] = (index_offset, index_file.tell() - index_offset)
            index_offset = index_file.tell() 
        
        write_to_catalog_file(catalog_file, term_offsets)

print("Indexing completed successfully.")


Indexing completed successfully.


# Creating index and catalog files in chunks of 1000 for unstemmed docs

In [22]:
import os

def write_to_index_file(index_file, index_entries, doc_map):
    for term, postings in index_entries.items():
        postings_str = ','.join([f"{doc_map[doc_id]}:[{','.join(map(str, positions))}]" for doc_id, positions in postings.items()])
        index_file.write(f"{postings_str},")

def write_to_catalog_file(catalog_file, term_offsets):
    for term, (offset, size) in sorted(term_offsets.items()): 
        catalog_file.write(f"{term} {offset} {size}\n")


folder_path_index = "./non_stemmed_index_files/index/"
folder_path_catalog = "./non_stemmed_index_files/catalog/"
os.makedirs(folder_path_index, exist_ok=True)
os.makedirs(folder_path_catalog, exist_ok=True)


chunk_size = 1000


current_index_chunk = 1
index_entries = {}
index_offset = 0
doc_map = {}
term_offsets = {}

for i, (doc_id, terms) in enumerate(tokenized_dict_nonstem_map_data.items(), start=1):
    doc_map[doc_id] = i
    for term, positions in terms.items():
        if term not in index_entries:
            index_entries[term] = {}
        index_entries[term][doc_id] = positions

    if i % chunk_size == 0:
        index_file_path = os.path.join(folder_path_index, f"index_chunk_{current_index_chunk}.txt")
        catalog_file_path = os.path.join(folder_path_catalog, f"catalog_chunk_{current_index_chunk}.txt")
        index_offset = 0
        with open(index_file_path, "a") as index_file, open(catalog_file_path, "a") as catalog_file:
            for term, postings in index_entries.items():
                write_to_index_file(index_file, {term: postings}, doc_map)
                term_offsets[term] = (index_offset, index_file.tell() - index_offset)
                index_offset = index_file.tell() 
            
            write_to_catalog_file(catalog_file, term_offsets)

        index_entries = {}
        term_offsets = {}
        current_index_chunk += 1

if index_entries:
    index_file_path = os.path.join(folder_path_index, f"index_chunk_{current_index_chunk}.txt")
    catalog_file_path = os.path.join(folder_path_catalog, f"catalog_chunk_{current_index_chunk}.txt")
    index_offset = 0
    with open(index_file_path, "a") as index_file, open(catalog_file_path, "a") as catalog_file:
        for term, postings in index_entries.items():
            write_to_index_file(index_file, {term: postings}, doc_map)
            term_offsets[term] = (index_offset, index_file.tell() - index_offset)
            index_offset = index_file.tell()  
        
        write_to_catalog_file(catalog_file, term_offsets)

print("Indexing completed successfully.")


Indexing completed successfully.


# Creating doc_map

In [23]:
doc_map_file_path = "./stemmed_index_files/doc_map.txt"

with open(doc_map_file_path, 'w') as doc_map_file:
    for doc_id, index in doc_map.items():
        doc_map_file.write(f"{doc_id}: {index}\n")

print("doc_map stored successfully in doc_map.txt file.")

doc_map stored successfully in doc_map.txt file.


# Merging using merge sort

In [24]:
def merge_files(index_file_1, index_file_2, catalog_file_1, catalog_file_2, merged_index_file, merged_terms_file):
    def read_content_from_file(file_path, offset, size):
        with open(file_path, 'r') as file:
            file.seek(offset)
            content = file.read(size)
        return content
    offset = 0
    with open(catalog_file_1, 'r') as catalog1, open(catalog_file_2, 'r') as catalog2, \
         open(index_file_1, 'r') as index1, open(index_file_2, 'r') as index2, \
         open(merged_index_file, 'w') as merged_index_f, open(merged_terms_file, 'w') as merged_terms_f:

        line1 = catalog1.readline().strip()
        line2 = catalog2.readline().strip()
        while line1 and line2:
            parts1 = line1.split()
            parts2 = line2.split()

            term1, offset1, size1 = parts1
            term2, offset2, size2 = parts2

            if term1 < term2:
                content = read_content_from_file(index_file_1, int(offset1), int(size1))
                merged_index_f.write(content)
                merged_terms_f.write(f"{term1} {offset} {merged_index_f.tell() - int(offset)}" + '\n')
                offset = merged_index_f.tell()
                line1 = catalog1.readline().strip()
            elif term1 > term2:
                content = read_content_from_file(index_file_2, int(offset2), int(size2))
                merged_index_f.write(content)
                merged_terms_f.write(f"{term2} {offset} {merged_index_f.tell() - int(offset)}" + '\n')
                offset = merged_index_f.tell()
                line2 = catalog2.readline().strip()
            else:  
                content1 = read_content_from_file(index_file_1, int(offset1), int(size1))
                content2 = read_content_from_file(index_file_2, int(offset2), int(size2))
                merged_index_f.write(content1 + content2)
                merged_terms_f.write(f"{term1} {offset} {merged_index_f.tell() - int(offset)}" + '\n')
                offset = merged_index_f.tell()
                line1 = catalog1.readline().strip()
                line2 = catalog2.readline().strip()

        while line1:
            term1, offset1, size1 = line1.split()
            content = read_content_from_file(index_file_1, int(offset1), int(size1))
            merged_index_f.write(content)
            merged_terms_f.write(f"{term1} {offset} {merged_index_f.tell() - int(offset)}" + '\n')
            offset = merged_index_f.tell()
            line1 = catalog1.readline().strip()

        while line2:
            term2, offset2, size2 = line2.split()
            content = read_content_from_file(index_file_2, int(offset2), int(size2))
            merged_index_f.write(content)
            merged_terms_f.write(f"{term2} {offset} {merged_index_f.tell() - int(offset)}" + '\n')
            offset = merged_index_f.tell()
            line2 = catalog2.readline().strip()

folder_path_index = "./stemmed_index_files/index/"
folder_path_catalog = "./stemmed_index_files/catalog/"

final_merged_index_file = "./stemmed_index_files/final_merged_index_file.txt"
final_merged_catalog_file = "./stemmed_index_files/final_merged_catalog_file.txt"

merged_index_file = "./stemmed_index_files/merged_index_file.txt"
merged_catalog_file = "./stemmed_index_files/merged_catalog_file.txt"

open(final_merged_index_file, 'a').close()
open(final_merged_catalog_file, 'a').close()
open(merged_index_file, 'a').close()
open(merged_catalog_file, 'a').close()

def sort_files(file_path):
    return int(os.path.splitext(os.path.basename(file_path))[0].split('_')[-1])

index_files = [os.path.join(folder_path_index, file) for file in sorted(os.listdir(folder_path_index), key=sort_files)]
catalog_files = [os.path.join(folder_path_catalog, file) for file in sorted(os.listdir(folder_path_catalog), key=sort_files)]

for i in range(len(index_files)):
    print(index_files[i], catalog_files[i])
    merge_files(merged_index_file, index_files[i],  merged_catalog_file, catalog_files[i], final_merged_index_file, final_merged_catalog_file)

    with open(final_merged_index_file, 'r') as final_merged_index_f, open(merged_index_file, 'w') as merged_index_f:
        merged_index_f.write(final_merged_index_f.read())
    
    with open(final_merged_catalog_file, 'r') as final_merged_catalog_f, open(merged_catalog_file, 'w') as merged_catalog_f:
        merged_catalog_f.write(final_merged_catalog_f.read())

print("Merging completed. Merged index and terms files are saved.")

./stemmed_index_files/index/index_chunk_1.txt ./stemmed_index_files/catalog/catalog_chunk_1.txt
./stemmed_index_files/index/index_chunk_2.txt ./stemmed_index_files/catalog/catalog_chunk_2.txt
./stemmed_index_files/index/index_chunk_3.txt ./stemmed_index_files/catalog/catalog_chunk_3.txt
./stemmed_index_files/index/index_chunk_4.txt ./stemmed_index_files/catalog/catalog_chunk_4.txt
./stemmed_index_files/index/index_chunk_5.txt ./stemmed_index_files/catalog/catalog_chunk_5.txt
./stemmed_index_files/index/index_chunk_6.txt ./stemmed_index_files/catalog/catalog_chunk_6.txt
./stemmed_index_files/index/index_chunk_7.txt ./stemmed_index_files/catalog/catalog_chunk_7.txt
./stemmed_index_files/index/index_chunk_8.txt ./stemmed_index_files/catalog/catalog_chunk_8.txt
./stemmed_index_files/index/index_chunk_9.txt ./stemmed_index_files/catalog/catalog_chunk_9.txt
./stemmed_index_files/index/index_chunk_10.txt ./stemmed_index_files/catalog/catalog_chunk_10.txt
./stemmed_index_files/index/index_chun

./stemmed_index_files/index/index_chunk_85.txt ./stemmed_index_files/catalog/catalog_chunk_85.txt
Merging completed. Merged index and terms files are saved.


# Merging unstemmed docs

In [25]:
folder_path_index = "./non_stemmed_index_files/index/"
folder_path_catalog = "./non_stemmed_index_files/catalog/"

final_merged_index_file = "./non_stemmed_index_files/final_merged_index_file.txt"
final_merged_catalog_file = "./non_stemmed_index_files/final_merged_catalog_file.txt"

merged_index_file = "./non_stemmed_index_files/merged_index_file.txt"
merged_catalog_file = "./non_stemmed_index_files/merged_catalog_file.txt"

open(final_merged_index_file, 'a').close()
open(final_merged_catalog_file, 'a').close()
open(merged_index_file, 'a').close()
open(merged_catalog_file, 'a').close()

def sort_files(file_path):
    return int(os.path.splitext(os.path.basename(file_path))[0].split('_')[-1])

index_files = [os.path.join(folder_path_index, file) for file in sorted(os.listdir(folder_path_index), key=sort_files)]
catalog_files = [os.path.join(folder_path_catalog, file) for file in sorted(os.listdir(folder_path_catalog), key=sort_files)]

for i in range(len(index_files)):
    print(index_files[i], catalog_files[i])
    merge_files(merged_index_file, index_files[i],  merged_catalog_file, catalog_files[i], final_merged_index_file, final_merged_catalog_file)

    with open(final_merged_index_file, 'r') as final_merged_index_f, open(merged_index_file, 'w') as merged_index_f:
        merged_index_f.write(final_merged_index_f.read())
    
    with open(final_merged_catalog_file, 'r') as final_merged_catalog_f, open(merged_catalog_file, 'w') as merged_catalog_f:
        merged_catalog_f.write(final_merged_catalog_f.read())

print("Merging completed. Merged index and terms files are saved.")

./non_stemmed_index_files/index/index_chunk_1.txt ./non_stemmed_index_files/catalog/catalog_chunk_1.txt
./non_stemmed_index_files/index/index_chunk_2.txt ./non_stemmed_index_files/catalog/catalog_chunk_2.txt
./non_stemmed_index_files/index/index_chunk_3.txt ./non_stemmed_index_files/catalog/catalog_chunk_3.txt
./non_stemmed_index_files/index/index_chunk_4.txt ./non_stemmed_index_files/catalog/catalog_chunk_4.txt
./non_stemmed_index_files/index/index_chunk_5.txt ./non_stemmed_index_files/catalog/catalog_chunk_5.txt
./non_stemmed_index_files/index/index_chunk_6.txt ./non_stemmed_index_files/catalog/catalog_chunk_6.txt
./non_stemmed_index_files/index/index_chunk_7.txt ./non_stemmed_index_files/catalog/catalog_chunk_7.txt
./non_stemmed_index_files/index/index_chunk_8.txt ./non_stemmed_index_files/catalog/catalog_chunk_8.txt
./non_stemmed_index_files/index/index_chunk_9.txt ./non_stemmed_index_files/catalog/catalog_chunk_9.txt
./non_stemmed_index_files/index/index_chunk_10.txt ./non_stemmed

./non_stemmed_index_files/index/index_chunk_79.txt ./non_stemmed_index_files/catalog/catalog_chunk_79.txt
./non_stemmed_index_files/index/index_chunk_80.txt ./non_stemmed_index_files/catalog/catalog_chunk_80.txt
./non_stemmed_index_files/index/index_chunk_81.txt ./non_stemmed_index_files/catalog/catalog_chunk_81.txt
./non_stemmed_index_files/index/index_chunk_82.txt ./non_stemmed_index_files/catalog/catalog_chunk_82.txt
./non_stemmed_index_files/index/index_chunk_83.txt ./non_stemmed_index_files/catalog/catalog_chunk_83.txt
./non_stemmed_index_files/index/index_chunk_84.txt ./non_stemmed_index_files/catalog/catalog_chunk_84.txt
./non_stemmed_index_files/index/index_chunk_85.txt ./non_stemmed_index_files/catalog/catalog_chunk_85.txt
Merging completed. Merged index and terms files are saved.


# Compression into one file

In [26]:
import gzip

input_file_path = "./stemmed_index_files/final_merged_index_file.txt"
output_file_path = "./stemmed_index_files/final_merged_index_file.gz"

with open(input_file_path, 'rb') as input_file, gzip.open(output_file_path, 'wb') as output_file:
    output_file.write(input_file.read())

print("Compression completed successfully.")

Compression completed successfully.


In [28]:
document_lengths_stemmed = {doc_id: len(tokens) for doc_id, tokens in tokenized_dict_stemmed_map.items()}

with open("./stemmed_index_files/tokenized_dict_stemmed_length_map.txt", "w") as file:
    for doc_id, length in document_lengths_stemmed.items():
        file.write(f"{doc_id} {length}\n")

In [36]:
document_lengths_non_stemmed = {doc_id: len(tokens) for doc_id, tokens in tokenized_dict_nonstem_map.items()}

with open("./non_stemmed_index_files/tokenized_dict_non_stemmed_length_map.txt", "w") as file:
    for doc_id, length in document_lengths_non_stemmed.items():
        file.write(f"{doc_id} {length}\n")

# Compression into chunks

In [57]:
catalog_file_path = "./stemmed_index_files/final_merged_catalog_file.txt"
chunks = {}

with open(catalog_file_path, 'r') as file:
    lines = file.readlines()
    total_lines = len(lines)

    lines_per_chunk = total_lines // 3
    remainder_lines = total_lines % 3

    start_index = 0
    for i in range(3):
        chunk_lines = lines_per_chunk
        if i < remainder_lines:
            chunk_lines += 1
        end_index = min(start_index + chunk_lines, total_lines) 
        if end_index >= len(lines):  
            end_index = len(lines)-1
            
        catalog_line_parts = lines[int(end_index)].split()
        word = catalog_line_parts[0]
        
        offset = catalog_line_parts[1]
        size = catalog_line_parts[2]
        chunks[word] = (offset, size)
        
        start_index = end_index

print("Total number of lines:", total_lines)
print("Lines per chunk:", lines_per_chunk)
print("Chunks:", chunks)

Total number of lines: 217712
Lines per chunk: 72570
Chunks: {'chesebroughpond': ('35891885', '110'), 'motionless': ('111511757', '224'), 'zzzz': ('190606424', '189')}


In [58]:
input_file_path = "./stemmed_index_files/final_merged_index_file.txt"
output_dir = "./stemmed_index_files/compressed_index_files/"
os.makedirs(output_dir, exist_ok=True)

compressed_chunks = {}
file_offset = 0
for i, (chunk_key, (offset, size)) in enumerate(chunks.items(), start=1):
    with open(input_file_path, 'rb') as input_file:
        input_file.seek(file_offset)
        chunk_data = input_file.read(int(offset) + int(size))

    compressed_file_path = os.path.join(output_dir, f"chunk_{i}_compressed.gz")
    with gzip.open(compressed_file_path, 'wb') as output_file:
        output_file.write(chunk_data)

    compressed_chunks[chunk_key] = {
        'file_path': compressed_file_path,
        'file_offset': file_offset
    }
    file_offset = int(offset) + int(size)

print("Compression completed successfully.")
print("Compressed chunks:", compressed_chunks)

Compression completed successfully.
Compressed chunks: {'chesebroughpond': {'file_path': './stemmed_index_files/compressed_index_files/chunk_1_compressed.gz', 'file_offset': 0}, 'motionless': {'file_path': './stemmed_index_files/compressed_index_files/chunk_2_compressed.gz', 'file_offset': 35891995}, 'zzzz': {'file_path': './stemmed_index_files/compressed_index_files/chunk_3_compressed.gz', 'file_offset': 111511981}}


In [59]:
import json

output_file_path = "./stemmed_index_files/compressed_chunks.json"
with open(output_file_path, 'w') as output_file:
    json.dump(compressed_chunks, output_file)

print("Compressed chunks written to:", output_file_path)

Compressed chunks written to: ./stemmed_index_files/compressed_chunks.json
