In [11]:
import os
import re
import sys
from collections import defaultdict

In [12]:
def sanitize(text):
    """Normalize raw text for tokenization.

    Every character that is neither an ASCII letter (a-z, A-Z) nor
    whitespace is replaced by a single space, and the result is lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string (same length as the input before
        lowercasing, since each rejected character becomes one space).
    """
    # Substitute first, lowercase second: the character class is defined on
    # the original casing of the input.
    letters_and_whitespace = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return letters_and_whitespace.lower()

In [13]:
def generate_unigram_mapping(folder):
    """Build a term -> {doc_id -> count} index from the files in a folder.

    Each regular file directly inside ``folder`` is read as UTF-8 text, one
    record per line in the form ``doc_id<TAB>content``. Lines without a tab
    are skipped. The content is passed through ``sanitize`` and split on
    whitespace; every resulting token increments the count for its doc_id.

    Args:
        folder: Path of the directory whose files are indexed. Subdirectories
            are not descended into.

    Returns:
        A nested ``defaultdict``: outer keys are tokens, inner keys are
        doc_ids, inner values are occurrence counts.
    """
    index = defaultdict(lambda: defaultdict(int))
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue  # only plain files are indexed
        with open(path, 'r', encoding='utf-8') as source:
            for record in source:
                # Split on the first tab only; the content may contain more.
                doc_id, tab, body = record.partition('\t')
                if not tab:
                    continue  # malformed record: no doc_id/content separator
                # split() with no arguments never yields empty or
                # whitespace-padded tokens, so no further filtering is needed.
                for word in sanitize(body).split():
                    index[word][doc_id] += 1
    return index

In [14]:
def compile_results(mapping):
    """Render an index mapping as a list of text lines.

    Args:
        mapping: A dict-like of ``term -> {doc_id -> count}``.

    Returns:
        One string per term, in sorted term order, formatted as
        ``term<TAB>doc:count doc:count ...`` with the doc entries sorted by
        doc_id and separated by single spaces.
    """
    def render_postings(postings):
        # Sorted so the output is stable regardless of insertion order.
        ordered = sorted(postings.items())
        return ' '.join(f"{doc}:{cnt}" for doc, cnt in ordered)

    return [
        f"{term}\t{render_postings(postings)}"
        for term, postings in sorted(mapping.items())
    ]

In [15]:
def main(input_dir='fulldata', output_path='unigram_index.txt'):
    """Build the unigram index for a folder and write it to a text file.

    Args:
        input_dir: Directory containing the input files. Defaults to
            ``'fulldata'``.
        output_path: File the index lines are written to (UTF-8, lines
            joined with ``'\\n'``, no trailing newline). Defaults to
            ``'unigram_index.txt'``.

    Raises:
        SystemExit: With status 1 when ``input_dir`` is not an existing
            directory.
    """
    # isdir rather than exists: a regular file at this path would pass an
    # existence check and then make the directory listing fail.
    if not os.path.isdir(input_dir):
        print(f"Missing '{input_dir}' directory.")
        sys.exit(1)
    uni_mapping = generate_unigram_mapping(input_dir)
    lines = compile_results(uni_mapping)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(lines))
    print("Unigram index created with", len(lines), "entries.")

In [16]:
# Entry point: run the indexer only when this code is executed as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()

Unigram index created with 1076757 entries.
