In [1]:
import os
import re
import sys
from collections import defaultdict

In [2]:
def sanitize(text):
    """Return ``text`` reduced to lowercase ASCII letters and whitespace.

    Each character that is neither an ASCII letter nor whitespace is replaced
    by a single space; the result is then lowercased.
    """
    return re.sub(r'[^a-zA-Z\s]', ' ', text).lower()

In [3]:
def generate_bigram_mapping(folder, target_set):
    """Count per-document occurrences of the bigrams listed in ``target_set``.

    Every regular file directly inside ``folder`` is read as UTF-8 text. Each
    line is split at its first tab into a document key and the remaining text;
    lines without a tab are skipped. The text is passed through ``sanitize``,
    split on whitespace, and each pair of adjacent words on the line is joined
    with a single space and counted when it appears in ``target_set``.

    Returns:
        A nested defaultdict mapping bigram -> document key -> count.
    """
    counts_by_bigram = defaultdict(lambda: defaultdict(int))
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                doc_key, tab, body = raw_line.partition('\t')
                if not tab:
                    # No tab on this line, so there is no key/text pair to index.
                    continue
                tokens = sanitize(body).split()
                # str.split() never yields empty strings, so every adjacent
                # pair is a valid bigram candidate.
                for left, right in zip(tokens, tokens[1:]):
                    pair = f"{left} {right}"
                    if pair in target_set:
                        counts_by_bigram[pair][doc_key] += 1
    return counts_by_bigram

In [4]:
def compile_results(mapping):
    """Render a bigram mapping as sorted, tab-separated index lines.

    ``mapping`` maps each phrase to a dict of document key -> count. The
    result holds one string per phrase, in sorted phrase order, of the form
    ``"<phrase>\\t<doc>:<count> <doc>:<count> ..."`` with the document entries
    sorted as well.
    """
    def render(phrase, per_doc):
        postings = ' '.join(f"{doc}:{cnt}" for doc, cnt in sorted(per_doc.items()))
        return f"{phrase}\t{postings}"

    return [render(phrase, per_doc) for phrase, per_doc in sorted(mapping.items())]

In [5]:
def main():
    """Build the selected-bigram index from ``devdata`` and write it to disk.

    Counts a fixed set of target bigrams in the files under ``devdata`` via
    ``generate_bigram_mapping``, formats them with ``compile_results``, and
    writes the lines to ``selected_bigram_index.txt`` (UTF-8, joined with
    newlines, no trailing newline). Finally prints the number of entries.

    Exits with status 1 when ``devdata`` is not an existing directory.
    """
    # isdir rather than exists: a regular file named 'devdata' would pass an
    # existence check and then fail later inside os.listdir().
    if not os.path.isdir('devdata'):
        print("Missing 'devdata' directory.")
        sys.exit(1)
    bigram_targets = {"computer science", "information retrieval", "power politics", "los angeles", "bruce willis"}
    bi_mapping = generate_bigram_mapping('devdata', bigram_targets)
    result_lines = compile_results(bi_mapping)
    with open('selected_bigram_index.txt', 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(result_lines))
    print("Bigram index created with", len(result_lines), "entries.")

In [6]:
# Entry point: build the index only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()

Bigram index created with 5 entries.
