In [3]:
import tarfile
import re

folder_path = ''
# Unpack the Cranfield archive into <folder_path>cranfield_dataset.
# Mode "r" lets tarfile detect the compression by itself, so the archive opens
# whether or not it is really gzip-compressed.  The earlier "r:" mode accepts
# only uncompressed tar data and raises ReadError on a genuine .tar.gz.
with tarfile.open(folder_path + "cran.tar.gz", "r") as tar:
    extract_dir = folder_path + "cranfield_dataset"
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: the "data" filter rejects members
        # that would be written outside extract_dir (absolute paths, "..",
        # escaping links) and avoids the warning that newer Python versions
        # emit for an unfiltered extractall().
        tar.extractall(extract_dir, filter="data")
    else:
        # Older interpreter without extraction filters: unfiltered extraction
        # is the only option, so only use archives from a trusted source.
        tar.extractall(extract_dir)

# Locations of the three dataset files inside the extraction directory
docs_path = f"{folder_path}cranfield_dataset/cran.all.1400"
queries_path = f"{folder_path}cranfield_dataset/cran.qry"
qrels_path = f"{folder_path}cranfield_dataset/cranqrel"

# Step 1: Load documents from 'cran.all.1400'
# Builds `documents`, mapping the document id (a string) to the document text.
documents = {}
with open(docs_path, 'r') as file:
    content = file.read()

# Every record starts with a line of the form ".I <id>".  The pattern is
# anchored to the start of a line so that a ".I " occurring in the middle of
# running text cannot cut a document in two.  [1:] drops whatever precedes
# the first record marker.
docs = re.split(r"(?m)^\.I ", content)[1:]
for doc in docs:
    lines = doc.splitlines()
    if not lines:
        # A marker with nothing after it carries no id; skip it instead of
        # failing on lines[0].
        continue
    doc_id = lines[0].strip()
    # Drop the id line and the line right after it (presumably the ".T"
    # marker); everything else in the record, including any later marker
    # lines, is kept verbatim as the document text.
    doc_text = "\n".join(lines[2:])
    documents[doc_id] = doc_text

# Step 2: Load queries from 'cran.qry'
# Builds `queries`, mapping a sequential number (1, 2, 3, ...) to the query text.
queries = {}
with open(queries_path, 'r') as file:
    content = file.read()

# Records start with a ".I <id>" line.  Anchoring the pattern to the start of
# a line keeps a ".I " inside the query text from being treated as a record
# boundary.  [1:] drops whatever precedes the first record marker.
qry_sections = re.split(r"(?m)^\.I ", content)[1:]
for idx, qry in enumerate(qry_sections):
    lines = qry.splitlines()
    # Skip the id line and the line after it; the remainder is the query text.
    query_text = "\n".join(lines[2:])
    # The id stored in the file is deliberately ignored: queries are numbered
    # by their position so the keys run continuously from 1 to len(queries),
    # which is the key range the relevance judgments are stored under.
    queries[idx + 1] = query_text

# Step 3: Load relevance judgments from 'cranqrel'
# One (initially empty) set of relevant document ids per query key 1..len(queries).
relevance_judgments = {
    query_number: set() for query_number in range(1, len(queries) + 1)
}

with open(qrels_path, 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            continue  # blank line: nothing to parse
        query_id, doc_id, relevance = map(int, fields)
        # Keep only judgments with relevance >= 2, and only for query ids that
        # were actually loaded; an unknown id is skipped rather than raising
        # KeyError.
        # NOTE(review): this threshold treats larger codes as more relevant,
        # which is what qrels_defs() in this file describes -- confirm that
        # the cranqrel file really uses that orientation.
        if query_id in relevance_judgments and relevance >= 2:
            # Stored as a string so it compares equal to the keys of `documents`.
            relevance_judgments[query_id].add(str(doc_id))

# Step 4: Define qrels_defs() and metadata() equivalents
def qrels_defs():
    """Print the legend of relevance grades 0-3, one entry per line."""
    legend = (
        "Qrels Definitions:",
        "0: Not Relevant",
        "1: Marginally Relevant",
        "2: Relevant",
        "3: Highly Relevant",
    )
    for entry in legend:
        print(entry)

def metadata(documents, queries, relevance_judgments):
    """Print summary statistics for the loaded collection.

    Args:
        documents: Sized collection of documents; only its length is used.
        queries: Sized collection of queries; only its length is used.
        relevance_judgments: Mapping whose values are sized collections of
            judged document ids (one collection per query).

    The average is reported as 0.00 when there are no queries, instead of
    failing with ZeroDivisionError.
    """
    query_count = len(queries)
    print("Dataset Metadata:")
    print(f"Total number of documents: {len(documents)}")
    print(f"Total number of queries: {query_count}")
    total_rels = sum(len(docs) for docs in relevance_judgments.values())
    print(f"Total number of relevance judgments: {total_rels}")
    # Guard the division: an empty query set has no meaningful average.
    average = total_rels / query_count if query_count else 0.0
    print(f"Average judgments per query: {average:.2f}")

# Show the relevance-grade legend first, then the collection statistics
qrels_defs()
metadata(
    documents=documents,
    queries=queries,
    relevance_judgments=relevance_judgments,
)

Qrels Definitions:
0: Not Relevant
1: Marginally Relevant
2: Relevant
3: Highly Relevant
Dataset Metadata:
Total number of documents: 1400
Total number of queries: 225
Total number of relevance judgments: 1484
Average judgments per query: 6.60


  tar.extractall(folder_path+"cranfield_dataset")
