In [1]:
# Imports
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import pickle
import string
from collections import defaultdict

In [2]:
# Common setup for data pre-processing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

### Documents

In [3]:
# Load original documents
with open("../dataset/original/CISI.ALL") as f:
    articles = f.readlines()

In [4]:
def extract_docs(lines):
    title_mode = False
    body_mode = False
    edge_mode = False
    
    title = ""
    body = ""
    
    idx = None
    
    edge_str = ""
    edges = []
    docs = []
    
    for line in lines:
        for c in line:
            if line.startswith("."):
                if line.startswith(".I"):
                    for e in edge_str.split("\n"):
                        if "\t" in e:
                            edges.append((idx, int(e.split("\t")[0])))
                    idx = int(line.split()[1])
                    edge_str = ""
                    edge_mode = False
                    
                if line.startswith(".T"):
                    title_mode = True
                    body_mode = False
                    
                elif line.startswith(".W"):
                    title_mode = False
                    body_mode = True
                    
                elif line.startswith(".X"):
                    docs.append({"id": idx, "title": title, "body": body})
                    title = ""
                    body = ""
                    title_mode = False
                    body_mode = False
                    edge_mode = True
                    
                else:
                    title_mode = False
                    body_mode = False 
                    edge_mode = False
    
            if title_mode:
                title += c
            elif body_mode:
                body += c
            elif edge_mode:
                edge_str += c
                
    for e in edge_str.split("\n"):
        if "\t" in e:
            edges.append((idx, int(e.split("\t")[0])))
            
    return [x for x in docs if x["title"]], sorted(list(set(edges)))

In [5]:
# Extract id, title, and body for each document
d, _ = extract_docs(articles)
docs = {x["id"]: {"text": x["title"] + "\n" + x["body"]} for x in d}

In [6]:
# Sanity check
docs[1], len(docs)

{'text': ".T\n18 Editions of the Dewey Decimal Classifications\n\n.W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"}

In [8]:
# Data pre-processing (Used only for LSI)
def clean_text(sentence):
    # Remove punctuations
    lookup_table = sentence.maketrans("", "", string.punctuation)
    clean_text = sentence.translate(lookup_table)

    # Convert to lowercase and tokenize into words
    word_list = word_tokenize(clean_text.lower())

    # Remove stop-words and words with length less than or equal to 2
    word_list = [w for w in word_list if not w in stop_words and len(w) > 2]

    # Reduce each word to its lemma
    word_list = [lemmatizer.lemmatize(word) for word in word_list]

    # Convert back to sentence
    clean_sentence = " ".join(word_list)
    
    return clean_sentence

In [9]:
# Pre-process documents
for i, doc in docs.items():
    docs[i]["clean_text"] = clean_text(doc["text"])

In [10]:
# Sanity check
docs[1]

{'text': ".T\n18 Editions of the Dewey Decimal Classifications\n\n.W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n",
 'clean_text': 'edition dewey decimal classification present study history dewey decimal classification first edition ddc published 1876 eighteenth edition 1971 future edition continue appear needed spite ddc long healthy life however full story never told biography dewey briefly describe system first attempt provide detailed history work spurred growth librarianship country 

In [11]:
# Save documents to file
with open("../dataset/documents.pkl", "wb") as f:
    pickle.dump(docs, f)

### Queries

In [12]:
queries = {}

In [13]:
# Load original queries
idx = None
with open("../dataset/original/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = {"text": ""}
            elif not line.startswith("."):
                queries[idx]["text"] += " " + line

In [14]:
# Sanity check
queries[1], len(queries)

{'text': ' What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?'}

In [16]:
# Pre-process queries
for i, query in queries.items():
    queries[i]["clean_text"] = clean_text(query["text"])

In [17]:
# Sanity check
queries[1]

{'text': ' What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?',
 'clean_text': 'problem concern making descriptive title difficulty involved automatically retrieving article approximate title usual relevance content article title'}

In [18]:
# Save queries to file
with open("../dataset/queries.pkl", "wb") as f:
    pickle.dump(queries, f)

### Relevance Set (Ground Truth)

In [2]:
# Load original relevance set
with open("../dataset/original/CISI.REL") as f:
    lines = f.read().split("\n")[:-1]

In [6]:
with open("../dataset/original/CISI.REL") as f:
    lines = f.read().split('\n')[:-1]
    ground_truth = [[]]*len(lines)
    for line in lines:
        clean_line = line.strip().replace('\t',' ').split()
        query, doc = [int(num.replace(' ','')) for num in clean_line[:2]]
        ground_truth[query].append(doc)

In [4]:
# Extract the ground truth documents for each query
ground_truth = defaultdict(list)
for line in lines:
    clean_line = line.strip().replace('\t',' ').split()
    query, doc = [int(num.replace(' ','')) for num in clean_line[:2]]
    ground_truth[query].append(doc)

In [14]:
# Sanity check
final_dict = dict(enumerate(ground_truth))

In [15]:
len(final_dict[1])

3114

In [16]:
# Save relevance set to file
with open("../dataset/rel_set.pkl", "wb") as f:
    pickle.dump(final_dict, f)