# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells
* Make sure all your path constants are **relative to** ***DATA_DIR*** and **NOT hard-coded** in your code.

In [1]:
# imports
import os.path
import subprocess
import tempfile
from pathlib import Path

from whoosh import index, scoring, writing
from whoosh.analysis import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.qparser import QueryParser

In [2]:
DATA_DIR = "government"
#
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")
TOPIC_FILE = os.path.join(DATA_DIR, "gov.topics")
QRELS_FILE = os.path.join(DATA_DIR, "gov.qrels")
TREC_EVAL = os.path.join("trec_eval", "trec_eval.exe")

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [markdown cell]

MAP

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]

Mean average precision (MAP) gives more weight to relevant results that are ranked higher, and a high score indicates a good result.

## Question 2

### Q2 (a): Write your code below

In [3]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q2, your query parser in QP_Q2, and your searcher in SEARCHER_Q2
def createIndex(schema):
    """Create a new, empty index for `schema` inside a fresh temporary directory.

    Returns the index object produced by `index.create_in`.
    """
    # mkdtemp() makes a uniquely named directory; nothing here removes it afterwards
    storageDir = tempfile.mkdtemp()
    newIndex = index.create_in(storageDir, schema)
    return newIndex

In [4]:
# Baseline (Q2) schema: the stored file path identifies each document, and the
# file content is indexed with a plain RegexTokenizer (no further filters).
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)

# Build an empty index for this schema; createIndex() places it in a new temporary directory.
myIndex = createIndex(mySchema)

In [5]:
def addFilesToIndex(indexObj, fileList):
    """Read every file in `fileList` as UTF-8 text and add it to `indexObj`.

    Each file becomes one document: its path goes into `file_path` and its full
    text into `file_content`. A progress line is printed after every 1000
    documents, and "done indexing." once all files were added. The writer is
    closed even if reading or indexing fails part-way through.
    """
    # Buffered writer: no timer-based commits (period=None), buffer limit of 1000 documents
    bufferedWriter = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        for indexedCount, path in enumerate(fileList, start=1):
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
                bufferedWriter.add_document(file_path=path, file_content=text)

                # progress report every 1000 documents
                if indexedCount % 1000 == 0:
                    print("already indexed:", indexedCount)
        print("done indexing.")
    finally:
        # always release the writer, whether or not indexing succeeded
        bufferedWriter.close()

In [6]:
# Collect the path (as a string) of every regular file anywhere below DOCUMENTS_DIR
filesToIndex = [
    str(entry)
    for entry in Path(DOCUMENTS_DIR).glob("**/*")
    if entry.is_file()
]

In [7]:
# Peek at the first five collected paths to confirm the listing looks right
filesToIndex[:5]

['government\\documents\\00\\G00-00-0088569',
 'government\\documents\\00\\G00-00-0114013',
 'government\\documents\\00\\G00-00-0124389',
 'government\\documents\\00\\G00-00-0158061',
 'government\\documents\\00\\G00-00-0165832']

In [8]:
# Count the files to index (compared later against the index's document count)
print("number of files:", len(filesToIndex))

number of files: 4078


In [9]:
# Populate the baseline (Q2) index with all collected files
addFilesToIndex(myIndex, filesToIndex)

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [10]:
# Query parser that searches the field "file_content" by default, using the index's schema
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
# Searcher over the baseline index; it is reused by all Q2 cells below
mySearcher = myIndex.searcher()

In [11]:
# Print the topic file: one "<topic id> <search phrase>" entry per line
with open(TOPIC_FILE, "r") as f:
    print(f.read())

1 mining gold silver coal
2 juvenile delinquency
4 wireless communications
6 physical therapists
7 cotton industry
9 genealogy searches
10 Physical Fitness
14 Agricultural biotechnology
16 Emergency and disaster preparedness assistance
18 Shipwrecks
19 Cybercrime, internet fraud, and cyber fraud
22 Veteran's Benefits
24 Air Bag Safety
26 Nuclear power plants
28 Early Childhood Education



In [12]:
def trecEval(topicFile, qrelsFile, queryParser, searcher):
    """Run every topic through `searcher` and print trec_eval's per-query report.

    Parameters:
        topicFile: path to a text file with one topic per line, "<topic_id> <search phrase>".
        qrelsFile: path to the relevance-judgement file handed to trec_eval.
        queryParser: parser used to turn each search phrase into a query.
        searcher: searcher used to run each query (all hits are fetched, limit=None).

    The hits are written to a temporary run file, one line per hit in the form
    "<topic_id> Q0 <file name> <position> <score> test". The TREC_EVAL executable is
    then invoked with '-q' on the qrels file and the run file, and its standard output
    is printed. The temporary run file is deleted afterwards.

    Raises:
        ValueError: if a non-blank topic line has no space separating id and phrase.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation.
    # Blank lines are skipped so they cannot break the "<id> <phrase>" split below.
    with open(topicFile, "r") as tf:
        topics = [line for line in tf.read().splitlines() if line.strip()]

    # mkstemp() returns an already-open OS-level descriptor together with the path.
    # The descriptor is wrapped with os.fdopen() (rather than discarded while the path
    # is reopened) so that it is reliably closed.
    outputFd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(outputFd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = topic.split(" ", 1)
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (docnum, hit) in enumerate(topicResults):
                    score = topicResults.score(docnum)
                    outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(hit["file_path"]), docnum, score))

        # the run file is closed (and flushed) at this point, so trec_eval can read it
        evalRun = subprocess.run([TREC_EVAL, '-q', qrelsFile, tempOutputFile], stdout=subprocess.PIPE)
        print(evalRun.stdout.decode())
    finally:
        # do not leave one temporary run file behind per evaluation
        os.remove(tempOutputFile)

In [13]:
# Evaluate the baseline (Q2) system on all topics
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher) 

num_ret               	1	1
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	16
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.1667
Rprec                 	10	0.0000


In [14]:
# Sanity check: the index should not be empty after indexing
print("Index is empty?", myIndex.is_empty())

# The document count should match the number of files collected for indexing
print("Number of indexed files:", myIndex.doc_count())

Index is empty? False
Number of indexed files: 4078


In [15]:
# Reader on the baseline index, used below to look up per-term document frequencies
myReader = myIndex.reader()

In [16]:
# run a sample query for the phrase "juvenile delinquency" (topic 2)
sampleQuery = myQueryParser.parse("juvenile delinquency")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

# inspect the result:
# for each hit print the file name, its 0-based position in the ranking, and its score
for (docnum, result) in enumerate(sampleQueryResults):
    score = sampleQueryResults.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

G00-22-3396139 0 17.26213868755707
G00-76-0415824 1 10.597054574626043
G00-78-1531079 2 8.77864826829756
G00-15-1718631 3 8.076859679154353
G00-70-2787853 4 6.788751086401821
G00-74-1394517 5 3.368379609499579


In [17]:
# How many documents contain each of the terms "juvenile" and "delinquency"
#   in the field "file_content"?
# NOTE(review): the two-word string is reported as 0 in the output below, presumably
#   because it is looked up as one single term and the tokenizer never emits a term
#   containing a space (TODO confirm).
print("# docs with 'juvenile'", myReader.doc_frequency("file_content", "juvenile"))
print("# docs with 'delinquency'", myReader.doc_frequency("file_content", "delinquency"))
print("# docs with 'juvenile delinquency'", myReader.doc_frequency("file_content", "juvenile delinquency"))

# docs with 'juvenile' 19
# docs with 'delinquency' 6
# docs with 'juvenile delinquency' 0


In [18]:
INDEX_Q2 = myIndex # baseline index for Q2
QP_Q2 = myQueryParser # query parser for Q2
SEARCHER_Q2 = mySearcher # searcher for Q2

### Q2 (b): Provide answer to Q2 (b) here [markdown cell]

It did not do very well: the MAP score is only 0.1971 out of 1, so there is still a lot that can be improved.


### Q2 (c): Provide answer to Q2(c) here [markdown cell]

Topics 1, 2, 6, 7, 9, 16 and 28 did badly: they scored zero, which means that all retrieved documents are irrelevant.  
Topics 18 and 24 did very well because the most relevant documents are ranked at the top.


## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]

For example, in topic 2 (juvenile delinquency), the top-ranked document G00-22-3396139 scored 17.2 in the baseline Whoosh system, but this document is actually irrelevant. This is a case of a false positive (FP). Although the words “juvenile” and “delinquency” appear in the document, it just lists some government agencies, among them “the Office of Juvenile Justice and Delinquency Prevention”, which is mentioned twice. In addition, the document is very short, which means a high term frequency for “juvenile delinquency”. However, the document as a whole is just about the OFFICE OF JUSTICE PROGRAMS. Similarly, there are many documents that include a topic's keywords but are irrelevant.  

On the other hand, document G00-37-1427392 is relevant but was not retrieved. This is a case of a false negative (FN). The reason this document was missed is that none of the keywords were matched: with the baseline RegexTokenizer, the searcher was not able to match the capitalized “juvenile” and “delinquency”. With a more advanced analyzer, false negatives should be reduced and the overall result should improve.


### Q3 (b): Write your code below

In [19]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q3, your query parser in QP_Q3, and your searcher in SEARCHER_Q3
# Improved (Q3) schema. The content analyzer chains, in this order:
#   RegexTokenizer -> LowercaseFilter -> IntraWordFilter -> StopFilter -> StemFilter
# NOTE(review): IntraWordFilter runs after LowercaseFilter here, so it presumably can no
#   longer see upper/lower-case transitions inside a word; the Whoosh examples place it
#   before LowercaseFilter. Confirm the order is intended before changing it, because the
#   reported results were produced with this order.
mynewSchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(
        analyzer=RegexTokenizer()
        | LowercaseFilter()
        | IntraWordFilter()
        | StopFilter()
        | StemFilter()
    ),
)

# Build an empty index for the new schema; createIndex() places it in a new temporary directory.
mynewIndex = createIndex(mynewSchema)

In [20]:
# Populate the Q3 index, then create its query parser and searcher
addFilesToIndex(mynewIndex, filesToIndex)
# The parser takes the Q3 schema, the same one the documents were indexed with
mynewQueryParser = QueryParser("file_content", schema=mynewIndex.schema)
mynewSearcher = mynewIndex.searcher()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [21]:
# Evaluate the improved (Q3) system on all topics
trecEval(TOPIC_FILE, QRELS_FILE, mynewQueryParser, mynewSearcher) 

num_ret               	1	3
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	42
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.2500
Rprec                 	10	0.0000


In [22]:
# run the same sample query, "juvenile delinquency" (topic 2), against the Q3 index
samplenewQuery = mynewQueryParser.parse("juvenile delinquency")
samplenewQueryResults = mynewSearcher.search(samplenewQuery, limit=None)

# inspect the result:
# for each hit print the file name, its 0-based position in the ranking, and its score
for (docnum, result) in enumerate(samplenewQueryResults):
    score = samplenewQueryResults.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

G00-37-1427392 0 19.449213860332442
G00-22-3396139 1 18.434193291655042
G00-78-1531079 2 16.335289209741667
G00-92-0578141 3 15.493780459238984
G00-67-0637954 4 14.85795689433918
G00-91-1567424 5 14.721796656533094
G00-94-1117794 6 13.437583261488802
G00-76-0415824 7 12.54308590417346
G00-15-1718631 8 11.522913461281409
G00-90-3871013 9 11.372035203185476
G00-70-2787853 10 10.268645726545305
G00-27-2159399 11 8.788646752754644
G00-74-1394517 12 4.022868439339689


In [23]:
INDEX_Q3 = mynewIndex # improved index for Q3
QP_Q3 = mynewQueryParser # query parser for Q3
SEARCHER_Q3 = mynewSearcher # searcher for Q3

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

As mentioned, I used a more complex analyzer that lowercases everything, strips stop words, breaks up compound words and stems words. As a result, false negatives were reduced, and both true positives and false positives increased. However, the query is now able to retrieve relevant results for topics 2, 9 and 28.

### Q3 (d): Provide answer to Q3 (d) here [markdown cell]

Yes

### Q3 (e): Provide answer to Q3 (e) here [markdown cell]

Yes

### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

It still means the change is good, since the improved queries improved significantly: MAP went from 0.19 to 0.33. The queries that got worse only got slightly worse, and their relevant results are still retrieved; they only dropped by one or two places in the ranking.

## Question 4 (Graduate Students)

In [24]:
GRAD_STUDENT = True # True for grad students; the Q4 validation asserts only check types when this is True

### Q4 (a): Provide answer to Q4 (a) here [markdown cell]

Since this alternative method is just a small variation of the Q3 system, the results should be quite similar. By fine-tuning the free parameters, I am aiming to improve the rank of some of the relevant documents. By tuning B and K1, I can vary the weight given to document length and term frequency.

### Q4 (b): Write your code below

In [25]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q4, your query parser in QP_Q4, and your searcher in SEARCHER_Q4
# Q4 schema: same fields and analyzer chain as the Q3 schema; only the searcher's
# scoring function differs (it is configured where the Q4 searcher is created).
mynextSchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer =  RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()))

# Build a separate empty index for Q4; createIndex() places it in a new temporary directory.
mynextIndex = createIndex(mynextSchema)

In [26]:
# Populate the Q4 index with all collected files
addFilesToIndex(mynextIndex, filesToIndex)

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [27]:
mynextQueryParser = QueryParser("file_content", schema=mynextIndex.schema)
# Score with BM25F using the tuned free parameters B=0.3 and K1=0.01
# (changed from B=0.75 and K1=1.2, as described in the Q4 (c) answer).
mynextSearcher = mynextIndex.searcher(weighting=scoring.BM25F(B=0.3, K1=0.01))

In [28]:
# Evaluate the tuned-BM25F (Q4) system on all topics
trecEval(TOPIC_FILE, QRELS_FILE, mynextQueryParser, mynextSearcher) 

num_ret               	1	3
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	42
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.3333
Rprec                 	10	0.0000


In [29]:
INDEX_Q4 = mynextIndex # index for Q4
QP_Q4 =  mynextQueryParser # query parser for Q4
SEARCHER_Q4 = mynextSearcher # searcher for Q4 (BM25F with tuned B and K1)

### Q4 (c): Provide answer to Q4 (c) here [markdown cell]

I modified BM25's free parameters K1 and B: I changed B from 0.75 to 0.3 and K1 from 1.2 to 0.01. The relevant documents for topics 4, 9, 10, 22 and 28 are now ranked higher. TN and FP remain the same as in Q3.

### Q4 (d): Provide answer to Q4 (d) here [markdown cell]

Yes

### Q4 (e): Provide answer to Q4 (e) here [markdown cell]

Yes

### Q4 (f): Provide answer to Q4 (f) here [markdown cell]

For this document set and these topics, the new alternative method is better. However, the tuning might be biased toward them and might not work better for a different document set.

## Validation

In [30]:
# Run the following cells to make sure your code returns the correct value types

In [31]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [32]:
# Check that the Q2 deliverables have the expected Whoosh types
assert isinstance(INDEX_Q2, FileIndex), "Index Type"
assert isinstance(QP_Q2, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q2, Searcher), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [33]:
# Check that the Q3 deliverables have the expected Whoosh types
assert isinstance(INDEX_Q3, FileIndex), "Index Type"
assert isinstance(QP_Q3, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q3, Searcher), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation (Graduate Students)

In [34]:
# Check the Q4 deliverables' types; each check passes trivially when GRAD_STUDENT is False
assert not GRAD_STUDENT or isinstance(INDEX_Q4, FileIndex), "Index Type"
assert not GRAD_STUDENT or isinstance(QP_Q4, QueryParser), "Query Parser Type"
assert not GRAD_STUDENT or isinstance(SEARCHER_Q4, Searcher), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
