In [None]:
# Mount Google Drive at /content/drive; the later cells read from and write to
# /content/drive/MyDrive/Touche.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the project folder on Drive to check which input files are available.
!ls /content/drive/MyDrive/Touche

 parse_topics.ipynb    sample_collection_jsonl
 passages.jsonl       't5 base'
 relevant_bm25.csv     topics-task2.xml
 relevant_bm25.jsonl   Touche-21-Task-2
 s2v_old	       touche-task2-passages-version-002.jsonl


In [None]:
# installing linux related stuff for pyserini
# removing error "ImportError: No module named '_swigfaiss"
# Reference Link: https://github.com/facebookresearch/faiss/issues/821
!sudo apt-get install libomp-dev

In [None]:
# installing important packages for analyzing the code.
!pip install jsonlines
!pip install pyserini
!pip install faiss

In [None]:
# import statements
import pandas as pd
import jsonlines
import pyserini
from tqdm import tqdm
from xml.dom import minidom
import re
import nltk
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Query-expansion corpus: read every JSON line into memory as a list.
# The reader is used as a context manager so the underlying file handle is
# closed as soon as the list has been built.
with jsonlines.open('/content/drive/MyDrive/Touche/touche-task2-passages-expanded-with-queries.jsonl') as reader:
    qcorpus_list = list(reader)

In [None]:
# Display the first record of the expanded corpus as a quick sanity check.
qcorpus_list[0]

"Do Asian-Americans Face Bias in Admissions at Elite Colleges? - NYTimes.com Home Page Today's Paper Video Most Popular Times Topics Search All NYTimes.com Education World U.S. Politics Education Bay Area Chicago Texas N.Y. / Region Business Technology Science Health Sports Opinion Arts Style Travel Jobs Real Estate Autos February 8, 2012, 1:43 pm Do Asian-Americans Face Bias in Admissions at Elite Colleges? By DANIEL E. SLOTNIK 6:08 p.m. | Updated A statement from Princeton was added to the story. The Department of Education’s Office for Civil Rights is examining complaints thatHarvard and Princeton have discriminated against Asian-American undergraduate applicants, highlighting a concern of many Asian-American parents. The inquiry was first reported by Bloomberg News.<query> do asian americans face bias in admissions</query>"

In [None]:
# Preprocessing helpers.
# Characters removed from every token: ASCII punctuation plus the typographic
# quotes and the em dash.
special_characters = string.punctuation + '“”’—'
# Module-level instances used by stemming() and lemmatizer() respectively.
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()


def spl_chars_removal(lst):
  """Remove every character found in ``special_characters`` from each string.

  Returns a new list with one cleaned string per input element, in the same
  order. An element made up only of special characters becomes ``""``; it is
  not dropped from the list.
  """
  return [
    "".join(ch for ch in token if ch not in special_characters)
    for token in lst
  ]

def stopwords_removal_gensim_custom(lst):
  """Return the tokens of ``lst`` that are not in ``STOPWORDS``, order kept."""
  kept_tokens = []
  for token in lst:
    if token in STOPWORDS:
      continue
    kept_tokens.append(token)
  return kept_tokens

def stemming(lst):
  """Return a list with the ``porter_stemmer`` stem of every word in ``lst``."""
  return list(map(porter_stemmer.stem, lst))

def lemmatizer(lst):
  """Return a list with the ``wordnet_lemmatizer`` lemma of every word in ``lst``."""
  lemmas = []
  for token in lst:
    lemmas.append(wordnet_lemmatizer.lemmatize(token))
  return lemmas

In [None]:
!mkdir sample_collection_jsonl
!mkdir indexes
!mkdir indexes/sample_collection_jsonl

mkdir: cannot create directory ‘sample_collection_jsonl’: File exists
mkdir: cannot create directory ‘indexes’: File exists
mkdir: cannot create directory ‘indexes/sample_collection_jsonl’: File exists


In [None]:
# Creating an input file for indexing without preprocessing.
# NOTE: the preprocessing cell writes to the same path, so when both cells are
# run only the output of the one executed last remains on disk.
# The writer is used as a context manager so every buffered line is flushed
# and the file is closed before anything else opens it.
with jsonlines.open('sample_collection_jsonl/documents.jsonl', 'w') as output:
    for vals in qcorpus_list:
        output.write({
            'id': vals['id'],
            'contents': vals['contents']
        })

In [None]:
# Creating an input file for indexing with preprocessing.
# Each passage has its <query>/</query> markers removed, is lower-cased, split
# on single spaces, stripped of special characters and filtered against the
# gensim stop-word list before being written back as one JSON line.
# The writer is used as a context manager so the file is completely flushed
# and closed before the indexer reads it.
with jsonlines.open('sample_collection_jsonl/documents.jsonl', 'w') as output:
  for k in tqdm(qcorpus_list):
    # "</?query>" matches exactly the opening and closing marker; the former
    # character class "[</]query" also matched "/query" inside URLs.
    text = re.sub("</?query>", "", k['contents']).lower()
    tokens = text.split(" ")
    tokens = spl_chars_removal(tokens)
    tokens = stopwords_removal_gensim_custom(tokens)
    output.write({'id': k['id'], 'contents': ' '.join(tokens)})

100%|██████████| 868655/868655 [03:16<00:00, 4409.73it/s]


In [None]:
# Show the first 10 lines of the generated file to check its format.
!head -10 sample_collection_jsonl/documents.jsonl

{"id": "clueweb12-0000tw-00-14115___1", "contents": "asianamericans face bias admissions elite colleges  nytimescom home page todays paper video popular times topics search nytimescom education world politics education bay area chicago texas ny  region business technology science health sports opinion arts style travel jobs real estate autos february 8 2012 143 pm asianamericans face bias admissions elite colleges daniel e slotnik 608 pm  updated statement princeton added story department educations office civil rights examining complaints thatharvard princeton discriminated asianamerican undergraduate applicants highlighting concern asianamerican parents inquiry reported bloomberg news asian americans face bias admissions"}
{"id": "clueweb12-0000tw-00-14115___10", "contents": "insisting upholding affirmative action college admissions ensure america continues extremely race sensitive society based race asianamericans heart competition makes stronger cares ivy league mission educate bes

In [None]:
# Index builder: run Pyserini's Lucene indexer on the JSON files found in
# sample_collection_jsonl and write the index to indexes/sample_collection_jsonl.
# The flags on the last line additionally store positions, document vectors and
# the raw documents in the index.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input sample_collection_jsonl \
  --index indexes/sample_collection_jsonl \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw

In [None]:
# List the index directory to confirm that the index files were written.
!ls /content/indexes/sample_collection_jsonl

_0.fdm	_0_Lucene80_0.dvd  _0_Lucene84_0.tim  _0.nvm  _0.tvx
_0.fdt	_0_Lucene80_0.dvm  _0_Lucene84_0.tip  _0.si   segments_1
_0.fdx	_0_Lucene84_0.doc  _0_Lucene84_0.tmd  _0.tvd  write.lock
_0.fnm	_0_Lucene84_0.pos  _0.nvd	      _0.tvm


In [None]:
# Saving the index files to the drive: the whole index directory is copied
# recursively into the Touche folder.
!cp -r /content/indexes/sample_collection_jsonl /content/drive/MyDrive/Touche

In [None]:
def parse_xml(path):
  """Parse a topics XML file into a DataFrame.

  The document is expected to contain a <topics> element whose <topic>
  children each carry <number>, <title> and <objects> elements.

  Args:
    path: File name or open file object accepted by ``minidom.parse``.

  Returns:
    A DataFrame with the columns "Number", "Title" and "Objects", one row per
    <topic>. Values are the text of the corresponding element exactly as it
    appears in the file (not stripped or converted). A field whose element is
    absent or empty is stored as a missing value instead of raising.

  Raises:
    IndexError: if the document has no <topics> element.
  """
  def _first_text(topic, tag):
    # Text of the first <tag> element below `topic`, or None when it is missing/empty.
    nodes = topic.getElementsByTagName(tag)
    if not nodes or nodes[0].firstChild is None:
      return None
    return nodes[0].firstChild.nodeValue

  xmldoc = minidom.parse(path)
  topics_root = xmldoc.getElementsByTagName('topics')[0]
  rows = [
    (_first_text(topic, 'number'), _first_text(topic, 'title'), _first_text(topic, 'objects'))
    for topic in topics_root.getElementsByTagName('topic')
  ]
  return pd.DataFrame(rows, columns=["Number","Title","Objects"])

In [None]:
# Parse the task topics from Drive and preview the first rows.
topics=parse_xml("/content/drive/MyDrive/Touche/topics-task2.xml")
topics.head()

Unnamed: 0,Number,Title,Objects
0,2,"Which is better, a laptop or a desktop?","laptop, desktop"
1,3,"Which is better, Canon or Nikon?","Canon, Nikon"
2,8,What are the advantages and disadvantages of P...,"PHP, Python"
3,9,Why is Linux better than Windows?,"Linux, Windows"
4,12,Train or plane? Which is the better choice?,"Train, plane"


In [None]:
# Creating the records for the initial retrieval: every topic title is run as
# a query against the Lucene index and each hit becomes one dict in h1.
import json

from pyserini.search import SimpleSearcher  # not used in this cell; import kept as in the original
from pyserini.search.lucene import LuceneSearcher
searcher = LuceneSearcher('/content/indexes/sample_collection_jsonl')
h1=[]
# zip() has no length, so the number of topics is passed to tqdm explicitly
# to get a real progress bar.
for l,j in tqdm(zip(topics["Title"],topics["Number"]), total=len(topics)):
  # Value of k determines the maximum number of documents returned per topic.
  hits = searcher.search(l, k=1000)
  for i in range(len(hits)):
    hit = hits[i]
    # The raw field stored in the index is the JSON document itself, so it is
    # decoded with the json module. Splitting the string by hand and deleting
    # every '"' and '}' altered the text and left a leading space.
    contents = json.loads(hit.raw)['contents']
    h1.append({"title_id":j,"title":l,'doc_id': hit.docid,'score': hit.score,"content": contents})

  print('Number of document matches from the index: '+str(len(hits)))

In [None]:
# Turn the collected hit records into a DataFrame and preview the first rows.
df=pd.DataFrame(h1)
df.head()


In [None]:
# Save the retrieval results to Drive. With the pandas defaults used here the
# DataFrame index is written as well, as an unnamed first column.
df.to_csv("/content/drive/MyDrive/Touche/relevant_bm25.csv")