# Build `pyserini` Index for SemEval 2014 Task 4

- We first convert the original `xml` dataset files into the document collection of `jsonl` format that `pyserini` understands.

- We then build `test_queries_{laptop, restaurant}.txt` and `test_qrels_{laptop, restaurant}.txt` out of the original test dataset, by treating each unique `(aspect, sentiment)` label as a query.

In [1]:
import os

# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    !pip install pyserini==0.9.4.0 jsonlines==1.2.0

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/BM25'

    # List the directory contents
    !ls

Collecting pyserini==0.9.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/a9/ae/cf4a44fbeb6e1f947dfde40fa0142de33ae263333bda26edca914be6ba1b/pyserini-0.9.4.0-py3-none-any.whl (60.4MB)
[K     |████████████████████████████████| 60.4MB 69kB/s 
Collecting pyjnius
[?25l  Downloading https://files.pythonhosted.org/packages/d8/50/098cb5fb76fb7c7d99d403226a2a63dcbfb5c129b71b7d0f5200b05de1f0/pyjnius-1.3.0-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 44.7MB/s 
Installing collected packages: pyjnius, pyserini
Successfully installed pyjnius-1.3.0 pyserini-0.9.4.0
Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [3]:
import os
import json 
import random
import xml.etree.ElementTree as ET

from tqdm import tqdm
import jsonlines 
from pyserini.search import SimpleSearcher

In [4]:
semeval_path = os.path.join('..', 'Data', 'SemEval2014_Task4')
collection_path = 'collection'
index_path = 'index'

In [None]:
semeval_files = os.listdir(semeval_path)
print(semeval_files)

['Laptop_Train_v2.xml', 'Laptops_Test_Gold.xml', 'Restaurants_Test_Gold.xml', 'Restaurants_Train_v2.xml']


In [None]:
new_files = {
   'Laptop_Train_v2.xml': 'laptop_train.jsonl',
   'Laptops_Test_Gold.xml': 'laptop_test.jsonl',
   'Restaurants_Test_Gold.xml': 'restaurant_test.jsonl',
   'Restaurants_Train_v2.xml': 'restaurant_train.jsonl'
}

## create collection 

In [None]:
file_id = 0

for f in semeval_files:
    file_path = os.path.join(semeval_path, f)
  
    save_path = os.path.join(collection_path, new_files[f])
  
    print(file_path)
  
    file_id += 1 

    with open(file_path) as semeval_file:
        sentence_elements = ET.parse(semeval_file).getroot().iter('sentence')

        for id_, s in enumerate(sentence_elements):
            sent = s.find('text').text

            doc = {
                'id': 'doc' + str(file_id) + str(id_),
                'contents': sent,
            }

            with jsonlines.open(save_path, mode='a') as writer:
                writer.write(doc)

../Data/SemEval2014_Task4/Laptop_Train_v2.xml


FileNotFoundError: ignored

## create pyserini index

In [7]:
!ls ./indexes

all_index	   lap_test_index   rest_index	      test_index
index_no_metadata  lap_train_index  rest_test_index   train_index
lap_index	   merged_indexes   rest_train_index


In [None]:
!python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
 -threads 1 -input ./collection_no_metadata/ \
 -index indexes/index_no_metadata -storePositions -storeDocvectors -storeRaw

2020-11-25 02:21:35,648 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to INFO
2020-11-25 02:21:35,652 INFO  [main] index.IndexCollection (IndexCollection.java:639) - Starting indexer...
2020-11-25 02:21:35,653 INFO  [main] index.IndexCollection (IndexCollection.java:641) - DocumentCollection path: ./collection_no_metadata/
2020-11-25 02:21:35,653 INFO  [main] index.IndexCollection (IndexCollection.java:642) - CollectionClass: JsonCollection
2020-11-25 02:21:35,654 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Generator: DefaultLuceneDocumentGenerator
2020-11-25 02:21:35,654 INFO  [main] index.IndexCollection (IndexCollection.java:644) - Threads: 1
2020-11-25 02:21:35,655 INFO  [main] index.IndexCollection (IndexCollection.java:645) - Stemmer: porter
2020-11-25 02:21:35,655 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Keep stopwords? false
2020-11-25 02:21:35,656 INFO  [main] index.IndexCollection (IndexCollection.

## test that it worked

In [5]:
from pyserini.search import SimpleSearcher

idx_path = os.path.join('indexes', 'index_no_metadata')

searcher = SimpleSearcher(idx_path)
hits = searcher.search('Boot time, positive')

for i in range(len(hits)):
    print(f'{i+1:2} {hits[i].docid:15} {hits[i].score:.5f}')

 1 doc20           4.41670
 2 doc41766        4.16470
 3 doc1562         3.75720
 4 doc41130        3.62740
 5 doc2625         3.48300
 6 doc2523         3.38260
 7 doc41974        3.38260
 8 doc3255         3.34980
 9 doc12921        3.30620
10 doc4393         3.28690


In [7]:
idx_path = os.path.join('indexes', 'index_no_metadata')
queries_path = os.path.join('train_queries_restaurant.txt')
result_path = os.path.join('train_query_results_restaurant.txt')

In [8]:
# remove if exist because appending
!rm ./train_query_results_restaurant.txt

with open(queries_path) as f:
  q_num = 0
  for query in f:
    q_num += 1
    hits = searcher.search(q=query, k=1000)

    for i in range(len(hits)):
      line = str(q_num) + ' Q0 ' + hits[i].docid + ' ' + str(i+1) + ' ' + '%.8f' % hits[i].score + ' galago'
      
      with open(result_path, 'a') as f:
        f.write("%s\n" % line)
    

rm: cannot remove './train_query_results_restaurant.txt': No such file or directory


In [9]:
queries_path = os.path.join('train_queries_laptop.txt')
result_path = os.path.join('train_query_results_laptop.txt')

# remove if exist because appending
!rm ./train_query_results_laptop.txt

with open(queries_path) as f:
  q_num = 0
  for query in f:
    q_num += 1
    hits = searcher.search(q=query, k=1000)

    for i in range(len(hits)):
      line = str(q_num) + ' Q0 ' + hits[i].docid + ' ' + str(i+1) + ' ' + '%.8f' % hits[i].score + ' galago'
      
      with open(result_path, 'a') as f:
        f.write("%s\n" % line)
    

rm: cannot remove './train_query_results_laptop.txt': No such file or directory


## create unique queries from test files

In [None]:
# loop through laptop and restaurant test files to get unique queries
#   write to file to be read by pyserini at eval time: aspect, sentiment

query_path = os.path.join('../', 'BM25', 'test_queries_restaurant.txt')

queries = []

file_path = os.path.join(collection_path, 'restaurant_test.jsonl')

with jsonlines.open(filepath) as reader:
    for obj in reader:
        for asp in obj['aspects']:
            q = asp[0] + ', ' + asp[1]
            queries.append(q)

unique_queries = set(queries)

with open(query_path, 'w') as f:
    for q in unique_queries:
        f.write("%s\n" % q)

## create qrels.txt (ground truth) for test queries

In [None]:
# loop through all unique queries
#   loop through all document json files and keep track of relevant ones
#   write to qrels file [query #, 0, docID, 1]

qrels_path = os.path.join('../', 'BM25', 'qrels_restaurant.txt')

# remove if already exist because appending
!rm ../BM25/qrels_restaurant.txt

for i, query in enumerate(unique_queries):
  rel_docIDs = []

  for file in all_files:
    filepath = os.path.join(collection_path, file)

    with jsonlines.open(filepath) as reader:
      for obj in reader:
        docID = obj['id']

        for asp in obj['aspects']:
          q = asp[0] + ', ' + asp[1]

          # if aspect sent pair matches query, add docID to relevant doc list
          if query == q:
            rel_docIDs.append(docID)
  
  rel_docIDs = set(rel_docIDs)

  # write query/relevant doc pairs to qrels file
  with open(qrels_path, 'a') as f:
    for rd in rel_docIDs:
      line = str(i+1) + '\t' + '0' + '\t' + rd + '\t' + '1'
      f.write("%s\n" % line)

rm: cannot remove '../BM25/qrels_restaurant.txt': No such file or directory


## Create unique queries for training set

In [None]:
query_path = os.path.join('train_queries_laptop.txt')
qrels_path = os.path.join('train_qrels_laptop.txt')

queries = []
qrels = []

train_data_path = os.path.join('..', 'Data', 'our_datasets', 'laptop_train.json')

train_data = json.load(open(train_data_path))

for j in train_data:
    queries.append(j['query'][0] + ', ' + j['query'][1])
    
    if j['label'] == 1:
        qrels.append(str(j['query_id']+1) + '\t' + '0' + '\t' + 'doc' + str(j['doc_id']) + '\t' + '1')

unique_queries = set(queries)

with open(query_path, 'w') as f:
    for q in unique_queries:
        f.write("%s\n" % q)

with open(qrels_path, 'w') as f:
    for r in qrels:
        f.write(r + "\n")

In [5]:
query_path = os.path.join('train_queries_restaurant.txt')
qrels_path = os.path.join('train_qrels_restaurant.txt')

queries = []
qrels = []

train_data_path = os.path.join('..', 'Data', 'our_datasets', 'restaurant_train.json')

train_data = json.load(open(train_data_path))

for j in train_data:
    queries.append(j['query'][0] + ', ' + j['query'][1])
    
    if j['label'] == 1:
        qrels.append(str(j['query_id']+1) + '\t' + '0' + '\t' + 'doc' + str(j['doc_id']) + '\t' + '1')

unique_queries = set(queries)

with open(query_path, 'w') as f:
    for q in unique_queries:
        f.write("%s\n" % q)

with open(qrels_path, 'w') as f:
    for r in qrels:
        f.write(r + "\n")