# Build `pyserini` Index for SemEval 2014 Task 4

- We first convert the original `xml` dataset files into a document collection in the `jsonl` format that `pyserini` understands.

- We build a `Pyserini` index that includes all documents from train and test sets for both `laptop` and `restaurant`.

- Lastly, we generate `test_queries_{laptop, restaurant}.txt` and `test_qrels_{laptop, restaurant}.txt` from the original test datasets, treating each unique `(aspect, sentiment)` label as a query.

## Google Colab setup

This part only gets executed if this notebook is being run under Google Colab. **Please change the working directory path below in advance!**

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    !pip install pyserini==0.9.4.0 jsonlines==1.2.0

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/BM25'

    # List the directory contents
    !ls

## Import packages

In [2]:
import os
import json 
import random
import xml.etree.ElementTree as ET

from tqdm import tqdm
import jsonlines 
from pyserini.search import SimpleSearcher

## Path setup

In [3]:
# Location of the original SemEval 2014 Task 4 XML files, relative to the
# working directory of this notebook.
semeval_path = os.path.join(
    '..',
    'data',
    'SemEval2014_Task4',
)

# Directory names for the jsonl document collection and for the index.
collection_path, index_path = 'collection', 'index'

In [4]:
# Maps each SemEval XML file to the name of the jsonl file generated from it.
# NOTE: the insertion order matters. The position of an entry in this dict is
# used as the file number embedded in every document id ('doc<file><sentence>').
new_collection_files = {
    'Laptop_Train_v2.xml': 'laptop_train.jsonl',
    'Laptops_Test_Gold.xml': 'laptop_test.jsonl',
    'Restaurants_Test_Gold.xml': 'restaurant_test.jsonl',
    'Restaurants_Train_v2.xml': 'restaurant_train.jsonl',
}

In [5]:
# Maps each test-set XML file to the text file that receives its queries
# (one "aspect, sentiment" query per line).
new_query_files = {
    'Laptops_Test_Gold.xml': 'test_queries_laptop.txt',
    'Restaurants_Test_Gold.xml': 'test_queries_restaurant.txt',
}

In [6]:
# Maps each test-set XML file to the qrels (relevance judgement) file
# generated for its queries.
new_qrels_files = {
    'Laptops_Test_Gold.xml': 'test_qrels_laptop.txt',
    'Restaurants_Test_Gold.xml': 'test_qrels_restaurant.txt',
}

## Create collection 

In [9]:
# Convert every SemEval XML file into a jsonl file holding one
# {'id': ..., 'contents': ...} document per <sentence> element.

# Make sure the output directory exists before writing into it.
os.makedirs(collection_path, exist_ok=True)

for file_id, f in enumerate(new_collection_files.keys()):
    file_path = os.path.join(semeval_path, f)
    save_path = os.path.join(collection_path, new_collection_files[f])

    print(save_path)

    # Hand the path to ElementTree directly so that the parser reads the
    # bytes itself and honours the encoding declared in the XML file,
    # instead of decoding through a text handle with the locale's encoding.
    sentence_elements = ET.parse(file_path).getroot().iter('sentence')

    # Open the output once per file, in 'w' mode: re-running this cell then
    # overwrites the file rather than appending a second copy of every
    # document (which would put duplicate ids into the collection).
    with jsonlines.open(save_path, mode='w') as writer:
        for id_, s in enumerate(sentence_elements):
            # Document id = 'doc' + position of the source file in
            # `new_collection_files` + position of the sentence in the file.
            writer.write({
                'id': 'doc' + str(file_id) + str(id_),
                'contents': s.find('text').text,
            })

../data/SemEval2014_Task4/Laptop_Train_v2.xml
../data/SemEval2014_Task4/Laptops_Test_Gold.xml
../data/SemEval2014_Task4/Restaurants_Test_Gold.xml
../data/SemEval2014_Task4/Restaurants_Train_v2.xml


## Create `pyserini` index

In [10]:
!python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
 -threads 1 -input collection \
 -index index -storePositions -storeDocvectors -storeRaw

2020-12-06 14:17:02,379 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to INFO
2020-12-06 14:17:02,407 INFO  [main] index.IndexCollection (IndexCollection.java:639) - Starting indexer...
2020-12-06 14:17:02,410 INFO  [main] index.IndexCollection (IndexCollection.java:641) - DocumentCollection path: collection
2020-12-06 14:17:02,413 INFO  [main] index.IndexCollection (IndexCollection.java:642) - CollectionClass: JsonCollection
2020-12-06 14:17:02,415 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Generator: DefaultLuceneDocumentGenerator
2020-12-06 14:17:02,417 INFO  [main] index.IndexCollection (IndexCollection.java:644) - Threads: 1
2020-12-06 14:17:02,420 INFO  [main] index.IndexCollection (IndexCollection.java:645) - Stemmer: porter
2020-12-06 14:17:02,421 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Keep stopwords? false
2020-12-06 14:17:02,422 INFO  [main] index.IndexCollection (IndexCollection.java:647) - Sto

## Test the new index

In [11]:
# Smoke test: run one "aspect, sentiment" style query against the new index
# and print rank, document id and score of every returned hit.
# (SimpleSearcher is already imported in the import cell at the top.)

# Use the index directory configured in the path setup cell.
idx_path = index_path

searcher = SimpleSearcher(idx_path)
hits = searcher.search('Boot time, positive')

for rank, hit in enumerate(hits, start=1):
    print(f'{rank:2} {hit.docid:15} {hit.score:.5f}')

 1 doc10           4.41670
 2 doc31766        4.16470
 3 doc0562         3.75720
 4 doc31130        3.62740
 5 doc1625         3.48300
 6 doc1523         3.38260
 7 doc31974        3.38260
 8 doc2255         3.34980
 9 doc02921        3.30620
10 doc3393         3.28690


## Create test queries from the test dataset

In [22]:
# For each test-set XML file, collect one "aspect, sentiment" query per
# aspectTerm annotation and write the unique queries, one per line.
for f in new_query_files:
    file_path = os.path.join(semeval_path, f)
    save_path = new_query_files[f]

    print(save_path)

    queries = []

    # Pass the path itself so the XML parser handles the file's encoding.
    for s in ET.parse(file_path).getroot().iter('sentence'):
        for o in s.iter('aspectTerm'):
            aspect_term = o.get('term')
            sentiment = o.get('polarity')

            # Ignore "conflict" labels
            if sentiment != 'conflict':
                queries.append(aspect_term + ', ' + sentiment)

    print("Total number of queries:", len(queries))

    # Sort the de-duplicated queries: iterating a bare set yields a different
    # order on every run (string hashing is randomised), which would change
    # the line number -- i.e. the query number -- of every query between runs.
    unique_queries = sorted(set(queries))

    print("Total number of unique queries:", len(unique_queries))

    print()

    with open(save_path, 'w') as new_file:
        for q in unique_queries:
            new_file.write("%s\n" % q)

test_queries_laptop.txt
Total number of queries: 638
Total number of unique queries: 475

test_queries_restaurant.txt
Total number of queries: 1120
Total number of unique queries: 642



## Create qrels.txt (ground truth) for test queries

In [27]:
# For each test-set XML file, write a qrels file with one line
#   <query number> \t 0 \t <doc id> \t 1
# for every document (sentence) annotated with the query's
# (aspect, sentiment) pair. Query numbers are the 1-based line numbers of the
# corresponding test queries file.

# Document ids in the index embed the position of the source file in
# `new_collection_files` (that is how the collection was generated), NOT the
# position in `new_qrels_files`. Look the number up there so that the qrels
# reference the documents that were actually indexed for this file.
collection_file_ids = {name: pos for pos, name in enumerate(new_collection_files)}

for f in new_qrels_files:
    file_path = os.path.join(semeval_path, f)
    query_path = new_query_files[f]
    save_path = new_qrels_files[f]

    print(save_path)

    file_id = collection_file_ids[f]

    with open(query_path, 'r') as test_query_file:
        unique_queries = [line.rstrip('\n') for line in test_query_file]

    # Query text -> 1-based query number. Matching on the complete
    # "aspect, sentiment" string (exactly what the queries file contains)
    # also works for aspect terms that themselves contain a comma, and
    # replaces a scan over all queries for every annotation.
    query_numbers = {q: n for n, q in enumerate(unique_queries, start=1)}
    rel_docIDs = {n: [] for n in query_numbers.values()}

    # Pass the path itself so the XML parser handles the file's encoding.
    sentence_elements = ET.parse(file_path).getroot().iter('sentence')

    for id_, s in enumerate(sentence_elements):
        # doc_id used in our Pyserini index
        doc_id = 'doc' + str(file_id) + str(id_)

        for o in s.iter('aspectTerm'):
            aspect_term = o.get('term')
            sentiment = o.get('polarity')

            # Ignore "conflict" labels
            if sentiment == 'conflict':
                continue

            query_number = query_numbers.get(aspect_term + ', ' + sentiment)
            if query_number is None:
                continue

            # A sentence can carry the same (aspect, sentiment) pair more
            # than once; sentences are visited in order, so a repeat is
            # always the last entry. Record each document once per query.
            docs = rel_docIDs[query_number]
            if not docs or docs[-1] != doc_id:
                docs.append(doc_id)

    # write query/relevant doc pairs to qrels file
    with open(save_path, 'w') as qrels_file:
        for query_number, docs in rel_docIDs.items():
            for rd in docs:
                line = str(query_number) + '\t' + '0' + '\t' + rd + '\t' + '1'
                qrels_file.write("%s\n" % line)

test_qrels_laptop.txt
test_qrels_restaurant.txt


In [None]:
# NOTE(review): this cell has no execution count and looks like an earlier
# draft of the qrels generation -- confirm whether it can be removed.
# As written it cannot run against the rest of this notebook:
#   * `all_files` is not defined in any cell of this notebook;
#   * it reads obj['aspects'], but the documents written by the collection
#     cell only contain the keys 'id' and 'contents';
#   * `unique_queries` is whatever the previously executed cell left behind.
#
# loop through all unique queries
#   loop through all document json files and keep track of relevant ones
#   write to qrels file [query #, 0, docID, 1]

qrels_path = os.path.join('../', 'BM25', 'qrels_restaurant.txt')

# remove if already exist because appending
# (the captured output shows rm failing when the file does not exist yet)
!rm ../BM25/qrels_restaurant.txt

for i, query in enumerate(unique_queries):
  # doc ids judged relevant for the current query
  rel_docIDs = []

  for file in all_files:
    filepath = os.path.join(collection_path, file)

    with jsonlines.open(filepath) as reader:
      for obj in reader:
        docID = obj['id']

        # presumably each entry is an (aspect term, sentiment) pair -- TODO confirm
        for asp in obj['aspects']:
          q = asp[0] + ', ' + asp[1]

          # if aspect sent pair matches query, add docID to relevant doc list
          if query == q:
            rel_docIDs.append(docID)
  
  # drop duplicate doc ids (iteration order of the set is not defined)
  rel_docIDs = set(rel_docIDs)

  # write query/relevant doc pairs to qrels file
  # (append mode: the file is reopened once per query)
  with open(qrels_path, 'a') as f:
    for rd in rel_docIDs:
      line = str(i+1) + '\t' + '0' + '\t' + rd + '\t' + '1'
      f.write("%s\n" % line)

rm: cannot remove '../BM25/qrels_restaurant.txt': No such file or directory
