# Create the training dataset for Siamese (Contrastive / Online Contrastive)

## Google Colab setups

This part only gets executed if this notebook is being run under Google Colab. **Please change the working directory path below in advance!**

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    #!pip install pyserini==0.9.4.0

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/data'

    # List the directory contents
    !ls

## Import packages

In [2]:
import os
import json 
import random
import pathlib

from tqdm import tqdm
import xml.etree.ElementTree as ET

In [3]:
# Seed Python's built-in RNG once, up front, so the negative sampling done
# further down draws the same sequence on every fresh run.
random_seed = 646
random.seed(a=random_seed)

## Path settings

In [4]:
# Input folder (raw SemEval-2014 Task 4 XML files) and output folder
# (generated JSON datasets), both relative to the current working directory.
semeval_path, our_path = (
    os.path.join('.', folder_name)
    for folder_name in ('SemEval2014_Task4', 'our_datasets')
)

In [5]:
# Make sure the output folder exists; a no-op when it is already there.
_our_dir = pathlib.Path(our_path)
_our_dir.mkdir(exist_ok=True, parents=True)

In [6]:
# List whatever is currently inside the SemEval input folder so the reader can
# check that the expected XML files are present. os.listdir returns entries in
# arbitrary order, so the displayed order may differ between machines.
files = os.listdir(semeval_path)
files

['PLACE_SEMEVAL_DATASET_FILES_HERE',
 'Laptop_Train_v2.xml',
 'Restaurants_Train_v2.xml',
 'Laptops_Test_Gold.xml',
 'Restaurants_Test_Gold.xml']

In [7]:
# Which SemEval XML files to convert, and the JSON file name each one is
# written to inside the output folder.
new_files = dict([
    ('Laptop_Train_v2.xml', 'laptop_train.json'),
    ('Restaurants_Train_v2.xml', 'restaurant_train.json'),
])

## Process SemEval data files

In [8]:
def _load_semeval_docs(xml_path):
    """Parse one SemEval XML file in a single pass.

    Args:
        xml_path: path of the XML file to read.

    Returns:
        A tuple ``(all_queries, all_docs)`` where ``all_queries`` is the list of
        every (aspect term, polarity) pair whose polarity is not 'conflict'
        (duplicates kept), and ``all_docs`` holds one
        ``{'doc': sentence text, 'pairs': sorted unique pairs}`` dict per
        ``<sentence>`` element, in file order.
    """
    # Give ElementTree the path rather than a text-mode file handle, so the
    # parser decodes the bytes with the encoding declared in the XML itself
    # instead of whatever the platform's locale encoding happens to be.
    root = ET.parse(xml_path).getroot()

    all_queries = []
    all_docs = []

    for sentence in root.iter('sentence'):
        pairs = [
            (term.get('term'), term.get('polarity'))
            for term in sentence.iter('aspectTerm')
            if term.get('polarity') != 'conflict'
        ]

        all_queries.extend(pairs)
        all_docs.append({
            'doc': sentence.find('text').text,
            'pairs': sorted(set(pairs)),
        })

    return all_queries, all_docs


def _sample_examples(queries, all_docs, sample_rate):
    """Yield every positive example and about 1 in ``sample_rate`` negatives.

    Every (query, document) combination is visited in query-major order. A
    combination is positive (label 1) when the query is one of the document's
    pairs. Each negative costs exactly one ``random.randint`` draw and is kept
    only when the draw equals ``sample_rate - 1``.
    """
    # Sets make the per-combination membership test O(1).
    pair_sets = [set(doc_ex['pairs']) for doc_ex in all_docs]

    for query_id, query in enumerate(queries):
        for doc_id, doc_ex in enumerate(all_docs):
            if query in pair_sets[doc_id]:
                label = 1
            elif random.randint(0, sample_rate - 1) == (sample_rate - 1):
                label = 0
            else:
                continue

            yield {
                'query_id': query_id,
                'query': query,
                'doc_id': doc_id,
                'doc': doc_ex['doc'],
                'label': label
            }


for file, new_file in new_files.items():
    filepath = os.path.join(semeval_path, file)
    savepath = os.path.join(our_path, new_file)

    # find unique queries and document, (aspect, sentiment) pairs
    all_queries, all_docs = _load_semeval_docs(filepath)

    queries = sorted(set(all_queries))
    print('found {}/{} unique queries.'.format(len(queries), len(all_queries)))
    print('found {} documents.'.format(len(all_docs)))

    # Count the full query x document cross product without materializing it.
    # Every pair stored on a document is one of the unique queries, so each
    # stored pair contributes exactly one positive example.
    num_examples = len(queries) * len(all_docs)
    num_pos = sum(len(doc_ex['pairs']) for doc_ex in all_docs)
    num_neg = num_examples - num_pos

    print('generated {} examples, {} positive {} negative.'.format(num_examples, num_pos, num_neg))

    # Resampling: keep roughly as many negatives as positives.
    # Clamp to >= 1 so a file without positives (division by zero) or with
    # fewer negatives than positives (rate 0 -> invalid randint range) keeps
    # all of its negatives instead of raising.
    sample_rate = max(1, num_neg // num_pos) if num_pos else 1

    # sample negative examples to get more even distribution
    final_examples = list(_sample_examples(queries, all_docs, sample_rate))
    new_num_neg = len(final_examples) - num_pos

    print('final generated {} examples, {} positive {} negative.'.format(len(final_examples), num_pos, new_num_neg))

    # write file
    print('saving to: {}.'.format(savepath))

    with open(savepath, 'w', encoding='utf-8') as fout:
        json.dump(final_examples, fout)

    print()

found 1250/2313 unique queries.
found 3045 documents.
generated 3806250 examples, 2279 positive 3803971 negative.
final generated 4545 examples, 2279 positive 2266 negative.
saving to: ./our_datasets/laptop_train.json.

found 1560/3602 unique queries.
found 3041 documents.
generated 4743960 examples, 3580 positive 4740380 negative.
final generated 7102 examples, 3580 positive 3522 negative.
saving to: ./our_datasets/restaurant_train.json.



In [9]:
# Peek at the first generated example. final_examples is reassigned for each
# input file in the processing loop, so this shows the first example of the
# file that was processed last.
final_examples[0]

{'query_id': 0,
 'query': ("'gourmet' Indian cuisine", 'neutral'),
 'doc_id': 798,
 'doc': "I went to Ruby Foo's after work with a group of 6.",
 'label': 0}