# Create the training dataset for Siamese (Cosine)

## Google Colab setups

This part only gets executed if this notebook is being run under Google Colab. **Please change the working path  directory below in advance!**

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    #!pip install pyserini==0.9.4.0

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/data'

    # List the directory contents
    !ls

## Import packages

In [2]:
import os
import random
import json
import pathlib

from tqdm import tqdm
import xml.etree.ElementTree as ET

In [3]:
# Random seed settings
random_seed = 646
random.seed(random_seed) # Python

## Path settings

In [4]:
semeval_path = os.path.join('.', 'SemEval2014_Task4')
our_path = os.path.join('.', 'our_datasets_partially_correct_labels_cosine')

In [5]:
pathlib.Path(our_path).mkdir(parents=True, exist_ok=True)

In [6]:
semeval_files = os.listdir(semeval_path)
print(semeval_files)

['PLACE_SEMEVAL_DATASET_FILES_HERE', 'Laptop_Train_v2.xml', 'Restaurants_Train_v2.xml', 'Laptops_Test_Gold.xml', 'Restaurants_Test_Gold.xml']


In [7]:
new_files = {
   'Laptop_Train_v2.xml': 'laptop_train.json',
   'Restaurants_Train_v2.xml': 'restaurant_train.json',
}

## Process SemEval data files

In [8]:
for f in new_files.keys():
    print("Processing", f)

    file_path = os.path.join(semeval_path, f)
    save_path = os.path.join(our_path, new_files[f])

    # find unique queries
    with open(file_path) as semeval_file:
        sentence_elements = ET.parse(semeval_file).getroot().iter('sentence')

        all_queries = []

        for id_, s in enumerate(sentence_elements):
            for o in s.iter('aspectTerm'):
                aspect = o.get('term')
                sentiment = o.get('polarity')

                if sentiment != 'conflict': 
                    all_queries.append((aspect,sentiment))

        queries = list(set(all_queries))
        queries.sort()

        print('found {}/{} unique queries.'.format(len(queries), len(all_queries)))

    # find document, (aspect, sentiment) pairs (documents and their relevance judgements)
    with open(file_path) as semeval_file:
        sentence_elements = ET.parse(semeval_file).getroot().iter('sentence')

        all_docs = []
    
        for doc_id, s in enumerate(sentence_elements): 
            doc = s.find('text').text

            as_pairs = []

            for o in s.iter('aspectTerm'):
                aspect = o.get('term')
                sentiment = o.get('polarity')

                if sentiment == 'conflict':
                    continue 

                as_pairs.append((aspect, sentiment))
                
            as_pairs = list(set(as_pairs))
            as_pairs.sort()

            all_docs.append({'doc': doc, 'pairs': as_pairs})

        print('found {} documents.'.format(len(all_docs)))

    examples = []

    num_pos = 0
    num_aspect_only = 0
    num_sentiment_only = 0
    num_neg = 0 

    # create examples for all unique queries and docs 
    for query_id, query in enumerate(queries):
        # print('current unique query: {}'.format(query))
        for doc_id, doc_ex in enumerate(all_docs):
            # print(cur_qs)

            # If `query = (aspect, sentiment)` is in one of the [(aspect, sentiment)] labels
            for aspect_sentiment in doc_ex['pairs']:
                if query[0] == aspect_sentiment[0] and query[1] == aspect_sentiment[1]:
                    label = 1 # This is a relevant document
                    num_pos += 1
                elif query[0] == aspect_sentiment[0]: # If only the aspect label matches
                    label = 0.7
                    num_aspect_only += 1
                elif query[1] == aspect_sentiment[1]: # Sentiment label only
                    label = 0.2
                    num_sentiment_only += 1
                else: # Both labels not matching
                    label = 0
                    num_neg += 1
        
                example = {
                    'query_id': query_id,
                    'query': query,
                    'doc_id': doc_id,
                    'doc': doc_ex['doc'],
                    'label': label
                }

                examples.append(example)

    print('generated {} examples, {} positive {} aspect only, {} sentiment only, {} negative.'.format(len(examples), num_pos, num_aspect_only, num_sentiment_only, num_neg))

    # sample negative examples to get more even distribution
    final_examples = []

    new_num_aspect_only = 0
    new_num_sentiment_only = 0
    new_num_neg = 0

    for example in examples:
        if example['label'] == 1:
            final_examples.append(example)
        else:

            if example['label'] == 0.7:
                sample_rate = int(num_aspect_only/num_pos)

                if sample_rate <= 1:
                    final_examples.append(example)
                    new_num_aspect_only += 1
                else:
                    if random.randint(0, sample_rate-1) != (sample_rate-1):
                        continue

                    final_examples.append(example)
                    new_num_aspect_only += 1

            elif example['label'] == 0.2:
                sample_rate = int(num_sentiment_only/num_pos)

                if sample_rate <= 1:
                    final_examples.append(example)
                    new_num_sentiment_only += 1
                else:
                    if random.randint(0, sample_rate-1) != (sample_rate-1):
                        continue

                    final_examples.append(example)
                    new_num_sentiment_only += 1

            else:
                sample_rate = int(num_neg/num_pos)

                if sample_rate <= 1:
                    final_examples.append(example)
                    new_num_neg += 1
                else:
                    if random.randint(0, sample_rate-1) != (sample_rate-1):
                        continue

                    final_examples.append(example)
                    new_num_neg += 1

    print('final generated {} examples, {} positive {} aspect only, {} sentiment only, {} negative.'.format(len(final_examples), num_pos, new_num_aspect_only, new_num_sentiment_only, new_num_neg))

    # write file
    print('saving to: {}.'.format(save_path))
    
    with open(save_path, 'w') as fout:
        json.dump(final_examples , fout)
    
    print()

Processing Laptop_Train_v2.xml
found 1250/2313 unique queries.
found 3045 documents.
generated 2848750 examples, 2279 positive 2107 aspect only, 992013 sentiment only, 1852351 negative.
final generated 8940 examples, 2279 positive 2107 aspect only, 2275 sentiment only, 2279 negative.
saving to: ./our_datasets_partially_correct_labels_cosine/laptop_train.json.

Processing Restaurants_Train_v2.xml
found 1560/3602 unique queries.
found 3041 documents.
generated 5584800 examples, 3580 positive 4205 aspect only, 2390610 sentiment only, 3186405 negative.
final generated 14884 examples, 3580 positive 4205 aspect only, 3567 sentiment only, 3532 negative.
saving to: ./our_datasets_partially_correct_labels_cosine/restaurant_train.json.



In [9]:
final_examples[0]

{'query_id': 0,
 'query': ("'gourmet' Indian cuisine", 'neutral'),
 'doc_id': 103,
 'doc': 'The atmosphere is unheralded, the service impecible, and the food magnificant.',
 'label': 0}