# Section 1: running the extraction pipeline

## First, clone the OpineDB repo.

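The link below points to the public repository, so it should clone as-is. If you clone a private copy instead, add your GitHub username and password to the HTTPS link (`https://username:password@github.com/...`) and remove them right after you run the cell, for security.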

In [0]:
# Clone the OpineDB repository into the current working directory; this creates
# the opinedb_public/ folder that the later cells change into.
!git clone https://github.com/rit-git/opinedb_public.git

## Download BERT and other packages

In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 
!unzip uncased_L-12_H-768_A-12.zip
%cd opinedb_public/extractor

In [0]:
!python -m spacy download en_core_web_sm
!python -m nltk.downloader punkt
!python -m nltk.downloader stopwords
!pip install jsonlines
!sudo apt-get install jq

## Train the models

In [0]:
# Train the classification model.
# Positional arguments as passed here: the unzipped BERT directory,
# models/classification/hotel_data, and models/classification/hotel.
# The last path is what the extractor config later uses as "classifier_path";
# presumably the middle one is the training data -- confirm against
# code/train_classifier.py.
!python code/train_classifier.py \
  ../../uncased_L-12_H-768_A-12 \
  models/classification/hotel_data \
  models/classification/hotel

In [0]:
# Train the tagging model.
# Positional arguments as passed here: the unzipped BERT directory,
# models/tagging/hotel_data, and models/tagging/hotel.
# The last path is what the extractor config later uses as "tagging_path";
# presumably the middle one is the training data -- confirm against
# code/train_tagging.py.
!python code/train_tagging.py \
  ../../uncased_L-12_H-768_A-12 \
  models/tagging/hotel_data \
  models/tagging/hotel

In [0]:
# Train the pairing model with BERT's run_classifier.py script.
# The flags enable training and evaluation (no prediction) on the data in
# models/pairing/data, start from the downloaded BERT checkpoint, and write to
# models/pairing/model -- the path the extractor config later uses as
# "pairing_path". The task is registered under the "mrpc" task name.
!python code/bert/run_classifier.py \
   --task_name=mrpc \
   --do_train=True \
   --do_eval=True \
   --do_predict=False \
   --data_dir=models/pairing/data \
   --vocab_file=../../uncased_L-12_H-768_A-12/vocab.txt \
   --bert_config_file=../../uncased_L-12_H-768_A-12/bert_config.json \
   --init_checkpoint=../../uncased_L-12_H-768_A-12/bert_model.ckpt \
   --max_seq_length=128 \
   --train_batch_size=16 \
   --learning_rate=2e-5 \
   --num_train_epochs=10.0 \
   --output_dir=models/pairing/model

## Now you can run the extractor (and output to Google Drive):

In [0]:
# Mount Google Drive at /content/gdrive so later cells can read the input file
# from it and write the extraction output to it.
from google.colab import drive
drive.mount('/content/gdrive') # This will ask you to log in to your Google account

#### Upload the raw_reviews.csv file to Google Drive. The cells below expect it in a folder called "extraction_data" at the root of My Drive.

In [0]:
# List the Drive folder to confirm the upload: the output should include
# the raw_reviews.csv file.
!ls "/content/gdrive/My Drive/extraction_data/"

In [0]:
import json

config = {
  "input_dir_path" : ".",
  "bert_path" : "../../uncased_L-12_H-768_A-12",
  "tagging_path" : "models/tagging/hotel",
  "pairing_path" : "models/pairing/model",
  "classifier_path" : "models/classification/hotel",
  "output_path" : "/content/gdrive/My\ Drive/extraction_data/london_reviews_with_extractions.json" # specify the output path
}

json.dump(config, open('config.json', 'w'))
!cp "/content/gdrive/My Drive/extraction_data/raw_reviews.csv" .
!make clean
!make

# Section 2: Running the extractor experiments

### Repeat the training process 10 times on each dataset. Output the F1 scores to ``*/significance-test.txt``

In [0]:
# Switch to the extractor folder by absolute path so the relative paths used in
# the experiment cell below resolve regardless of the current directory.
%cd /content/opinedb_public/extractor

In [0]:
import os

datasets = ['hotel', 'semeval14-laptop', 'semeval14-restaurant', 'semeval15-restaurant']
reps = 10


for dataset in datasets:
  for _ in range(reps):
    data_path = 'data/experiments/' + dataset
    model_path = 'data/experiments/' + dataset + '_output'
    bert_path = '../../uncased_L-12_H-768_A-12'
    !mkdir -p $model_path
    !python3 code/BERT-BiLSTM-CRF-NER/bert_lstm_ner.py \
      --task_name=NER  \
      --do_train=True   \
      --do_eval=False   \
      --do_predict=True \
      --data_dir=$data_path   \
      --vocab_file=$bert_path/vocab.txt \
      --bert_config_file=$bert_path/bert_config.json \
      --init_checkpoint=$bert_path/bert_model.ckpt \
      --max_seq_length=64   \
      --train_batch_size=32   \
      --learning_rate=2e-5   \
      --num_train_epochs=30.0   \
      --output_dir=$model_path
    
    !cat $model_path/entity_level_predicted_result.txt >> $model_path/significance-test.txt
