# Google Colaboratory, PyTorch GPU, and Package Imports

## Colab Google Drive Mounts and Python Package Install

In [None]:
import os

# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    !pip install -r '../requirements.txt'

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/Baseline BERT-ADA'

    # NVidia APEX install
    %cd apex
    !pip install -v --no-cache-dir ./
    %cd ..

    # List the directory contents
    !ls

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1zyXK0VOQZwuIfSaMkgJTmwtvSpUOQ1zm/CS646_Final_Project/Baseline BERT-ADA
Archive:  apex-master.zip
8a1ed9e8d35dfad26fb973996319965e4224dcdd
   creating: apex-master/
  inflating: apex-master/.gitignore  
  inflating: apex-master/.gitmodules  
 extracting: apex-master/.nojekyll   
  inflating: apex-master/LICENSE     
  inflating: apex-master/README.md   
   creating: apex-master/apex/
   creating: apex-master/apex/RNN/
 extracting: apex-master/apex/RNN/README.md  
  inflating: apex-master/apex/RNN/RNNBackend.py  
  inflating: apex-master/apex/RNN/__init__.py  
  inflating: apex-master/apex/RNN/cells.py  
  inflating: apex-master/apex/RNN/models.py  
  inflating: apex-master/apex/__init__.py  
   creating: apex-m

In [None]:
# CUDA Multi GPU
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="1,2,3"

In [None]:
# IPython reloading magic
# (autoreload mode 2 lets edits to imported local modules be picked up live,
# without restarting the kernel)
%load_ext autoreload
%autoreload 2

## Package Imports

In [None]:
import sys
import numpy as np
import torch
import transformers
import datasets

import utils

# Seed the PyTorch and NumPy random number generators with one shared value
random_seed = 646
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Report the interpreter and library versions used for this run
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

Python version: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
NumPy version: 1.18.5
PyTorch version: 1.7.0+cu101
Transformers version: 3.4.0


## PyTorch GPU settings

In [None]:
# torch.device / CUDA Setup
use_cuda = True
use_colab_tpu = False

# A Colab TPU counts as available only when it was requested AND the
# COLAB_TPU_ADDR environment variable holds a non-empty value.
# (The earlier try/except also set this flag to True on failure, so a
# missing TPU was never detected.)
colab_tpu_available = bool(use_colab_tpu and os.environ.get('COLAB_TPU_ADDR'))

if use_cuda and torch.cuda.is_available():
    torch_device = torch.device('cuda')

    # cuDNN determinism is currently OFF.
    # Set this to True to make your output immediately reproducible
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False

    # cuDNN 'benchmark' mode is currently ON.
    # Set this to False if you want to measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True

    # Faster Host to GPU copies with page-locked memory
    use_pin_memory = True

    # CUDA libraries version information
    print("CUDA Version: " + str(torch.version.cuda))
    print("cuDNN Version: " + str(torch.backends.cudnn.version()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name()))
    print("CUDA Capabilities: "+ str(torch.cuda.get_device_capability()))

elif use_colab_tpu and colab_tpu_available:
    # This needs to be installed separately
    # https://github.com/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb
    import torch_xla
    import torch_xla.core.xla_model as xm

    torch_device = xm.xla_device()

    # Page-locked memory is a host-to-GPU optimisation; define the flag here
    # too so that every branch leaves `use_pin_memory` set.
    use_pin_memory = False

else:
    torch_device = torch.device('cpu')
    use_pin_memory = False

CUDA Version: 10.1
cuDNN Version: 7603
CUDA Device Name: Tesla T4
CUDA Capabilities: (7, 5)


# Fine-tune for ATSC

## Load the previously pretrained BERT with a new sequence classification head

In [None]:
# Load the previously pretrained checkpoint and attach a new classification
# head with 3 output labels (Positive, Negative, Neutral)
model_finetuning = (
    transformers.AutoModelForSequenceClassification
    .from_pretrained('laptops_and_restaurants_2mio_ep15', num_labels=3)
)

Some weights of the model checkpoint at laptops_and_restaurants_2mio_ep15 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mod

In [None]:
# The tokenizer is loaded from the same checkpoint name as the model
tokenizer_finetuning = (
    transformers.AutoTokenizer
    .from_pretrained('laptops_and_restaurants_2mio_ep15')
)

## Load the SemEval 2014 dataset

### Laptop

In [None]:
# Laptop domain: build train/test splits from the SemEval XML files using the
# local dataset loading script
laptop = datasets.load_dataset(
    './dataset_scripts/semeval2014_task4',
    data_files=dict(
        train='semeval_data_files/Laptop_Train_v2.xml',
        test='semeval_data_files/Laptops_Test_Gold.xml',
    ),
    cache_dir='./dataset_cache',
)

data_laptop_train, data_laptop_test = laptop['train'], laptop['test']

Using custom data configuration default


Downloading and preparing dataset sem_eval2014_task4_dataset/default-16e61ef2dc6af78c (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to ./dataset_cache/sem_eval2014_task4_dataset/default-16e61ef2dc6af78c/0.0.1/87e9c45372082ad462e0ec7fb129016e258a2fa914900241af50e9194846289e...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset sem_eval2014_task4_dataset downloaded and prepared to ./dataset_cache/sem_eval2014_task4_dataset/default-16e61ef2dc6af78c/0.0.1/87e9c45372082ad462e0ec7fb129016e258a2fa914900241af50e9194846289e. Subsequent calls will reuse this data.


In [None]:
# Number of raw examples in the laptop train / test splits (one per line)
print(len(data_laptop_train), len(data_laptop_test), sep='\n')

1462
411


In [None]:
# Peek at the first raw laptop training example
print(data_laptop_train[0])

{'opinions': {'aspect': ['cord', 'battery life'], 'sentiment': [2, 0]}, 'text': 'I charge it at night and skip taking the cord with me because of the good battery life.'}


### Restaurants

In [None]:
# Restaurants domain: build train/test splits from the SemEval XML files using
# the local dataset loading script
restaurants = datasets.load_dataset(
    './dataset_scripts/semeval2014_task4',
    data_files=dict(
        train='semeval_data_files/Restaurants_Train_v2.xml',
        test='semeval_data_files/Restaurants_Test_Gold.xml',
    ),
    cache_dir='./dataset_cache',
)

data_restaurants_train, data_restaurants_test = restaurants['train'], restaurants['test']

Using custom data configuration default


Downloading and preparing dataset sem_eval2014_task4_dataset/default-f942b4721d08da6a (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to ./dataset_cache/sem_eval2014_task4_dataset/default-f942b4721d08da6a/0.0.1/87e9c45372082ad462e0ec7fb129016e258a2fa914900241af50e9194846289e...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset sem_eval2014_task4_dataset downloaded and prepared to ./dataset_cache/sem_eval2014_task4_dataset/default-f942b4721d08da6a/0.0.1/87e9c45372082ad462e0ec7fb129016e258a2fa914900241af50e9194846289e. Subsequent calls will reuse this data.


In [None]:
# Number of raw examples in the restaurants train / test splits (one per line)
print(len(data_restaurants_train), len(data_restaurants_test), sep='\n')

1978
600


In [None]:
# Peek at the first raw restaurants training example
print(data_restaurants_train[0])

{'opinions': {'aspect': ['staff'], 'sentiment': [1]}, 'text': 'But the staff was so horrible to us.'}


### Joint Domain dataset: Merge Laptops and Restaurants

In [None]:
# Joint-domain raw data: laptop examples followed by restaurants examples
data_joint_train = datasets.concatenate_datasets(
    [data_laptop_train, data_restaurants_train]
)
data_joint_test = datasets.concatenate_datasets(
    [data_laptop_test, data_restaurants_test]
)

In [None]:
# Number of raw examples in the joint train / test sets (one per line)
print(len(data_joint_train), len(data_joint_test), sep='\n')

3440
1011


### Preprocessing

In [None]:
def process_data_point(data_point):
    """Expand a batch of sentences into one encoded example per opinion.

    Despite the name, this is written for batched use: `data_point['text']`
    is indexed as a sequence of sentences and `data_point['opinions']` as a
    parallel sequence of dicts holding 'aspect' and 'sentiment' lists.
    Each (sentence, aspect) pair is tokenized as a sentence pair with the
    module-level `tokenizer_finetuning`, padded to 256 tokens, truncating
    only the sentence when the pair is too long.

    Returns:
        dict with 'input_ids', 'token_type_ids', 'attention_mask' and
        'label' lists, one entry per (sentence, aspect) pair.
    """
    encoded_fields = ('input_ids', 'token_type_ids', 'attention_mask')

    expanded = {field: [] for field in encoded_fields}
    expanded['label'] = []

    for row, row_opinions in enumerate(data_point['opinions']):
        for k, aspect_term in enumerate(row_opinions['aspect']):
            # First segment: the sentence, second segment: the aspect target
            pair_encoding = tokenizer_finetuning(
                data_point['text'][row],
                aspect_term,
                truncation='only_first',
                padding='max_length',
                max_length=256)

            for field in encoded_fields:
                expanded[field].append(pair_encoding[field])
            expanded['label'].append(row_opinions['sentiment'][k])

    return expanded

In [None]:
# Replace the original columns with the features produced by
# process_data_point() (applied in batched mode)

# Restaurants only
train_dataset_restaurants = data_restaurants_train.map(
    process_data_point,
    batched=True,
    remove_columns=data_restaurants_train.column_names,
)
test_dataset_restaurants = data_restaurants_test.map(
    process_data_point,
    batched=True,
    remove_columns=data_restaurants_test.column_names,
)

# Laptops only
train_dataset_laptop = data_laptop_train.map(
    process_data_point,
    batched=True,
    remove_columns=data_laptop_train.column_names,
)
test_dataset_laptop = data_laptop_test.map(
    process_data_point,
    batched=True,
    remove_columns=data_laptop_test.column_names,
)

# Joint domain 
#train_dataset_joint = data_joint_train.map(
#    process_data_point, batched=True, remove_columns=data_joint_train.column_names)
#test_dataset_joint = data_joint_test.map(
#    process_data_point, batched=True, remove_columns=data_joint_test.column_names)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




### Train-validation split

In [None]:
# 80% of the processed laptop training set is kept for training;
# the remainder becomes the validation set
new_train_dataset_size_laptop = int(len(train_dataset_laptop) * 0.8)
new_valid_dataset_size_laptop = (
    len(train_dataset_laptop) - new_train_dataset_size_laptop
)

print(f"Training dataset (laptop) after split: {new_train_dataset_size_laptop}")
print(f"Validation dataset (laptop) after split: {new_valid_dataset_size_laptop}")

Training dataset (laptop) after split: 1850
Validation dataset (laptop) after split: 463


In [None]:
# 80% of the processed restaurants training set is kept for training;
# the remainder becomes the validation set
new_train_dataset_size_restaurants = int(len(train_dataset_restaurants) * 0.8)
new_valid_dataset_size_restaurants = (
    len(train_dataset_restaurants) - new_train_dataset_size_restaurants
)

print(f"Training dataset (restaurants) after split: {new_train_dataset_size_restaurants}")
print(f"Validation dataset (restaurants) after split: {new_valid_dataset_size_restaurants}")

Training dataset (restaurants) after split: 2881
Validation dataset (restaurants) after split: 721


In [None]:
# Shuffle each domain's processed training set with the shared random seed
# before slicing it into train/validation parts
train_dataset_laptop_shuffled = train_dataset_laptop.shuffle(seed=random_seed)
train_dataset_restaurants_shuffled = train_dataset_restaurants.shuffle(seed=random_seed)

In [None]:
# First `new_train_dataset_size_laptop` shuffled rows -> training split
new_train_dataset_laptop = train_dataset_laptop_shuffled.select(
    indices=np.arange(new_train_dataset_size_laptop)
)
# The rows right after them -> validation split
new_valid_dataset_laptop = train_dataset_laptop_shuffled.select(
    indices=np.arange(
        new_train_dataset_size_laptop,
        new_train_dataset_size_laptop + new_valid_dataset_size_laptop,
    )
)

In [None]:
# Sanity check: sizes of the laptop train / validation splits (one per line)
print(len(new_train_dataset_laptop), len(new_valid_dataset_laptop), sep='\n')

1850
463


In [None]:
# First `new_train_dataset_size_restaurants` shuffled rows -> training split
new_train_dataset_restaurants = train_dataset_restaurants_shuffled.select(
    indices=np.arange(new_train_dataset_size_restaurants)
)
# The rows right after them -> validation split
new_valid_dataset_restaurants = train_dataset_restaurants_shuffled.select(
    indices=np.arange(
        new_train_dataset_size_restaurants,
        new_train_dataset_size_restaurants + new_valid_dataset_size_restaurants,
    )
)

In [None]:
# Sanity check: sizes of the restaurants train / validation splits (one per line)
print(len(new_train_dataset_restaurants), len(new_valid_dataset_restaurants), sep='\n')

2881
721


In [None]:
# Merge the per-domain splits (laptop first, then restaurants) into the
# final fine-tuning train and validation sets
train_split_dataset_finetuning = datasets.concatenate_datasets(
    [new_train_dataset_laptop, new_train_dataset_restaurants]
)
validation_split_dataset_finetuning = datasets.concatenate_datasets(
    [new_valid_dataset_laptop, new_valid_dataset_restaurants]
)

In [None]:
# Shuffle the merged sets as well so the two domains are interleaved
# (note: this cell rebinds both names to their shuffled versions)
train_split_dataset_finetuning = (
    train_split_dataset_finetuning.shuffle(seed=random_seed)
)
validation_split_dataset_finetuning = (
    validation_split_dataset_finetuning.shuffle(seed=random_seed)
)

In [None]:
# Sizes of the combined train / validation sets (one per line)
print(
    len(train_split_dataset_finetuning),
    len(validation_split_dataset_finetuning),
    sep='\n',
)

4731
1184


## Fine-tune

### Training settings

In [None]:
training_args_finetuning = transformers.TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./progress_finetuning/bert_ada/results',
    overwrite_output_dir=True,
    logging_dir='./progress_finetuning/bert_ada/logs',

    # Optimisation schedule
    num_train_epochs=7,
    learning_rate=2e-5,
    warmup_steps=300,
    weight_decay=0.01,

    # Batch sizes
    per_device_train_batch_size=32,  # Was 8 * 4 GPUs
    per_device_eval_batch_size=32,  # Was 8 * 4 GPUs

    # Evaluation, logging and checkpoint cadence
    evaluate_during_training=True,
    logging_steps=100,
    save_steps=400,
    load_best_model_at_end=True,

    # Mixed-precision training
    fp16=True,
    fp16_opt_level='O2',

    # Data loading
    dataloader_num_workers=22,
)



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        pred: Prediction object exposing `label_ids` (the gold class ids)
            and `predictions` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 gives the same scores as the default setting (which
    # also counts an undefined precision/recall as 0) but does not emit an
    # UndefinedMetricWarning whenever one of the classes is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=labels, y_pred=preds, labels=[0, 1, 2], average='macro',
        zero_division=0)
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Wire the model, training arguments, data splits and metric function together
trainer_finetuning = transformers.Trainer(
    model=model_finetuning,
    args=training_args_finetuning,
    train_dataset=train_split_dataset_finetuning,
    eval_dataset=validation_split_dataset_finetuning,
    compute_metrics=compute_metrics,
)

### Training loop

In [None]:
%%time
# Run fine-tuning; the %%time cell magic reports CPU and wall-clock time
trainer_finetuning.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,1.098307,0.947593,0.619932,0.405315,0.457022,0.44189
200,0.673575,0.602311,0.772804,0.660418,0.750673,0.6629
300,0.549678,0.530265,0.776182,0.717141,0.718975,0.7248
400,0.413051,0.468967,0.814189,0.766884,0.767665,0.767016
500,0.291895,0.534344,0.829392,0.77229,0.795427,0.764903
600,0.237163,0.54151,0.836993,0.791246,0.806556,0.780853
700,0.14518,0.558321,0.846284,0.800594,0.810295,0.796232
800,0.106559,0.595608,0.844595,0.800837,0.803167,0.799095
900,0.090404,0.611259,0.844595,0.801865,0.80613,0.798149
1000,0.06244,0.635462,0.844595,0.802303,0.806459,0.798637


  _warn_prf(average, modifier, msg_start, len(result))


CPU times: user 8min 4s, sys: 5min 46s, total: 13min 51s
Wall time: 14min 56s


TrainOutput(global_step=1036, training_loss=0.35621351440901)

### Save the model to the local directory

In [None]:
# Save the fine-tuned model under ./trained_models/atsc_bert_ada
trainer_finetuning.save_model('./trained_models/atsc_bert_ada')

In [None]:
# Save the tokenizer files into the same directory as the model
tokenizer_finetuning.save_pretrained('./trained_models/atsc_bert_ada')

('./trained_models/atsc_bert_ada/tokenizer_config.json',
 './trained_models/atsc_bert_ada/special_tokens_map.json',
 './trained_models/atsc_bert_ada/vocab.txt',
 './trained_models/atsc_bert_ada/added_tokens.json')

## Evaluation

### Laptop

In [None]:
# Evaluate on the processed laptop test set
trainer_finetuning.evaluate(test_dataset_laptop)

{'epoch': 7.0,
 'eval_accuracy': 0.7711598746081505,
 'eval_f1': 0.7319566543120478,
 'eval_loss': 0.5228157639503479,
 'eval_precision': 0.7275145809278504,
 'eval_recall': 0.7526614204653908,
 'total_flos': 5569228405246464}

### Restaurants

In [None]:
# Evaluate on the processed restaurants test set
trainer_finetuning.evaluate(test_dataset_restaurants)

{'epoch': 7.0,
 'eval_accuracy': 0.8535714285714285,
 'eval_f1': 0.7819906501451985,
 'eval_loss': 0.3798801302909851,
 'eval_precision': 0.7933680749018416,
 'eval_recall': 0.7745290423861851,
 'total_flos': 5569228405246464}