# Dependencies and Installation

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install sacrebleu

In [None]:
!pip install sentencepiece

In [1]:
# Mount Google Drive at /content/drive so the files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd 'drive/My Drive/SymbolicMath/'

/content/drive/My Drive/SymbolicMath


In [3]:
from torch.utils.data import DataLoader
from functools import partial
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
from datasets import Dataset
import pandas as pd
from logging import getLogger
import torch
import os
from datasets import load_dataset, load_metric
import csv
import io
import numpy as np
import sympy as sp
import torch
import random
import sys 
from src.utils import AttrDict
from datasets import load_dataset, load_metric
import sentencepiece
from transformers.models.bert.modeling_bert import BertLayer

# Set up and Loading the Data

In [4]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Configuration handed to build_env() to construct the symbolic-math environment.
# (The original statement assigned `params` twice in a row; once is enough.)
params = AttrDict({

    # environment parameters
    'env_name': 'char_sp',
    'int_base': 10,
    'balanced': False,
    'positive': True,
    'precision': 10,
    'n_variables': 1,
    'n_coefficients': 0,
    'leaf_probs': '0.75,0,0.25,0',
    'max_len': 512,
    'max_int': 5,
    'max_ops': 15,
    'max_ops_G': 15,
    'clean_prefix_expr': True,
    'rewrite_functions': '',
    'tasks': 'prim_fwd',
    'operators': 'add:10,sub:3,mul:10,div:5,sqrt:4,pow2:4,pow3:2,pow4:1,pow5:1,ln:4,exp:4,sin:4,cos:4,tan:4,asin:1,acos:1,atan:1,sinh:1,cosh:1,tanh:1,asinh:1,acosh:1,atanh:1',
})

In [6]:
# Build the project environment from the parameters defined above.
from src.envs import build_env
env = build_env(params)         

In [7]:
def read_data(path, number_of_samples):
  """Read up to ``number_of_samples`` (input, target) pairs from a data file.

  Every line is expected to have the form ``<prefix>|<input><TAB><target>``.
  Only the part after the ``|`` is kept, and it is split on the tab character.

  Args:
    path: Path of the UTF-8 text file to read.
    number_of_samples: Maximum number of lines to consume from the start of
      the file. Files with fewer lines are read completely instead of failing.

  Returns:
    A list of ``[input, target]`` lists. Lines that do not contain exactly one
    ``|`` separator, or whose payload does not split into exactly two
    tab-separated fields, are skipped.
  """
  from itertools import islice

  data = []
  with io.open(path, mode='r', encoding='utf-8') as f:
    # islice stops quietly at end-of-file, whereas calling next(f) a fixed
    # number of times raises StopIteration on short files.
    for line in islice(f, max(number_of_samples, 0)):
      parts = line.rstrip().split('|')
      if len(parts) != 2:
        # Malformed line (e.g. blank): skip it, like malformed pairs below.
        continue
      xy = parts[1].split('\t')
      if len(xy) == 2:
        data.append(xy)
  return data

# Load the first 1000 lines of the training file as [input, target] pairs.
path = "prim_fwd.train" 
data = read_data(path, 1000)

In [8]:
# Peek at the first pair and at its input expression.
print(data[0])
print(data[0][0])

["sub Y' x", 'mul div INT+ 1 INT+ 2 pow x INT+ 2']
sub Y' x


# Pre-Processing the Data
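Here we pre-process our data so that it matches the format used in our reference code, the [Hugging Face Translation Task Example](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb): each sample becomes a `translation` record whose `en` field holds the input expression and whose `ro` field holds the target expression.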

## Training Dataset

In [9]:
# Split the [input, target] pairs into two parallel lists.
train_text = [pair[0] for pair in data]
train_label = [pair[1] for pair in data]

# One record per sample: 'en' carries the input, 'ro' carries the target.
raw_datasets = [{'en': source, 'ro': target} for source, target in zip(train_text, train_label)]

# Show a couple of records as a sanity check.
print(raw_datasets[1])
print(raw_datasets[2]['en'])
print(raw_datasets[2]['ro'])

{'en': "sub Y' mul INT+ 2 x", 'ro': 'pow x INT+ 2'}
sub Y' mul INT+ 2 pow x INT+ 2
mul div INT+ 2 INT+ 3 pow x INT+ 3


In [10]:
# Wrap every record under a 'translation' key and collect the wrapped records
# in a list stored under the 'translation' entry of the dictionary.
raw_datasets_train = {}
for record in raw_datasets:
    raw_datasets_train.setdefault('translation', []).append({'translation': record})

print(raw_datasets_train['translation'][1])

{'translation': {'en': "sub Y' mul INT+ 2 x", 'ro': 'pow x INT+ 2'}}


In [11]:
# Turn the list of {'translation': ...} records into a DataFrame and display it.
df2 = pd.DataFrame.from_dict(raw_datasets_train['translation']) 
df2

Unnamed: 0,translation
0,"{'en': 'sub Y' x', 'ro': 'mul div INT+ 1 INT+ ..."
1,"{'en': 'sub Y' mul INT+ 2 x', 'ro': 'pow x INT..."
2,"{'en': 'sub Y' mul INT+ 2 pow x INT+ 2', 'ro':..."
3,"{'en': 'sub Y' add pow x INT+ 2 mul INT+ 2 x',..."
4,"{'en': 'sub Y' mul INT+ 2 pow x INT+ 3', 'ro':..."
...,...
995,"{'en': 'sub Y' mul x exp INT+ 3', 'ro': 'mul d..."
996,{'en': 'sub Y' mul pow x INT- 1 pow ln x INT- ...
997,"{'en': 'sub Y' mul INT+ 3 pow x INT+ 5', 'ro':..."
998,"{'en': 'sub Y' add INT- 1 mul INT+ 3 x', 'ro':..."


In [12]:
# Convert the DataFrame into a `datasets.Dataset`, then print the dataset
# summary, one full row, and that row's 'translation' record.
train_dataset = Dataset.from_pandas(df2)
print(train_dataset)
print('******************')
print(train_dataset[9])
print('******************')
print(train_dataset[9]['translation'])

Dataset({
    features: ['translation'],
    num_rows: 1000
})
******************
{'translation': {'en': "sub Y' sqrt x", 'ro': 'mul div INT+ 2 INT+ 3 pow x div INT+ 3 INT+ 2'}}
******************
{'en': "sub Y' sqrt x", 'ro': 'mul div INT+ 2 INT+ 3 pow x div INT+ 3 INT+ 2'}


## Validation Dataset

In [13]:
# Load the first 1000 lines of the validation file as [input, target] pairs.
path2 = "prim_fwd.valid"
data2 = read_data(path2, 1000)

# Split the pairs into parallel lists, then build one 'en'/'ro' record per sample.
valid_text = [pair[0] for pair in data2]
valid_label = [pair[1] for pair in data2]
raw_datasets1 = [{'en': source, 'ro': target} for source, target in zip(valid_text, valid_label)]

In [15]:
# Wrap every validation record under a 'translation' key, mirroring the layout
# used for the training records.
raw_datasets_vlaid = {}
for record in raw_datasets1:
    raw_datasets_vlaid.setdefault('translation', []).append({'translation': record})

# DataFrame -> datasets.Dataset, same conversion as for the training split.
df_valid = pd.DataFrame.from_dict(raw_datasets_vlaid['translation'])
valid_dataset = Dataset.from_pandas(df_valid)

# Tokenizing the Data

In [17]:
from transformers import AutoTokenizer

# Pretrained checkpoint that provides both the tokenizer and the model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ro"
# Metric object used by compute_metrics() to score generated sequences.
metric = load_metric("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

# mBART checkpoints need explicit source/target language codes.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "en-XX"
    tokenizer.tgt_lang = "ro-RO"
# T5 checkpoints expect a task prefix in front of every input.
# ("t5-large" was misspelled "t5-larg", so that checkpoint never got the prefix.)
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate English to Romanian: "
else:
    prefix = ""

# Create the Final Data Set

In [21]:
# Group both splits under the names used by the following cells.
datasetM = dict(train=train_dataset, validation=valid_dataset)
datasetM

{'train': Dataset({
     features: ['translation'],
     num_rows: 1000
 }), 'validation': Dataset({
     features: ['translation'],
     num_rows: 1000
 })}

In [22]:
# Length caps passed as `max_length` (with truncation enabled) when tokenizing
# the inputs and the targets respectively.
max_input_length = 128
max_target_length = 128
# Keys of each translation record: the "en" field is read as the source text
# and the "ro" field as the target text.
source_lang = "en"
target_lang = "ro"
def preprocess_function_new(examples):
    """Tokenize a batch of translation records for seq2seq training.

    Args:
        examples: Batch with a "translation" entry holding records keyed by
            ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output for the (prefixed) source texts, extended with a
        "labels" entry containing the token ids of the target texts.
    """
    records = examples["translation"]
    source_texts = [prefix + record[source_lang] for record in records]
    target_texts = [record[target_lang] for record in records]

    batch = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_batch = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    batch["labels"] = target_batch["input_ids"]
    return batch

In [23]:
# Tokenize the training split in batches and display the resulting dataset.
tokenized_datasets_train = datasetM['train'].map(preprocess_function_new, batched=True)
tokenized_datasets_train

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 1000
})

In [24]:
# Tokenize the validation split in batches and display the resulting dataset.
tokenized_datasets_valid = datasetM['validation'].map(preprocess_function_new, batched=True)
tokenized_datasets_valid

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 1000
})

# Fine-tuning the Model

In [25]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the selected checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=300887193.0, style=ProgressStyle(descri…




In [26]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
args = Seq2SeqTrainingArguments(
    "test-translation",  # output directory
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally breaks
    # the CPU fallback that this notebook otherwise supports.
    fp16=torch.cuda.is_available(),
)

In [27]:
# Build the seq2seq data collator from the tokenizer and the model; it is
# handed to the trainer as its `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [28]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a list
        in which every stripped reference is wrapped in its own one-item list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Score generated sequences against the reference labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            the predictions arrive as a tuple, only its first item is used.

    Returns:
        A dict with "bleu" (the metric's score) and "gen_len" (mean count of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]
    pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # -100 entries in the labels cannot be decoded, so swap them for the pad id.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    ref_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    # Whitespace clean-up and reference wrapping.
    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

    bleu = metric.compute(predictions=pred_texts, references=ref_texts)
    scores = {"bleu": bleu["score"]}

    lengths = [np.count_nonzero(row != tokenizer.pad_token_id) for row in generated]
    scores["gen_len"] = np.mean(lengths)

    return {key: round(value, 4) for key, value in scores.items()}

In [29]:
# Assemble the trainer from the model, the training arguments, the tokenized
# train/validation splits, the collator, the tokenizer and the metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
# Launch fine-tuning with the configured trainer.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.815863,30.3799,29.747
2,No log,0.702664,36.2071,33.34
3,No log,0.647983,42.2343,37.793
4,No log,0.641,41.9138,37.148
5,No log,0.63091,48.4854,42.412
6,No log,0.618091,46.9527,40.117
7,No log,0.609831,47.5092,40.411
8,0.457200,0.605447,49.5232,43.131
9,0.457200,0.6009,47.4455,40.602
10,0.457200,0.60243,47.3672,40.221


TrainOutput(global_step=630, training_loss=0.41710478767516124, metrics={'train_runtime': 2053.4235, 'train_samples_per_second': 0.307, 'total_flos': 94027380940800.0, 'epoch': 10.0, 'init_mem_cpu_alloc_delta': 1715433472, 'init_mem_gpu_alloc_delta': 300833792, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 145072128, 'train_mem_gpu_alloc_delta': 899699200, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 2672644608})

## Encoding and Decoding
This part will be completed soon.

In [None]:
# Input expression ('en' field) of the first training sample.
text = tokenized_datasets_train['translation'][0]['en']

In [None]:
# Encode the input expression as a PyTorch tensor of token ids.
input_ids = tokenizer.encode(text, return_tensors="pt")

In [None]:
# Move the input to whichever device the model lives on, instead of assuming
# CUDA, so generation also works when the model is on the CPU.
outputs = model.generate(input_ids.to(model.device))
# Decode the first generated sequence back to text without special tokens.
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Show the generated expression.
print(decoded)

add mul div INT+ 1 INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 mul div INT+ 2 pow x


In [None]:
# Reference target ('ro' field) of the same training sample whose input was
# encoded above. The original name `tokenized_datasetsll2` is never defined.
actual = tokenized_datasets_train['translation'][0]['ro']

In [None]:
from src.envs.sympy_utils import simplify

In [None]:
# `decoded` and `actual` are space-separated prefix strings, so they cannot be
# subtracted directly (str - str raises TypeError); convert both to SymPy first.
# NOTE(review): assumes `env` exposes prefix_to_infix(list_of_tokens) and
# infix_to_sympy(infix_string), as the SymbolicMathematics environment does --
# confirm against src/envs.
def _prefix_to_sympy(prefix_expr):
    """Convert a space-separated prefix expression into a SymPy expression."""
    return env.infix_to_sympy(env.prefix_to_infix(prefix_expr.split()))

try:
    difference = _prefix_to_sympy(decoded) - _prefix_to_sympy(actual)
    res = "OK" if simplify(difference, seconds=1) == 0 else "NO"
except Exception as err:
    # A generated sequence that is not a well-formed expression cannot match
    # the reference; report why instead of aborting the notebook.
    print(f"Could not compare expressions: {err}")
    res = "NO"
print(res)