In [None]:
!pip install transformers

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
%cd 'drive/My Drive/SymbolicMath/'

[Errno 2] No such file or directory: 'drive/My Drive/SymbolicMath/'
/content/drive/My Drive/SymbolicMath


In [12]:
import os
import io
import numpy as np
import sympy as sp
import torch

from src.utils import AttrDict
from src.envs import build_env
from src.model import build_modules

from src.utils import to_cuda
from src.envs.sympy_utils import simplify

from torch.utils.data import DataLoader
from functools import partial

from transformers.models.gpt2.modeling_gpt2 import GPT2Model

import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F

In [13]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [14]:
params = params = AttrDict({

    # environment parameters
    'env_name': 'char_sp',
    'int_base': 10,
    'balanced': False,
    'positive': True,
    'precision': 10,
    'n_variables': 1,
    'n_coefficients': 0,
    'leaf_probs': '0.75,0,0.25,0',
    'max_len': 512,
    'max_int': 5,
    'max_ops': 15,
    'max_ops_G': 15,
    'clean_prefix_expr': True,
    'rewrite_functions': '',
    'tasks': 'prim_fwd',
    'operators': 'add:10,sub:3,mul:10,div:5,sqrt:4,pow2:4,pow3:2,pow4:1,pow5:1,ln:4,exp:4,sin:4,cos:4,tan:4,asin:1,acos:1,atan:1,sinh:1,cosh:1,tanh:1,asinh:1,acosh:1,atanh:1',
})

In [15]:
env = build_env(params)         

In [16]:
def read_data(path):
  with io.open(path, mode='r', encoding='utf-8') as f:
    head = [next(f) for x in range(10000)]
    lines = [line.rstrip().split('|') for line in head]
    data = [xy.split('\t') for _, xy in lines]
    data = [xy for xy in data if len(xy) == 2]
  return data

path = 'prim_fwd.train'
data = read_data(path)
for i in range(len(data)):
    data[i] = tuple([sent.split(" ") for sent in data[i]])
# data[0] would be like : 
# data[0]
# ["sub Y' pow x INT+ 2", 'mul div INT+ 1 INT+ 3 pow x INT+ 3']

In [17]:
def batch_sequences(x, y, env):
      """
      Take as input a list of n sequences (torch.LongTensor vectors) and return
      a tensor of size (slen, n) where slen is the length of the longest
      sentence, and a vector lengths containing the length of each sentence.
      """
      lengths_x = torch.LongTensor([len(s) + 2 for s in x])
      lengths_y = torch.LongTensor([len(s) + 2 for s in y])
      max_length = max(lengths_x.max().item(), lengths_y.max().item())
      sent_x = torch.LongTensor(max_length , lengths_x.size(0)).fill_(env.pad_index)
      sent_y = torch.LongTensor(max_length, lengths_y.size(0)).fill_(env.pad_index)
      assert lengths_x.min().item() > 2
      assert lengths_y.min().item() > 2

      sent_x[0] = env.eos_index
      for i, s in enumerate(x):
          sent_x[1:lengths_x[i] - 1, i].copy_(s)
          sent_x[lengths_x[i] - 1, i] = env.eos_index

      sent_y[0] = env.eos_index
      for i, s in enumerate(y):
          sent_y[1:lengths_y[i] - 1, i].copy_(s)
          sent_y[lengths_y[i] - 1, i] = env.eos_index

      return sent_x, sent_y, max_length

def collate_fn(elements):
    """
    Collate samples into a batch.
    """
    x, y = zip(*elements)
    nb_ops = [sum(int(word in env.OPERATORS) for word in seq) for seq in x]
    x = [torch.LongTensor([env.word2id[w] for w in seq if w in env.word2id]) for seq in x]
    y = [torch.LongTensor([env.word2id[w] for w in seq if w in env.word2id]) for seq in y]
    x, y, length = batch_sequences(x, y, env)
    return (x, length), (y, length), torch.LongTensor(nb_ops)

In [18]:
loader = DataLoader(data, batch_size = 1, shuffle= False, collate_fn=collate_fn)
# loader.dataset

In [19]:
# Go through one loop
counter = 0
for (x, x_len), (y, y_len), nb_ops in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(x, x_len)
  print("Batched Labels:")
  print(y, y_len)
  print("Batched Lengths:")
  print(nb_ops)
  print("")
  break

Iteration 0
Batched Input:
tensor([[ 0],
        [67],
        [79],
        [12],
        [ 0],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1],
        [ 1]]) 12
Batched Labels:
tensor([[ 0],
        [54],
        [47],
        [71],
        [82],
        [71],
        [83],
        [55],
        [12],
        [71],
        [83],
        [ 0]]) 12
Batched Lengths:
tensor([1])



In [20]:
print('batched input shape:', x.shape)
print('batched output shape:', y.shape)
# each column is showing one training example

batched input shape: torch.Size([12, 1])
batched output shape: torch.Size([12, 1])


In [21]:
gpt2 = GPT2Model.from_pretrained('gpt2')
in_layer = nn.Embedding(len(env.word2id), 768)
out_layer = nn.Linear(768, len(env.word2id)) # or flattening or softmax or non-linear !
# [1, 45, 76, 2, 4] = ['sin', 'sub',...]
# sin     sub     cos     add
# 0.9     0.05   0.05       0  -> sin 0.9 =  1/sum(1+45+...)
# 0       0.8     0.2       0 -> sub

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
for name, param in gpt2.named_parameters():
    # freeze all parameters except the layernorm and positional embeddings
    if 'ln' in name or 'wpe' or 'attn' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [23]:
parameters = list(gpt2.parameters()) + list(in_layer.parameters()) + list(out_layer.parameters())
optimizer = torch.optim.Adam(parameters, lr= 1e-4)
loss_fn = nn.CrossEntropyLoss()

In [24]:
for layer in (gpt2, in_layer, out_layer):
    layer.to(device=device)
    layer.train()

In [25]:
batch_size = 1
accuracies = list()
for i in range(1):

  for (x, x_len), (y, y_len), nb_ops in loader:

      x = x.to(device = device)
      y = y.to(device = device)
      x = x.reshape(-1)
      y = y.reshape(-1)

      embeddings = in_layer(x.reshape(1, -1))
      hidden_state = gpt2(inputs_embeds=embeddings).last_hidden_state[:, :]
      logits = out_layer(hidden_state)[0]
      loss = loss_fn(logits, y.reshape(-1))
      accuracies.append((logits.argmax(dim=-1) == y.reshape(-1)).float().mean().item())

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if len(accuracies) % 500 == 0:
          accuracy = sum(accuracies[-500:]) / len(accuracies[-500:])
          print(f'Samples: {len(accuracies)}, Accuracy: {accuracy}')

    
print(f'Final accuracy: {sum(accuracies[-500:]) / len(accuracies[-500:])}')

Samples: 500, Accuracy: 0.27992099096626044
Samples: 1000, Accuracy: 0.3018446253761649
Samples: 1500, Accuracy: 0.32053146590292453
Samples: 2000, Accuracy: 0.33168625304847954
Samples: 2500, Accuracy: 0.3450699903070927
Samples: 3000, Accuracy: 0.3429080805182457
Samples: 3500, Accuracy: 0.33180725771188735
Samples: 4000, Accuracy: 0.34912498396635055
Samples: 4500, Accuracy: 0.3502140007168055
Samples: 5000, Accuracy: 0.34043167004734276
Samples: 5500, Accuracy: 0.34909772196412087
Samples: 6000, Accuracy: 0.35880144103616474
Samples: 6500, Accuracy: 0.37218450343608855
Samples: 7000, Accuracy: 0.36606720093637707
Samples: 7500, Accuracy: 0.3967018135525286
Samples: 8000, Accuracy: 0.37700992634892466
Samples: 8500, Accuracy: 0.3769231962412596
Samples: 9000, Accuracy: 0.3825420660376549
Samples: 9500, Accuracy: 0.4044523888081312
Samples: 10000, Accuracy: 0.4025467271581292
Final accuracy: 0.4025467271581292
