# Solving Machine Learning Problems

In [1]:
!pip install 'sentencepiece==0.1.91' --force-reinstall
!pip install transformers
!pip install dgl
!pip install torchvision

Collecting sentencepiece==0.1.91
  Using cached sentencepiece-0.1.91-cp38-cp38-manylinux1_x86_64.whl (1.1 MB)
Installing collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.91
    Uninstalling sentencepiece-0.1.91:
      Successfully uninstalled sentencepiece-0.1.91
Successfully installed sentencepiece-0.1.91


In [2]:
from transformers import T5Tokenizer, T5Model

In [3]:
## Needs: pytorch, dgl, transformers, Python>=3.7

from copy import copy
from tqdm import tqdm, trange
import itertools
import importlib

import random
import math
import time
import numpy as np
import pickle
from interruptingcow import timeout, Quota

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
from dgl.nn import GraphConv
false = False
true = True
NaN = float("NaN")
import matplotlib.pyplot as plt
from collections import Counter

import util
importlib.reload(util)
from util import setup, check_match, evaluate_prefix_expression, sub_nP, get_quant_cells

Using backend: pytorch


# Converting Inputs to Torch Tensors

In [4]:
def tensorize_data(train_data, test_data):
#     print(f"Number of items: {len(train_data)+len(test_data)}")
    for d in tqdm(itertools.chain(train_data, test_data)):
        d['in_idxs'] = torch.tensor([in_vocab.token2idx.get(x, in_vocab.unk) for x in d['in_tokens']])
        d['out_idxs'] = torch.tensor([out_vocab.token2idx.get(x, out_vocab.unk) for x in d['out_tokens']])
        d['n_in'] = n_in = len(d['in_idxs'])
        d['n_out'] = len(d['out_idxs'])
        d['n_nP'] = n_nP = len(d['nP'])
        d['nP_in_mask'] = mask = torch.zeros(n_in, dtype=torch.bool)
        mask[d['nP_positions']] = True
        d['nP_out_mask'] = mask = torch.zeros(n_max_nP, dtype=torch.bool)
        mask[:n_nP] = True
        d['qcomp_edges'] = get_quantity_comparison_edges(d)
        d['qcell_edges'] = get_quantity_cell_edges(d)

def get_quantity_comparison_edges(d):
    quants = [float(x) for x in d['nP']]
    quant_positions = d['nP_positions']
#     assert max(quant_positions) < d['n_in']
    adj_matrix = torch.eye(d['n_in'], dtype=np.bool)
    for x, x_pos in zip(quants, quant_positions):
        for y, y_pos in zip(quants, quant_positions):
            adj_matrix[x_pos, y_pos] |= x > y
    """
    Convert the adjacency matrix of the directed graph into a tuple of (src_edges, dst_edges), which
    is the input format of dgl.graph (see https://docs.dgl.ai/generated/dgl.graph.html).
    Hint: check out the 'nonzero' function
    """
    return adj_matrix.nonzero(as_tuple=True)

def get_quantity_cell_edges(d):
    in_idxs = d['in_idxs']
    quant_positions = d['nP_positions']
    quant_cell_positions = d['quant_cell_positions']
    assert max(quant_cell_positions) < d['n_in']
    word_cells = set(quant_cell_positions) - set(quant_positions)
    adj_matrix = torch.eye(d['n_in'], dtype=torch.bool)
    for w_pos in word_cells:
        for q_pos in quant_positions:
            if abs(w_pos - q_pos) < 4:
                adj_matrix[w_pos, q_pos] = adj_matrix[q_pos, w_pos] = True
    pos_idxs = in_idxs[quant_cell_positions]
    for idx1, pos1 in zip(pos_idxs, quant_cell_positions):
        for idx2, pos2 in zip(pos_idxs, quant_cell_positions):
            if idx1 == idx2:
                adj_matrix[pos1, pos2] = adj_matrix[pos2, pos1] = True
    """
    Convert the adjacency matrix of the directed graph into a tuple of (src_edges, dst_edges), which
    is the input format of dgl.graph (see https://docs.dgl.ai/generated/dgl.graph.html).
    Hint: check out the 'nonzero' function
    """
    return adj_matrix.nonzero(as_tuple=True)

# Model

In [106]:
class TransformerAttention(nn.Module):
    """
    Used in Transformer Block
    """
    def __init__(self):
        super().__init__()
        self.qkv = nn.Linear(n_hid, n_head * (n_k * 2 + n_v))
        self.out = nn.Linear(n_head * n_v, n_hid)

    def forward(self, x, mask=None):
        n_batch, n_batch_max_in, n_hid = x.shape
        q_k_v = self.qkv(x).view(n_batch, n_batch_max_in, n_head, 2 * n_k + n_v).transpose(1, 2)
        q, k, v = q_k_v.split([n_k, n_k, n_v], dim=-1)

        q = q.reshape(n_batch * n_head, n_batch_max_in, n_k)
        k = k.reshape_as(q).transpose(1, 2)
        qk = q.bmm(k) / np.sqrt(n_k)

        if mask is not None:
            qk = qk.view(n_batch, n_head, n_batch_max_in, n_batch_max_in).transpose(1, 2)
            qk[~mask] = -np.inf
            qk = qk.transpose(1, 2).view(n_batch * n_head, n_batch_max_in, n_batch_max_in)
        qk = qk.softmax(dim=-1)
        v = v.reshape(n_batch * n_head, n_batch_max_in, n_v)
        qkv = qk.bmm(v).view(n_batch, n_head, n_batch_max_in, n_v).transpose(1, 2).reshape(n_batch, n_batch_max_in, n_head * n_v)
        out = self.out(qkv)
        return x + out

class TransformerBlock(nn.Module):
    """
    Custom Transformer
    """
    def __init__(self):
        super().__init__()
        self.attn = TransformerAttention()
        n_inner = n_hid * 4
        self.inner = nn.Sequential(
            nn.Linear(n_hid, n_inner),
            nn.ReLU(inplace=True),
            nn.Linear(n_inner, n_hid)
        )

    def forward(self, x, mask=None):
        x = x + self.attn(x, mask=mask)
        return x + self.inner(x)
    
class GCNBranch(nn.Module):
    def __init__(self, n_hid_in, n_hid_out, dropout=0.3):
        super().__init__()
        """
        Define a branch of the graph convolution with
        1. GraphConv from n_hid_in to n_hid_in
        2. ReLU
        3. Dropout
        4. GraphConv from n_hid_in to n_hid_out
        
        Note: your should call GraphConv with allow_zero_in_degree=True
        """
        self.gc1 = GraphConv(n_hid_in, n_hid_in, allow_zero_in_degree=True)
        self.drelu = nn.Sequential(
            nn.ReLU(inplace=True),
            nn.Dropout(dropout)
        )
        self.gc2 = GraphConv(n_hid_in, n_hid_out, allow_zero_in_degree=True)

    def forward(self, x, graph):
        """
        Forward pass of your defined branch above
        """
        return self.gc2(graph, self.drelu(self.gc1(graph, x)))

class GCN(nn.Module):
    def __init__(self, n_head=4, dropout=0.3):
        super().__init__()
        self.branches = nn.ModuleList(GCNBranch(n_hid, n_hid // n_head, dropout) for _ in range(n_head))

        self.feed_forward = nn.Sequential(
            nn.Linear(n_hid, n_hid),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(n_hid, n_hid)
        )
        self.layer_norm = nn.LayerNorm(n_hid)

    def forward(self, h, gt_graph, attr_graph):
        x = h.reshape(-1, n_hid)
        graphs = [gt_graph, gt_graph, attr_graph, attr_graph]
        x = torch.cat([branch(x, g) for branch, g in zip(self.branches, graphs)], dim=-1).view_as(h)
        x = h + self.layer_norm(x)
        return x + self.feed_forward(x)

class Gate(nn.Module):
    def __init__(self, n_in, n_out):
        super(Gate, self).__init__()
        self.t = nn.Linear(n_in, n_out)
        self.s = nn.Linear(n_in, n_out)

    def forward(self, x):
        return self.t(x).tanh() * self.s(x).sigmoid()

class TreeDecoder(nn.Module):
    def __init__(self, dropout=0.5):
        super().__init__()
        drop = nn.Dropout(dropout)
        self.constant_embedding = nn.Parameter(torch.randn(1, out_vocab.n_constants, n_hid))

        self.qp_gate = nn.Sequential(drop, Gate(n_hid, n_hid))
        self.gts_right = nn.Sequential(drop, Gate(2 * n_hid, n_hid))

        self.attn_fc = nn.Sequential(drop,
            nn.Linear(2 * n_hid, n_hid),
            nn.Tanh(),
            nn.Linear(n_hid, 1)
        )
        self.quant_fc = nn.Sequential(drop,
            nn.Linear(n_hid * 3, n_hid),
            nn.Tanh(),
            nn.Linear(n_hid, 1, bias=False)
        )
        self.op_fc = nn.Sequential(drop, nn.Linear(n_hid * 2, out_vocab.n_ops))

        self.op_embedding = nn.Embedding(out_vocab.n_ops + 1, n_hid, padding_idx=out_vocab.n_ops)
        self.gts_left = nn.Sequential(drop, Gate(n_hid * 2 + n_hid, n_hid))
        self.gts_left_qp = nn.Sequential(drop, Gate(n_hid * 2 + n_hid, n_hid), self.qp_gate)

        self.subtree_gate = nn.Sequential(drop, Gate(n_hid * 2 + n_hid, n_hid))

    def gts_attention(self, q, zbar, in_mask=None):
        attn_score = self.attn_fc(
            torch.cat([q.unsqueeze(1).expand_as(zbar), zbar], dim=2)
        ).squeeze(2)
        if in_mask is not None:
            attn_score[~in_mask] = -np.inf
        attn = attn_score.softmax(dim=1)
        return (attn.unsqueeze(1) @ zbar).squeeze(1) # (n_batch, n_hid)

    def gts_predict(self, qp_Gc, quant_embed, nP_out_mask=None):
        quant_score = self.quant_fc(
            torch.cat([qp_Gc.unsqueeze(1).expand(-1, quant_embed.size(1), -1), quant_embed], dim=2)
        ).squeeze(2)
        op_score = self.op_fc(qp_Gc)
        pred_score = torch.cat((op_score, quant_score), dim=1)
        if nP_out_mask is not None:
            pred_score[:, out_vocab.base_nP:][~nP_out_mask] = -np.inf
        return pred_score

    def merge_subtree(self, op, tl, yr):
        return self.subtree_gate(torch.cat((op, tl, yr), dim=-1))

class Model(nn.Module):
    def __init__(self, dropout=0.5):
        super().__init__()
        drop = nn.Dropout(dropout)

        if use_t5:
            """
            Use t5_model.encoder as the encoder for this model. Note that unlike the custom transformer, you don't
            need to use an external positional embedding for the T5 transformer (i.e. don't define self.pos_emb)
            
            You may specify layer weights to freeze during finetuning by modifying the freeze_layers global variable
            """
            self.t5_encoder = t5_model.encoder
            
            for i_layer, block in enumerate(self.t5_encoder.block):
                if i_layer in freeze_layers:
                    for param in block.parameters():
                        param.requires_grad = False
        else:
            self.in_embed = nn.Sequential(nn.Embedding(in_vocab.n, n_hid, padding_idx=in_vocab.pad), drop)
            self.pos_embed = nn.Embedding(1 + n_max_in, n_hid) # Use the first position as global vector
            self.transformer_layers = nn.ModuleList(TransformerBlock() for _ in range(n_layers))

        self.gcn = GCN()

        self.decoder = TreeDecoder()

        if not use_t5:
            self.apply(self.init_weight)

    def init_weight(self, m):
        if type(m) in [nn.Embedding]:
            nn.init.normal_(m.weight, 0, 0.1)

    def encode(self, in_idxs, n_in, gt_graph, attr_graph, in_mask=None):
        in_idxs_pad = F.pad(in_idxs, (1, 0), value=in_vocab.pad)
        if use_t5:
            """
            Call your T5 encoder
            """
#             h, = self.t5_encoder(in_idxs_pad)
            h = self.t5_encoder(in_idxs_pad).last_hidden_state
        else:
            x = self.in_embed(in_idxs_pad) # (n_batch, n_batch_max_in, n_hid)
            h = x + self.pos_embed(torch.arange(x.size(1), device=x.device))
            for layer in self.transformer_layers:
                h = layer(h, mask=in_mask)
                
        zg, h = h[:, 0], h[:, 1:]
        zbar = self.gcn(h, gt_graph, attr_graph)
        return zbar, zg

# Training a Batch

In [105]:
class Node:
    def __init__(self, up):
        self.up = up
        self.is_root = up is None
        self.left = self.right = None
        self.ql = self.tl = self.op = None

def train(batch, model, opt):
    n_batch = len(batch)

    n_in = [d['n_in'] for d in batch]
    pad = lambda x, value: nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=value)
    in_idxs = pad([d['in_idxs'] for d in batch], in_vocab.pad).to(device)
    in_mask = pad([torch.ones(n, dtype=torch.bool) for n in n_in], False).to(device)
    nP_in_mask = pad([d['nP_in_mask'] for d in batch], False).to(device)
    nP_out_mask = torch.stack([d['nP_out_mask'] for d in batch]).to(device)
    
    qcomp_graph, qcell_graph = [], []
    for d in batch:
        """
        Create qcomp_graph and qcell_graph from d['qcomp_edges'] and d['qcell_edges'] by calling dgl.graph
        (see https://docs.dgl.ai/generated/dgl.graph.html)

        Note that num_nodes needs to be set to the maximum input length in this batch
        """
        qcomp_graph_i = dgl.graph(d['qcomp_edges'], num_nodes=in_idxs.size(1), device=device)
        qcell_graph_i = dgl.graph(d['qcell_edges'], num_nodes=in_idxs.size(1), device=device)
        
        qcomp_graph.append(qcomp_graph_i)
        qcell_graph.append(qcell_graph_i)
    qcomp_graph = dgl.batch(qcomp_graph)
    qcell_graph = dgl.batch(qcell_graph)
    
    label = pad([d['out_idxs'] for d in batch], out_vocab.pad)
    nP_candidates = [d['nP_candidates'] for d in batch]

    
    zbar, qroot = model.encode(in_idxs, n_in, qcomp_graph, qcell_graph, in_mask=None)
    z_nP = zbar.new_zeros((n_batch, n_max_nP, n_hid))
    z_nP[nP_out_mask] = zbar[nP_in_mask]

    decoder = model.decoder

    n_quant = out_vocab.n_constants + n_max_nP
    quant_embed = torch.cat([decoder.constant_embedding.expand(n_batch, -1, -1), z_nP], dim=1) # (n_batch, n_quant, n_hid)

    nodes = np.array([Node(None) for _ in range(n_batch)])
    op_min, op_max = out_vocab.base_op, out_vocab.base_op + out_vocab.n_ops
    quant_min, quant_max = out_vocab.base_quant, out_vocab.base_quant + n_quant

    # Initialize root node vector according to zg (the global context)
    qp = decoder.qp_gate(qroot)
    scores = []
    for i, label_i in enumerate(label.T): # Iterate over the output positions
        Gc = decoder.gts_attention(qp, zbar, in_mask)
        qp_Gc = torch.cat([qp, Gc], dim=1) # (n_batch, 2 * n_hid)

        score = decoder.gts_predict(qp_Gc, quant_embed, nP_out_mask)
        scores.append(score)

        # Whether the label is an operator
        is_op = (op_min <= label_i) & (label_i < op_max)
        # Whether the label is a quantity
        is_quant = ((quant_min <= label_i) & (label_i < quant_max)) | (label_i == out_vocab.unk)

        op_embed = decoder.op_embedding((label_i[is_op] - out_vocab.base_op).to(device))
        qp_Gc_op = torch.cat([qp_Gc[is_op], op_embed], dim=1)

        is_left = np.zeros(n_batch, dtype=np.bool)
        qleft_qp = decoder.gts_left_qp(qp_Gc_op)
        qleft = decoder.gts_left(qp_Gc_op)
        for j, ql, op in zip(is_op.nonzero(as_tuple=True)[0], qleft, op_embed):
            node = nodes[j]
            nodes[j] = node.left = Node(node)
            node.op = op
            node.ql = ql
            is_left[j] = True

        is_right = np.zeros(n_batch, dtype=np.bool)
        nP_score = score[:, out_vocab.base_nP:].detach().cpu()
        ql_tl = []
        for j in is_quant.nonzero(as_tuple=True)[0]:
            if label_i[j] == out_vocab.unk:
                candidates = nP_candidates[j][i]
#                 label_i[j] = out_vocab.base_nP + candidates[nP_score[j, candidates].argmax()]
                label_i[j] = torch.from_numpy(np.array(out_vocab.base_nP + candidates[nP_score[j, candidates].argmax()])).to(label_i)

            node = nodes[j]
            pnode = node.up
            t = quant_embed[j, label_i[j] - out_vocab.base_quant]
            while pnode and pnode.right is node:
                t = decoder.merge_subtree(pnode.op, pnode.tl, t) # merge operator, left subtree, and right child
                node, pnode = pnode, pnode.up # backtrack to parent node
            if pnode is None: # Finished traversing tree of j
                continue
            # Now pnode.left is node. t is the tl representing the left subtree of pnode
            pnode.tl = t
            ql_tl.append(torch.cat([pnode.ql, pnode.tl])) # For computing qright
            nodes[j] = pnode.right = Node(pnode)
            is_right[j] = True

        qp = torch.zeros((n_batch, n_hid), device=device)
        qp[is_left] = qleft_qp
        if ql_tl:
            qp[is_right] = decoder.gts_right(torch.stack(ql_tl))

    label = label.to(device).view(-1)
    scores = torch.stack(scores, dim=1).view(-1, out_vocab.n_ops + n_quant)
    loss = F.cross_entropy(scores, label, ignore_index=out_vocab.pad)

    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()

# Evaluation

In [7]:
class BeamNode(Node):
    def __init__(self, up, prev, qp, token=None):
        super().__init__(up)
        self.prev = prev
        self.qp = qp
        self.token = token

    def trace_tokens(self, *last_token):
        if self.prev is None:
            return list(last_token)
        tokens = self.prev.trace_tokens()
        tokens.append(self.token)
        tokens.extend(last_token)
        return tokens

def evaluate(d, model, beam_size=5, n_max_out=45):
    in_idxs = d['in_idxs'].unsqueeze(0).to(device=device)
    """
    Create qcomp_graph and qcell_graph from d['qcomp_edges'] and d['qcell_edges'] by calling dgl.graph
    (see https://docs.dgl.ai/generated/dgl.graph.html)
    """
#     qcomp_graph = dgl.graph(d['gt_edges'], device=device)
#     qcell_graph = dgl.graph(d['attr_edges'], device=device)
    qcomp_graph = dgl.graph(d['qcomp_edges'], device=device)
    qcell_graph = dgl.graph(d['qcell_edges'], device=device)

    zbar, qroot = model.encode(in_idxs, [d['n_in']], qcomp_graph, qcell_graph)
    z_nP = zbar[:, d['nP_positions']]

    decoder = model.decoder

    quant_embed = torch.cat([decoder.constant_embedding, z_nP], dim=1) # (1, n_quant, n_hid)
    op_min, op_max = out_vocab.base_op, out_vocab.base_op + out_vocab.n_ops

    best_done_beam = (-np.inf, None, None)
    beams = [(0, BeamNode(up=None, prev=None, qp=decoder.qp_gate(qroot)))]
    for _ in range(n_max_out):
        new_beams = []
        for logp_prev, node in beams:
            Gc = decoder.gts_attention(node.qp, zbar)
            qp_Gc = torch.cat([node.qp, Gc], dim=1) # (2 * n_hid,)

            log_prob = decoder.gts_predict(qp_Gc, quant_embed).log_softmax(dim=1)
            top_logps, top_tokens = log_prob.topk(beam_size, dim=1)
            for logp_token_, out_token_ in zip(top_logps.unbind(dim=1), top_tokens.unbind(dim=1)):
                out_token = out_token_.item()
                logp = logp_prev + logp_token_.item()
                if op_min <= out_token < op_max:
                    op_embed = decoder.op_embedding(out_token_)
                    qp_Gc_op = torch.cat([qp_Gc, op_embed], dim=1)
                    prev_node = copy(node)
                    next_node = prev_node.left = BeamNode(
                        up=prev_node, prev=prev_node,
                        qp=decoder.gts_left_qp(qp_Gc_op),
                        token=out_token
                    )
                    prev_node.op = op_embed
                    prev_node.ql = decoder.gts_left(qp_Gc_op)
                else:
                    pnode, prev_node = node.up, node
                    t = quant_embed[:, out_token - out_vocab.base_quant]
                    while pnode and pnode.tl is not None:
                        t = decoder.merge_subtree(pnode.op, pnode.tl, t)
                        node, pnode = pnode, pnode.up
                    if pnode is None:
                        best_done_beam = max(best_done_beam, (logp, prev_node, out_token))
                        continue
                    pnode = copy(pnode)
                    pnode.tl = t
                    next_node = pnode.right = BeamNode(
                        up=pnode, prev=prev_node,
                        qp=decoder.gts_right(torch.cat([pnode.ql, pnode.tl], dim=1)),
                        token=out_token
                    )
                new_beams.append((logp, next_node))
        beams = sorted(new_beams, key=lambda x: x[0], reverse=True)[:beam_size]
        done_logp, done_node, done_last_token = best_done_beam
        if not len(beams) or done_logp >= beams[0][0]:
            break
    return done_node.trace_tokens(done_last_token)

# Scoring

In [50]:
with open("data/question-to-topic-cleaned.json", "r") as f:
    question_to_topic = eval(f.readline())

In [11]:
delete_chars = {',', '.', ' ', 'negative', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
def cleaner(q):
    for char in delete_chars:
        q = q.replace(char, '')
    return q
    
def score_model(model, test_data):
    model.eval()
    value_match, equation_match = [], []
    with torch.no_grad():
        for d in tqdm(test_data):
            val_match = eq_match = False
            if not d['is_quadratic']: # This method is not equiped to handle equations with quadratics
                try:
                    pred = evaluate(d, model)
                    d['pred_tokens'] = [out_vocab.idx2token[idx] for idx in pred]
                    val_match, eq_match = check_match(pred, d)
                except:
                    print("pred:", pred, "\nd[processed_question]:", d['processed_question'])
            value_match.append(val_match)
            equation_match.append(eq_match)
    print(f'Test equation accuracy: {np.mean(equation_match):.3g}')
    print(f'Test value accuracy: {np.mean(value_match):.3g}')
    
def score_model_detailed(model, test_data):
    model.eval()
    value_match, equation_match = [], []
    correct_val_topics = []
    correct_eqn_topics = []
    correct_questions_val = {} # topic -> (question, answer)
    incorrect_questions_val = {} # topic -> (question, answer)
    topic_count_in_test = {}
    with torch.no_grad():
        for d in tqdm(test_data):
            val_match = eq_match = False
            pred_val = None
            if not d['is_quadratic']: # This method is not equiped to handle equations with quadratics
                try:
                    pred = evaluate(d, model)
                    d['pred_tokens'] = [out_vocab.idx2token[idx] for idx in pred]
                    val_match, eq_match = check_match(pred, d)
                    pred_val = sub_nP(d['pred_tokens'], d['nP'])
                except:
                    print("pred:", pred, "\nd[processed_question]:", d['processed_question'])
            value_match.append(val_match)
            equation_match.append(eq_match)
            
            cleaned_question_topic = question_to_topic[cleaner(d['processed_question'])]
            topic_count_in_test[cleaned_question_topic] = topic_count_in_test.get(cleaned_question_topic, 0) + 1
            if val_match:
                correct_val_topics.append(cleaned_question_topic)
                correct_questions_val[cleaned_question_topic] = correct_questions_val.get(cleaned_question_topic, []) + [(d['processed_question'], evaluate_prefix_expression(pred_val))]
            else:
                try:
                    incorrect_questions_val[cleaned_question_topic] = incorrect_questions_val.get(cleaned_question_topic, []) + [(d['processed_question'], evaluate_prefix_expression(pred_val))]
                except:
                    incorrect_questions_val[cleaned_question_topic] = incorrect_questions_val.get(cleaned_question_topic, []) + [(d['processed_question'], pred_val)]
            if eq_match:
                correct_eqn_topics.append(cleaned_question_topic)
                
    print(f'Test equation accuracy: {np.mean(equation_match):.3g}')
    correct_eqn_topics = Counter(correct_eqn_topics)
    correct_eqn_percents = {topic: questions_correct/topic_count_in_test[topic] for topic,questions_correct in correct_eqn_topics.items()}
    print(f'Test equation accuracy per topic: {correct_eqn_percents}')
          
    print(f'Test value accuracy: {np.mean(value_match):.3g}')
    correct_val_topics = Counter(correct_val_topics)
    correct_val_percents = {topic: questions_correct/topic_count_in_test[topic] for topic,questions_correct in correct_val_topics.items()}
    print(f'Test value accuracy per topic: {correct_val_percents}')
    
    return correct_questions_val, incorrect_questions_val
    
    
def score_model_single_input_fr(model, question):
    model.eval()
    if question[-1] not in {'.', '?'}:
        question += "?"
    d = [{"expression": "", "quant_cell_positions": [i for i in range(len(question.split(" ")))], "processed_question": question, "raw_question": question}]
    _, input_question_data, _, _, _, _ = setup(use_t5, test_split=1, data=d)
    tensorize_data([], input_question_data)
    input_question = input_question_data[0]
    pred = evaluate(input_question, model)
    d[0]['pred_tokens'] = [out_vocab.idx2token[idx] for idx in pred]
    parse_tree = sub_nP(d[0]['pred_tokens'], d[0]['nP'])
    return str(evaluate_prefix_expression(parse_tree)), parse_tree

def score_model_single_input_mc(model, question, solution_tree, answers_generated=20, num_choices=4, verbose=True):
    model.eval()
    if question[-1] not in {'.', '?'}:
        question += "?"
    d = [{"expression": "", "quant_cell_positions": [i for i in range(len(question.split(" ")))], "processed_question": question, "raw_question": question}]
    _, input_question_data, _, _, _, _ = setup(use_t5, test_split=1, data=d)
    tensorize_data([], input_question_data)
    input_question = input_question_data[0]
    
    result = []
    for _ in range(answers_generated):
        try:
    #         print(11)
    #         print(d)
            pred = evaluate(d[0], model)
    #         print(1, pred)
            res = sub_nP([out_vocab.idx2token[idx] for idx in pred], d[0]['nP'])
    #         print(2, res)
            res = evaluate_prefix_expression(res)
    #         print(3, res)
            result.append(str(res))
        except Exception as e:
            print(e)
            pass
    counts = Counter(result)
    counts = sorted(counts.items(), key=lambda x: eval(x[0]))
    preds = [eval(elt[0]) for elt in sorted(counts, key=lambda x: (x[1], random.random()), reverse=True)][:min(len(counts),num_choices)]
    
    correct_tree = sub_nP(solution_tree, d[0]['nP'])
    # generate answer choices
    answers, correct_answer = generate_choices(correct_tree, num_choices)
    
    if verbose:
        print(f"Correct answer: {correct_answer}")
        print(f"Answers: {answers}")
    
    # have model make decision
    for try_number in range(num_choices):
        chosen_answer = random.choice(answers)
        for pred in preds:
            if pred in answers:
                chosen_answer = pred
                break # if not chosen after iterating over all preds, keep the chosen one at random
        del answers[answers.index(chosen_answer)]
        if verbose:
            print(f"Try #{try_number+1}: {chosen_answer}")
        if chosen_answer == correct_answer:
            return (num_choices - try_number - 1)/(num_choices-1), try_number + 1
            
            
#     pred = evaluate(input_question, model)
#     d[0]['pred_tokens'] = [out_vocab.idx2token[idx] for idx in pred]
#     parse_tree = sub_nP(d[0]['pred_tokens'], d[0]['nP'])
#     return str(evaluate_prefix_expression(parse_tree)), parse_tree
        
    
def score_model_ranking_multiple_choice(model, test_data, num_tries=2, answers_generated=20, num_choices=4):
    model.eval()
    value_match = []
    tries = []
    tries_per_topic = {}
    topic_count_in_test = {}
    i = 0
    with torch.no_grad():
        for d in tqdm(test_data):
            try:
                with timeout(5, exception = RuntimeError):
                    if d['is_quadratic']: # This method is not equiped to handle equations with quadratics
                        val_match = eq_match = False
                        try_number = num_tries - 1
                    else:
                        # generate responses
                        result = []
                        for _ in range(answers_generated):
                            try:
                                pred = evaluate(d, model)
                                res = sub_nP([out_vocab.idx2token[idx] for idx in pred], d['nP'])
                                res = evaluate_prefix_expression(res)
                                result.append(str(res))
                            except:
                                pass
                        counts = Counter(result)
                        counts = sorted(counts.items(), key=lambda x: eval(x[0]))
                        preds = [eval(elt[0]) for elt in sorted(counts, key=lambda x: (x[1], random.random()), reverse=True)][:min(len(counts),num_tries)]

                        # generate answer choices
                        val_match = 0
                        try:
                            correct_tree = sub_nP(d['out_tokens'], d['nP'])
                            answers, correct_answer = generate_choices(correct_tree, num_choices)

                            # have model make decision
                            for try_number in range(len(preds)):
                                chosen_answer = random.choice(answers)
                                for pred in preds:
                                    if pred in answers:
                                        chosen_answer = pred
                                        break # if not chosen after iterating over all preds, keep the chosen one at random
                                del answers[answers.index(chosen_answer)]
                                if chosen_answer == correct_answer:
                                    val_match = (num_choices - try_number - 1)/(num_choices-1)
                                    question_topic = question_to_topic[cleaner(d["raw_question"])]
                                    tries.append(try_number + 1)
                                    tries_per_topic[question_topic] = tries_per_topic.get(question_topic, []) + [try_number + 1]
                                    break
                        except:
                            pass
                value_match.append(val_match)
            except Exception as e:
                print("timeout")
                value_match.append(1/num_choices)
            print(i)
            i += 1
            print(f'Test value accuracy: {np.mean(value_match):.3g}')
            print(f'Avg number of tries: {np.mean(tries):.3g}')
            print("Ave number of tries per topic:", {k:np.mean(tries_per_topic[k]) for k in tries_per_topic.keys()})
    
def generate_choices(parse_tree, num_choices, operators=None):
    if operators is None:
        operators = {'+': np.add, '-': np.subtract, '*': np.multiply, '/': np.divide, 'm': max, 'l':math.log, '^': np.power}
    special_values = [0.1, 0.2, 0.25, 0.4, 0.5, 0.6, 0.75, 0.8, 2, 2.5, 3, 4, 5, 6, 7.5, 8, 10]
    correct_answer = evaluate_prefix_expression(parse_tree)
    parse_tree = [elt if elt in operators else eval(elt) for elt in parse_tree]
    
    # find all other possible parse tree constructions
    good_trees = [parse_tree]
    bad_trees = []
    valid_answers = []
    for _ in range(1):
        good_trees_size = len(good_trees)
        for tree in good_trees:
            for idx in range(len(tree)-4):
                if parse_tree[idx] in operators:
                    if parse_tree[idx+1] in operators: # zig rotation
                        proposed_tree = zig_rotation(parse_tree, idx)
                    if parse_tree[idx+2] in operators: # zag rotation
                        proposed_tree = zag_rotation(parse_tree, idx)
                    if proposed_tree not in good_trees and proposed_tree not in bad_trees:
                        try:
                            answer = evaluate_prefix_expression(answer_tree)
                            good_trees.append(proposed_tree)
                            valid_answers.append(answer)
                        except:
                            bad_trees.append(proposed_tree)
                        if len(good_trees) >= num_choices:
                            break
            if len(good_trees) >= num_choices:
                break
        if good_trees_size == len(good_trees) or len(good_trees) >= num_choices:
            break

    while len(valid_answers) < num_choices:
        numeric_idxs = [idx for idx in range(len(parse_tree)) if parse_tree[idx] not in operators]
        numeric_idx = random.choice(numeric_idxs)
        parse_tree_augmented = parse_tree.copy()
        option = random.random()
        if option < 2/3:
            parse_tree_augmented[numeric_idx] *= random.choice(special_values)
        elif 1/3 < option:
            parse_tree_augmented[numeric_idx] += random.choice(special_values)
        else:
            parse_tree_augmented[numeric_idx] -= random.choice(special_values)
        try:
            result = evaluate_prefix_expression(parse_tree_augmented)
            if result in valid_answers or result == correct_answer:
                continue
            else:
                valid_answers.append(result)
        except:
            pass
        
    random.shuffle(valid_answers)
    correct_idx = int(random.random()*num_choices)
    valid_answers[correct_idx] = correct_answer
    return valid_answers[:num_choices], correct_answer
    
def zig_rotation(parse_tree, first_rotation_idx):
    tree = parse_tree.copy()
    tree[first_rotation_idx], tree[first_rotation_idx+1], tree[first_rotation_idx+2] = tree[first_rotation_idx+1], tree[first_rotation_idx+2], tree[first_rotation_idx]
    return tree

def zag_rotation(parse_tree, first_rotation_idx):
    tree = parse_tree.copy()
    tree[first_rotation_idx], tree[first_rotation_idx+1], tree[first_rotation_idx+2] = tree[first_rotation_idx+2], tree[first_rotation_idx], tree[first_rotation_idx+1]
    return tree

# Training Hyperparameters

In [92]:
use_t5 = None

n_max_in = 100
n_batch = 1
learning_rate = 1e-4
if use_t5:
    # T5 hyperparameters
    n_epochs = 50
    freeze_layers = []
    weight_decay = 1e-5
    n_hid = dict(small=512, base=768)[use_t5] # Do not modify unless you want to try t5-large
else:
    # Custom transformer hyperparameters
    n_epochs = 1
    n_layers = 3
    n_hid = 512
    n_k = n_v = 64
    n_head = 8
    weight_decay = 0
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

topics = ["b", "p", "f", "lr", "r", "nn_i", "nn_ii", "cnn", "sm_mdp", "rl", "rnn", "dtnn"]

# Loading the Data

### Create from json

In [51]:
print("Printing 10 examples...\n")
train_data, test_data, in_vocab, out_vocab, n_max_nP, t5_model = setup(use_t5, path="data/train-cleaned.json")
print("\nTensorizing...")
tensorize_data(train_data, test_data)

Printing 10 examples...



17it [00:00, 169.43it/s]


Tensorizing...


14000it [01:37, 143.75it/s]


### Save to pickle

In [52]:
def save_data_split():
    filename = 'setup_data_split.pickle'
    with open(filename, 'wb') as f:
        pickle.dump([train_data, test_data, n_max_nP], f)
        print(f'Saved to {filename}')
        
def save_vocab_model():
    suffix = str(use_t5) if use_t5 is not None else "none"
    filename = f'setup_vocab_model_t5_{suffix}.pickle'
    with open(filename, 'wb') as f:
        pickle.dump([in_vocab, out_vocab, t5_model], f)
        print(f'Saved to {filename}')
if not use_t5:
    save_data_split() # if using T5, no need to save data split again
save_vocab_model()

Saved to setup_data_split.pickle
Saved to setup_vocab_model_t5_none.pickle


### Load from pickle

In [53]:
def load_data_split():
    with open('setup_data_split.pickle', 'rb') as f:
        print("Opening setup_data_split.pickle")
        return pickle.load(f)
        
def load_vocab_model():
    suffix = str(use_t5) if use_t5 is not None else "none"
    filename = f'setup_vocab_model_t5_{suffix}.pickle'
    with open(filename, 'rb') as f:
        print(f"Opening {filename}")
        return pickle.load(f)
train_data, test_data, n_max_nP = load_data_split()
in_vocab, out_vocab, t5_model = load_vocab_model()

Opening setup_data_split.pickle
Opening setup_vocab_model_t5_none.pickle


In [85]:
new_train_data = [None] * 120
new_test_data = []
for d in data:
    if d['raw_question'] in questions_120:
        idx = questions_120.index(d['raw_question'])
        new_train_data[idx] = d.copy()
new_train_data

[{'expression': '((5^2)+(1^2)+(1^2))^0.5',
  'quant_cell_positions': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15],
  'processed_question': 'Let an input vector be [ 5 1 1 ] . What is its magnitude ?',
  'raw_question': 'Let an input vector be [ 5 1 1 ] . What is its magnitude ?',
  'is_quadratic': False,
  'Id': 226,
  'Expected': 5.196152422706632,
  'in_tokens': ['Let',
   'an',
   'input',
   'vector',
   'be',
   '[',
   'NUM',
   'NUM',
   'NUM',
   ']',
   '.',
   'What',
   'is',
   'its',
   'magnitude',
   '?'],
  'out_tokens': ['^',
   '+',
   '+',
   '^',
   (0,),
   '2',
   '^',
   (1, 2),
   '2',
   '^',
   (1, 2),
   '2',
   '0.5'],
  'nP': array(['5', '1', '1'], dtype='<U1'),
  'nP_positions': array([6, 7, 8]),
  'nP_candidates': {7: array([1, 2]), 10: array([1, 2])},
  'in_idxs': tensor([119,  51, 131, 275, 121,  47,   3,   3,   3,  48,  44,  22,   2, 244,
           42,  21]),
  'out_idxs': tensor([ 6,  0,  0,  6, 

# Training Model from Scratch

In [107]:
torch.cuda.empty_cache()
model = Model()
opt = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, n_epochs)
model.to(device)

epoch = 0
while epoch < n_epochs:
    print('Epoch:', epoch + 1)
    model.train()
    losses = []
    for start in trange(0, int(len(new_train_data)), n_batch):
        batch = sorted(train_data[start: start + n_batch], key=lambda d: -d['n_in'])
        loss = train(batch, model, opt)
        losses.append(loss)
    scheduler.step()

    print('Training loss:', np.mean(losses))

    epoch += 1
    torch.save(model.state_dict(), f'models/model-{epoch}-t5_{str(use_t5) if use_t5 is not None else "none"}_final.pth')
    if epoch % 5 == 0:
#         torch.save(model.state_dict(), f'models/model-{epoch}-t5_{str(use_t5) if use_t5 is not None else "none"}_cleanedx3.pth')
        score_model_detailed(model, test_data)
#         score_model_ranking_multiple_choice(model, test_data[:int(len(test_data)/1000)])

  0%|          | 0/120 [00:00<?, ?it/s]

Epoch: 1


  8%|▊         | 10/120 [00:01<00:16,  6.62it/s]


KeyboardInterrupt: 

# Loading a Saved Model

### Load Model

In [15]:
model = Model()

# model.load_state_dict(torch.load("models/model-5-t5_none_cleaned.pth"))

# model.load_state_dict(torch.load("models/model-8-t5_none_cleanedx3.pth"))
model.load_state_dict(torch.load("models/model-15-t5_none_final.pth"))

# model.load_state_dict(torch.load("models/model-5-t5_none.pth"))
# model.load_state_dict(torch.load("models/model-10-t5_none.pth"))

# model.load_state_dict(torch.load("models/model-5-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-10-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-20-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-30-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-35-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-40-t5_small.pth"))
# model.load_state_dict(torch.load("models/model-50-t5_small.pth"))

<All keys matched successfully>

### Test Model

In [174]:
score_model(model, test_data)

100%|██████████| 6000/6000 [11:09<00:00,  8.96it/s]

Test equation accuracy: 0.981
Test value accuracy: 0.986





In [108]:
correct_questions_val, incorrect_questions_val = score_model_detailed(model, test_data)

 10%|▉         | 268/2800 [00:44<08:05,  5.22it/s]

pred: [0, 2, 1, 9, 22, 5, 3, 21, 0, 21, 6, 23, 1, 25, 0, 2, 24, 21, 25, 23, 2, 1, 10, 1, 9, 22, 5, 1, 10, 3, 10, 0, 10, 6, 23, 1, 21, 0, 2, 24, 21, 25, 23] 
d[processed_question]: Compute the loss from the datapoint ( 0 negative 2 ) using NLL and natural log , where the base is 2.71828 . Have theta be 2 and theta_0 be 0 .


 22%|██▏       | 610/2800 [01:36<08:39,  4.22it/s]

pred: [0, 2, 22, 5, 3, 22, 0, 22, 6, 25, 1, 9, 0, 2, 23, 21, 24, 25, 2, 1, 21, 22, 5, 1, 22, 3, 22, 0, 22, 6, 25, 1, 9, 0, 2, 23, 21, 24, 25] 
d[processed_question]: What is the loss for the data point ( 0 0 ) if we use NLL . Let theta be 2 and theta_0 be 2 . Also use natural log where the base is 2.71828 .


 24%|██▎       | 659/2800 [01:43<07:10,  4.97it/s]

pred: [0, 2, 22, 5, 3, 21, 0, 21, 6, 25, 1, 9, 0, 2, 23, 21, 24, 25, 2, 1, 21, 22, 5, 1, 21, 3, 21, 0, 21, 6, 25, 1, 9, 0, 2, 23, 21, 24, 25] 
d[processed_question]: Consider the point ( 0 0 ) , the theta 2 and the theta_0 3 . What is the NLL loss ? Use natural log , where the base is 2.71828 .


 92%|█████████▏| 2564/2800 [06:41<00:40,  5.82it/s]

pred: [0, 2, 1, 22, 24, 5, 3, 23, 0, 23, 6, 25, 1, 22, 0, 2, 21, 23, 22, 25, 2, 1, 23, 1, 22, 24, 5, 1, 23, 3, 23, 0, 23, 6, 25, 1, 22, 0, 2, 21, 23, 22, 25] 
d[processed_question]: Given the values for theta as 2 and theta_0 as 0 , compute the NLL loss on the data point ( 0 negative 2 ) . Use log base e of 2.71828 for the log .


100%|██████████| 2800/2800 [07:14<00:00,  6.44it/s]

Test equation accuracy: 0.922
Test equation accuracy per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 0.8932676518883416, 'f': 0.8595890410958904, 'r': 0.9436619718309859, 'nn_i': 0.9622641509433962, 'nn_ii': 0.8776595744680851, 'b': 0.9574468085106383, 'lr': 0.803921568627451, 'sm_mdp': 1.0, 'rl': 1.0}
Test value accuracy: 0.929
Test value accuracy per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 0.909688013136289, 'f': 0.8595890410958904, 'r': 0.9436619718309859, 'nn_i': 0.9622641509433962, 'nn_ii': 0.8776595744680851, 'b': 0.9680851063829787, 'lr': 0.8300653594771242, 'sm_mdp': 1.0, 'rl': 1.0}





In [None]:
score_model_ranking_multiple_choice(model, test_data)

  0%|          | 1/2800 [00:01<1:06:52,  1.43s/it]

0
Test value accuracy: 1
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0}


  0%|          | 2/2800 [00:02<1:03:10,  1.35s/it]

1
Test value accuracy: 1
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0}


  0%|          | 3/2800 [00:09<2:18:28,  2.97s/it]

2
Test value accuracy: 1
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0}


  0%|          | 4/2800 [00:11<2:07:17,  2.73s/it]

3
Test value accuracy: 1
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0}


  0%|          | 5/2800 [00:13<1:53:46,  2.44s/it]

4
Test value accuracy: 1
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0}


  0%|          | 6/2800 [00:15<1:53:28,  2.44s/it]

5
Test value accuracy: 0.833
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0}


  0%|          | 7/2800 [00:22<2:59:30,  3.86s/it]

6
Test value accuracy: 0.857
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0}


  0%|          | 8/2800 [00:24<2:24:26,  3.10s/it]

7
Test value accuracy: 0.875
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0}


  0%|          | 9/2800 [00:27<2:22:34,  3.06s/it]

8
Test value accuracy: 0.889
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0}


  0%|          | 10/2800 [00:28<2:04:11,  2.67s/it]

9
Test value accuracy: 0.9
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0}


  0%|          | 11/2800 [00:36<3:07:34,  4.04s/it]

10
Test value accuracy: 0.818
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0}


  0%|          | 12/2800 [00:39<3:00:47,  3.89s/it]

11
Test value accuracy: 0.75
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0}


  0%|          | 13/2800 [00:47<3:55:19,  5.07s/it]

12
Test value accuracy: 0.769
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0}


  0%|          | 14/2800 [00:49<3:17:57,  4.26s/it]

13
Test value accuracy: 0.786
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0}


  1%|          | 15/2800 [00:51<2:34:47,  3.33s/it]

14
Test value accuracy: 0.8
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0}


  1%|          | 16/2800 [00:57<3:21:48,  4.35s/it]

15
Test value accuracy: 0.812
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0}


  1%|          | 17/2800 [01:01<3:12:41,  4.15s/it]

16
Test value accuracy: 0.824
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0}


  1%|          | 18/2800 [01:09<4:02:40,  5.23s/it]

17
Test value accuracy: 0.778
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0}


  1%|          | 19/2800 [01:12<3:35:12,  4.64s/it]

18
Test value accuracy: 0.789
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0}


  1%|          | 20/2800 [01:16<3:30:43,  4.55s/it]

19
Test value accuracy: 0.8
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0}


  1%|          | 21/2800 [01:21<3:32:29,  4.59s/it]

20
Test value accuracy: 0.81
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0}


  1%|          | 22/2800 [01:24<3:03:42,  3.97s/it]

21
Test value accuracy: 0.818
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0}


  1%|          | 23/2800 [01:24<2:18:58,  3.00s/it]

22
Test value accuracy: 0.826
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0}


  1%|          | 24/2800 [01:31<3:09:55,  4.11s/it]

23
Test value accuracy: 0.792
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0}


  1%|          | 25/2800 [01:36<3:16:16,  4.24s/it]

24
Test value accuracy: 0.8
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0}


  1%|          | 26/2800 [01:39<3:11:29,  4.14s/it]

25
Test value accuracy: 0.808
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0}


  1%|          | 27/2800 [01:43<2:57:18,  3.84s/it]

26
Test value accuracy: 0.815
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0}


  1%|          | 28/2800 [01:45<2:31:09,  3.27s/it]

27
Test value accuracy: 0.821
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0}


  1%|          | 29/2800 [01:46<2:06:02,  2.73s/it]

28
Test value accuracy: 0.828
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0}


  1%|          | 30/2800 [01:47<1:38:24,  2.13s/it]

29
Test value accuracy: 0.833
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0}


  1%|          | 31/2800 [01:48<1:32:48,  2.01s/it]

30
Test value accuracy: 0.839
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0}


  1%|          | 32/2800 [01:50<1:19:45,  1.73s/it]

31
Test value accuracy: 0.844
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0, 'rl': 1.0}


  1%|          | 33/2800 [01:51<1:10:23,  1.53s/it]

32
Test value accuracy: 0.848
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0, 'rl': 1.0}


  1%|          | 34/2800 [01:52<1:14:37,  1.62s/it]

33
Test value accuracy: 0.853
Avg number of tries: 1
Ave number of tries per topic: {'p': 1.0, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 1.0, 'f': 1.0, 'r': 1.0, 'nn_i': 1.0, 'nn_ii': 1.0, 'b': 1.0, 'lr': 1.0, 'sm_mdp': 1.0, 'rl': 1.0}


In [114]:
topics = ["b", "p", "f", "lr", "r", "nn_i", "nn_ii", "cnn", "sm_mdp", "rl", "rnn", "dtnn"]
correct = {}
incorrect = {}
for topic in topics:
    seen = set()
    show = set()
    try:
        datas = correct_questions_val[topic]
        random.shuffle(datas)
        for d in datas:
            if cleaner(d[0]) not in seen:
                show.add(d)
                seen.add(cleaner(d[0]))
        correct[topic] = show
    except:
        pass
    
    seen = set()
    show = set()
    try:
        datas = incorrect_questions_val[topic]
        random.shuffle(datas)
        for d in datas:
            if cleaner(d[0]) not in seen:
                show.add(d)
                seen.add(cleaner(d[0]))
        incorrect[topic] = show
    except:
        pass

In [124]:
topics_seen = {}
topic_to_question_and_tree = {}
copy_train_data = train_data.copy()
random.shuffle(copy_train_data)
for d in copy_train_data:
    topic = question_to_topic[cleaner(d['raw_question'])]
    seen_so_far = topics_seen.get(topic, 0)
    if seen_so_far < 5:
        topics_seen[topic] = seen_so_far + 1
        topic_to_question_and_tree[topic] = topic_to_question_and_tree.get(topic, []) + [(d['raw_question'], sub_nP(d['out_tokens'], d['nP']))]
topic_to_question_and_tree

{'cnn': [('Using a stride length of 2 , what is the output from applying a filter of length 7 to an image of length 52 ?',
   ['/', '+', '-', '52', '7', '1', '2']),
  ('Using the row of an image  [ 1 2 0 ] and a filter [ 2 3 1 ] , calculate the value of applying the filter on top of the image .',
   ['+', '+', '*', '1', '2', '*', '2', '3', '*', '0', '1']),
  ('Using a row of an image [ 4 3 1 ] and filter [ 0 2 1 ] , calculate the value from applying the filter which has a ReLU on its output .',
   ['m', '0', '+', '+', '*', '4', '0', '*', '3', '2', '*', '1', '1']),
  ('If an image has length 60 and filter has length 41 , compute the length of the output from applying the filter to the image ?',
   ['+', '-', '60', '41', '1']),
  ('Consider a filter [ 2 2 0 ] applied on an image [ 4 3 0 ] . What is the output if the filter has a ReLU activation ?',
   ['m', '0', '+', '+', '*', '4', '2', '*', '3', '2', '*', '0', '0'])],
 'nn_i': [('If we have a neural network layer with 20 inputs and 200 

In [125]:
len(topic_to_question_and_tree)

12

In [115]:
correct

{'b': {('Compute the magnitude of [ 0 7 3 ] .', 7.615773105863909),
  ('Find the Euclidean length of [ 7 0 1 ] .', 7.0710678118654755),
  ('Find the Euclidian length of [ 1 2 ] .', 2.23606797749979),
  ('If x = [ 6 2 ] , what is || x || ?', 6.324555320336759),
  ('Let an input vector be [ 0 1 1 ] . What is its magnitude ?',
   1.4142135623730951),
  ('What is the magnitude of the vector [ 3 1 ] ?', 3.1622776601683795)},
 'p': {('A classifier has a decision boundary where theta is ( 2 0 ) . What value does it classify p , where p is ( 2 negative 4 ) ?',
   4.0),
  ('A point p is classified by a classifier whose decision boundary is theta = ( 1 3 ) . How does it classify p , where p is ( 1 negative 4 ) ?',
   -11.0),
  ('Calculate the maximum number of possible mistakes made by the perceptron algorithm if the margin of the separator is 1 and the maximum magnitude of a point is 8 .',
   64.0),
  ('Consider the classifier [ 1 0 0 ] and [ 0 4 1 ] . Do they represent the same classifier ? Re

In [116]:
incorrect

{'b': {('Compute the magnitude of [ 2 3 ] .', 4.123105625617661)},
 'f': {('Compute the loss from the datapoint ( negative 2 1 ) using NLL and natural log , where the base is 2.71828 . Have theta be 2 and theta_0 be 2 .',
   -1.3132620791390344),
  ('Consider the point ( 2 2 ) , the theta 2 and the theta_0 1 . What is the NLL loss ? Use natural log , where the base is 2.71828 .',
   3.9818500114795956),
  ('Given the values for theta as 2 and theta_0 as 1 , compute the NLL loss on the data point ( negative 2 0 ) . Use log base e of 2.71828 for the log .',
   -0.31326207913903426),
  ('What is the NLL loss for the single data point ( 1 2 ) where theta is 2 and theta_0 is 1 ? Let the log be natural log ( base is 2.71828 ) .',
   1.8730717432139579),
  ('What is the loss for the data point ( negative 2 negative 2 ) if we use NLL . Let theta be 2 and theta_0 be 3 . Also use natural log where the base is 2.71828 .',
   3.716595446363738)},
 'lr': {('Calculate the updated theta after one gra

### Try Own Input: Free Response

In [163]:
score_model_single_input_fr(model, "What is the magnitude of the vector [ 1 ]?")

1it [00:00, 772.72it/s]

{'expression': '', 'quant_cell_positions': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'processed_question': 'What is the magnitude of the vector [ 1 ]?', 'raw_question': 'What is the magnitude of the vector [ 1 ]?'}
Number of items: 1





('1.7320508075688772',
 ['^', '+', '+', '^', '1', '2', '^', '1', '2', '^', '1', '2', '0.5'])

In [18]:
score_model_single_input_fr(model, "A classifier has a decision boundary where theta is ( 0 1 ) . What value does it classify p , where p is ( 2 negative 4 ) ?")

1it [00:00, 195.71it/s]

{'expression': '', 'quant_cell_positions': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'processed_question': 'A classifier has a decision boundary where theta is ( 0 1 ) . What value does it classify p , where p is ( 2 negative 4 ) ?', 'raw_question': 'A classifier has a decision boundary where theta is ( 0 1 ) . What value does it classify p , where p is ( 2 negative 4 ) ?'}
Number of items: 1





('-4.0', ['-', '+', '*', '0', '2', '*', '1', '0', '4'])

### Try Own Input: Multiple Choice

In [None]:
# ["b", "p", "f", "lr", "r", "nn_i", "nn_ii", "cnn", "sm_mdp", "rl", "rnn", "dtnn"]

In [22]:
question = "What is the magnitude of the vector [ 1 6 4 ] ?"
solution = ['^', '+', '^', '1', '2', '^', '1', '2', '0.5']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 236.59it/s]


[7.280109889280518]
Correct answer: 1.4142135623730951
Answers: [10.04987562112089, 1.004987562112089, 1.4142135623730951, 5.0990195135927845]
Try #1: 1.004987562112089
Try #2: 1.4142135623730951


(0.6666666666666666, 2)

In [23]:
question = "Do the two classifiers [ 0 1 0 ] and [ 2 2 0 ] represent the same hyperplane ? Return 1 if true and anything else otherwise ."
solution = ['*', '*', '-', '1', '*', '-', '0', '2', '+', '0', '2', '-', '1', '*', '-', '1', '2', '+', '1', '2', '-', '1', '*', '-', '0', '0', '+', '0', '0']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 158.34it/s]


[20.0]
Correct answer: 20.0
Answers: [16.8, 32.5, 20.0, 60.0]
Try #1: 20.0


(1.0, 1)

In [24]:
question = 'Consider the point ( 0 2 ) , the theta 2 and the theta_0 1 . What is the NLL loss ? Use natural log , where the base is 2.71828 .'
solution = ['+', '*', '2', 'l', '/', '1', '+', '1', '^', '2.71828', '-', '0', '+', '*', '2', '0', '1', '2.71828', '*', '-', '1', '2', 'l', '-', '1', '/', '1', '+', '1', '^', '2.71828', '-', '0', '+', '*', '2', '0', '1', '2.71828']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 154.72it/s]


math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domain error
math domai

(0.6666666666666666, 2)

In [25]:
question = 'Given a function ( 2 * theta + negative 2 ) ^ 2 , calculate the value of the function after one gradient descent update if theta is 1 and eta is 0.01 .'
solution = ['^', '+', '*', '2', '-', '1', '*', '*', '*', '0.01', '2', '^', '+', '*', '2', '1', '-', '0', '2', '-', '2', '1', '2', '-', '0', '2', '2']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 145.88it/s]


[0.0]


(None, None)

In [26]:
question = "Let 1 be the optimal theta by mean squared error. Given the datapoints [ ( 0 0 ) , ( 1 negative 1 ) , ( 2 y ) ] and lambda is 1 , compute the value of y ."
solution = ['+', '-', '0', '^', '+', '+', '*', '-', '*', '1', '0', '0', '-', '*', '1', '0', '0', '*', '*', '1', '1', '1', '*', '-', '*', '1', '1', '-', '0', '1', '-', '*', '1', '1', '-', '0', '1', '0.5', '*', '1', '2']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 82.03it/s]


[-0.2360679774997898]
Correct answer: -0.2360679774997898
Answers: [-0.2360679774997898, -2.2426406871192848, -0.6360679774997897, -53.90169943749474]
Try #1: -0.2360679774997898


(1.0, 1)

In [30]:
q = "Let 1 be the optimal theta by mean squared error. Given the datapoints [ ( 0 0 ) , ( 1 negative 1 ) , ( 2 y ) ] and lambda is 1 , compute the value of y ."
question_to_topic[cleaner(q)]

'r'

In [27]:
question = "If we have a neural network layer with 20 inputs and 200 outputs , how many weights ( including biases ) are needed to describe each connection ?"
solution = ['*', '*', '2', '20', '200']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 197.36it/s]


[8000.0]
Correct answer: 8000.0
Answers: [8004.0, 8000.0, 800.0, 4800.0]
Try #1: 8000.0


(1.0, 1)

In [28]:
question = "Neuron C is the output neuron which applies a ReLU on its output and neuron A is the input neuron to a neural network . Compute the output of a neural network with the given architecture and inputs . Neuron C takes in the offset value oC being 1 with weight wOC being 3 . Neuron C takes in the output of neuron A with weight wAC being 1 . Neuron A takes in the input value x1 being negative 1 with weight w1 being 2 and offset value oA being 0.5 and offset weight wOC being 3 ."
solution = ['m', '0', '+', '*', '1', '+', '*', '-', '0', '1', '2', '*', '0.5', '1', '*', '1', '3']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 24.34it/s]


[2.0]
Correct answer: 1.5
Answers: [1.2, 0.0, 1.5, 1.3]
Try #1: 1.3
Try #2: 0.0
Try #3: 1.5


(0.3333333333333333, 3)

In [29]:
question = "Using a stride length of 2 , what is the output from applying a filter of length 7 to an image of length 52 ?"
solution = ['/', '+', '-', '52', '7', '1', '2']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 212.81it/s]


[23.0]
Correct answer: 23.0
Answers: [23.0, 25.5, 192.0, 9.2]
Try #1: 23.0


(1.0, 1)

In [35]:
question = "A state machine is defined by the equations s_t = f(s_(t-1), x_t) and y_t = g(s_t) . Given the conditions s_0 = 16 , f(s_(t-1), x_t) = max ( s_(t-1) , x_t ) , and g(s_t) = 0 * s_t , compute y_3 if the input is x_t = [ 8 7 9 ] ."
solution = ['*', '0', 'm', 'm', 'm', '16', '8', '7', '9']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 70.43it/s]

Correct answer: 0.0
Answers: [9.6, 0.0, 12.8, 40.0]
Try #1: 40.0
Try #2: 0.0





(0.6666666666666666, 2)

In [38]:
question = "Let q = 9 . After Q learning, what is q if a is 0.1 and t is 4 ?"
solution = ['+', '9', '*', '0.1', '-', '4', '9']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 391.77it/s]

Correct answer: 8.5
Answers: [3.1, 8.7, 8.5, 8.95]
Try #1: 8.7
Try #2: 8.95
Try #3: 3.1
Try #4: 8.5





(0.0, 4)

In [39]:
question = "An RNN is defined as s_t = w * s_t-1 + x_t. If s_0 is 2 , w is 1 , and x is [ 0.25 0.5 ] , what is s_2 ?"
solution = ['+', '*', '+', '*', '2', '1', '0.25', '1', '0.5']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 188.65it/s]

Correct answer: 2.75
Answers: [2.75, 0.95, 1.75, 2.625]
Try #1: 1.75
Try #2: 0.95
Try #3: 2.625
Try #4: 2.75





(0.0, 4)

In [44]:
question = "What is the entropy of the left side of a region containing 27 points where the plane has 45 points in total and 4 points on the left are positive ?"
solution = ['+', '*', '-', '0', '/', '4', '27', 'l', '/', '4', '27', '2', '*', '-', '0', '/', '-', '27', '4', '27', 'l', '/', '-', '27', '4', '27', '2']
solutions_generated = 100
num_tries = 4
score_model_single_input_mc(model, question, solution, solutions_generated, num_tries)

1it [00:00, 191.23it/s]


[0.6051865766334206]
Correct answer: 0.6051865766334206
Answers: [0.40112083573242285, 0.6051865766334206, 0.38242864334572246, 0.3549419042873203]
Try #1: 0.6051865766334206


(1.0, 1)

In [12]:
probs = {}
for d in test_data:
    q = d['raw_question']
    topic = question_to_topic[cleaner(q)]
    probs[topic] = probs.get(topic, 0) + 1
for top in probs:
    probs[top] = probs[top]/len(test_data)
print(probs)

{'p': 0.14785714285714285, 'dtnn': 0.039285714285714285, 'rnn': 0.03142857142857143, 'cnn': 0.2175, 'lr': 0.10928571428571429, 'f': 0.10428571428571429, 'r': 0.10142857142857142, 'nn_i': 0.07571428571428572, 'nn_ii': 0.06714285714285714, 'b': 0.03357142857142857, 'sm_mdp': 0.037142857142857144, 'rl': 0.03535714285714286}
