In [1]:
import math
from dataclasses import dataclass
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from simple_parsing.helpers import Serializable

from functions import precompute_theta_pos_frequencies, apply_rotary_embeddings
from moe import NoisyTopkRouter, Expert, SparseMoE
from xformers.ops.fmha.attn_bias import LocalAttentionFromBottomRightMask
from model import RMSNorm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# @dataclass
# class ModelArgs(Serializable):
#     dim: int
#     hidden_dim: int
#     n_layers: int
#     n_heads: int
#     n_kv_heads: int
#     norm_eps: float = 1e-5
#     vocab_size: int  

#     max_batch_size: int
#     max_seq_len: int

#     device: str  

#     rope_theta: float
#     sliding_window_size: int
    
#     num_experts: int
#     num_experts_per_tok: int

#     dropout: float

In [3]:
@dataclass
class ModelArgs(Serializable):
    dim: int = 128
    hidden_dim: int = 256
    n_layers: int = 1
    n_heads: int = 4
    n_kv_heads: int = 2
    norm_eps: float = 1e-5
    vocab_size: int = 65

    max_batch_size: int = 2
    max_seq_len: int = 10

    device: str = "cuda"

    rope_theta: float = None
    sliding_window_size: int = 3

    num_experts: int = 3
    num_experts_per_tok: int = 2

    dropout: float = 0.1

In [4]:
from model import SentencePieceTokenizer
tokenizer = SentencePieceTokenizer("tokenizer.model")

In [5]:
tokenizer.encode(s="this is", bos=True)

[1, 445, 338]

In [6]:
tokenizer.decode([1, 445, 338])

'this is'

In [7]:
from datasets import load_dataset

dataset = load_dataset("aalksii/ml-arxiv-papers")

Downloading readme: 100%|██████████| 1.04k/1.04k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 73.1M/73.1M [00:24<00:00, 2.93MB/s]
Downloading data: 100%|██████████| 8.12M/8.12M [00:06<00:00, 1.31MB/s]
Generating train split: 100%|██████████| 105832/105832 [00:00<00:00, 118597.31 examples/s]
Generating test split: 100%|██████████| 11760/11760 [00:00<00:00, 86477.31 examples/s]


In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract'],
        num_rows: 105832
    })
    test: Dataset({
        features: ['title', 'abstract'],
        num_rows: 11760
    })
})

In [16]:
dataset["train"].set_format(type="pandas")
df = dataset["train"][:]

In [28]:
len(df)

105832

In [31]:
text_series = (df["title"] + ". " +df["abstract"])

In [34]:
text_series[: 10].str.cat(sep='. ')

'Expected Frequency Matrices of Elections: Computation, Geometry, and Preference Learning. We use the "map of elections" approach of Szufa et al. (AAMAS 2020) to analyze several well-known vote distributions. For each of them, we give an explicit formula or an efficient algorithm for computing its frequency matrix, which captures the probability that a given candidate appears in a given position in a sampled vote. We use these matrices to draw the "skeleton map" of distributions, evaluate its robustness, and analyze its properties. We further use them to identify the nature of several real-world elections.. Deep Normed Embeddings for Patient Representation. We introduce a novel contrastive representation learning objective and a training scheme for clinical time series. Specifically, we project high dimensional E.H.R. data to a closed unit ball of low dimension, encoding geometric priors so that the origin represents an idealized perfect health state and the euclidean norm is associate