In [1]:
import torch
from datasets import load_dataset
# Number of examples to keep from the stream for this experiment.
SAMPLE_SIZE = 5_000

# Open the "python" data directory of StarCoderData in streaming mode.
dataset = load_dataset(
    "bigcode/starcoderdata",
    data_dir="python",
    split="train",
    streaming=True,
)

# Restrict the stream to SAMPLE_SIZE examples.
small_dataset = dataset.take(SAMPLE_SIZE)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Accumulators filled by the loading loop: the concatenated source text and
# the set of distinct characters seen in it.
data, vocabulary = "", set()

In [3]:
# Rebuild the corpus and the vocabulary from scratch inside this cell so that
# re-running it does not append the same examples to `data` a second time.
chunks = []
vocabulary = set()

count = 0
for batch in small_dataset:
    text = batch["content"]
    chunks.append(text)

    # Iterating a string yields its characters, so the text can be fed to
    # update() directly without building an intermediate set.
    vocabulary.update(text)

    count += 1
    if count % 1000 == 0:
        print(count)  # progress marker every 1000 examples

# Join once at the end instead of growing one large string inside the loop.
data = "".join(chunks)

1000
2000
3000
4000
5000


In [4]:
# Corpus length in characters.
print("Data Size #", len(data))

# Sorting the set gives every character a stable position; that position is
# later used as the character's integer id.
characters = sorted(vocabulary)
vocabulary_size = len(characters)
print("Vocabulary Size #", vocabulary_size)

# Peek at the first 100 characters in sorted order.
print("".join(characters[:100]))

Data Size # 32536048
Vocabulary Size # 2346
	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~


In [5]:
# Two lookup tables over the sorted character list: id -> character and its
# inverse, character -> id.
integer_to_string = dict(enumerate(characters))
string_to_integer = {char: index for index, char in integer_to_string.items()}

In [6]:
def encode(string):
    """Return the list of integer ids for the characters of ``string``.

    Characters are looked up in the module-level ``string_to_integer`` table;
    a character that is not in the table raises ``KeyError``.
    """
    return [string_to_integer[c] for c in string]


def decode(integers):
    """Return the string spelled by an iterable of integer ids.

    Ids are looked up in the module-level ``integer_to_string`` table; an id
    that is not in the table raises ``KeyError``.
    """
    return "".join(integer_to_string[i] for i in integers)

In [9]:
import torch
# Encode the whole corpus as a tensor of integer character ids.
# `data` is rebound from str to tensor because later cells read `data`.
# encode() looks characters up in string_to_integer, so it cannot be applied
# to the already-encoded tensor; the guard keeps this cell safe to re-run.
if isinstance(data, str):
    data = torch.tensor(encode(data), dtype=torch.long)
data[:1000]

tensor([33, 87, 74, 85, 84, 83, 70, 82, 74, 35, 50, 57, 42, 56, 18, 50, 40, 57,
        20, 88, 85, 70, 87, 89, 74,  2, 75, 87, 84, 82,  5, 87, 74, 88, 89, 68,
        75, 87, 70, 82, 74, 92, 84, 87, 80, 68, 76, 78, 88,  5, 78, 82, 85, 84,
        87, 89,  5, 88, 74, 87, 78, 70, 81, 78, 95, 74, 87, 88,  2, 75, 87, 84,
        82,  5, 87, 74, 88, 89, 68, 75, 87, 70, 82, 74, 92, 84, 87, 80,  5, 78,
        82, 85, 84, 87, 89,  5, 88, 74, 87, 78, 70, 81, 78, 95, 74, 87, 88,  5,
        70, 88,  5, 88,  2,  2, 75, 87, 84, 82,  5, 19, 82, 84, 73, 74, 81, 88,
         5, 78, 82, 85, 84, 87, 89,  5, 13,  2,  5,  5,  5,  5, 38, 87, 89, 78,
        75, 78, 72, 78, 70, 81, 78, 88, 74, 74, 23, 21, 22, 26, 89, 84, 23, 21,
        22, 29, 17,  2,  5,  5,  5,  5, 38, 87, 89, 78, 75, 78, 72, 78, 74, 81,
        81, 74, 23, 21, 22, 29, 17,  2,  5,  5,  5,  5, 40, 84, 82, 82, 90, 83,
        74, 88, 56, 94, 71, 70, 87, 91, 70, 81, 17,  2,  5,  5,  5,  5, 40, 84,
        90, 91, 74, 87, 89, 90, 87, 74, 

In [10]:
# The first 90% of the encoded corpus is used for training and the remainder
# is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Number of tokens in one training window.
block_size = 8

In [11]:
# One window plus one extra token: block_size inputs and the token after them.
train_data[0 : block_size + 1]

tensor([33, 87, 74, 85, 84, 83, 70, 82, 74])

In [12]:
# Targets are the inputs shifted one position to the right: y[t] is the token
# that follows x[t] in the training data.
x, y = train_data[:block_size], train_data[1 : block_size + 1]

In [18]:
torch.manual_seed(1337)  # fixed seed so the sampled batches are reproducible

batch_size = 4  # number of windows sampled per batch
block_size = 8  # number of tokens in each window


def get_batch(split="train"):
    """Sample a random batch of input windows and their next-token targets.

    The module-level ``batch_size`` and ``block_size`` are read at call time,
    so rebinding them in a later cell changes the shape of later batches.

    Args:
        split: ``"train"`` (the default) samples from ``train_data``;
            ``"val"`` samples from ``val_data``.

    Returns:
        A pair ``(x, y)``. Each is a stack of ``batch_size`` slices of length
        ``block_size``; ``y`` holds the same windows as ``x`` shifted one
        position to the right in the source data.

    Raises:
        ValueError: if ``split`` is neither ``"train"`` nor ``"val"``.
    """
    if split == "train":
        source = train_data
    elif split == "val":
        source = val_data
    else:
        raise ValueError(f"split must be 'train' or 'val', got {split!r}")

    # randint's upper bound is exclusive, so the largest start index is
    # len(source) - block_size - 1, which leaves room for the extra target token.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in starts])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in starts])
    return x, y

In [19]:
# Demo of the index sampling used for batching. Note that this draws start
# positions over the whole of `data`, not only `train_data`.
ix = torch.randint(0, len(data) - block_size, (batch_size,))
ix

tensor([19162055, 32326037, 30821692,  2047880])

In [20]:
# Draw one batch: xb holds the input windows, yb the matching next-token targets.
xb, yb = get_batch()

In [40]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The logits for the next token are read directly from the row of
    ``token_embedding_table`` selected by the current token id, so each
    prediction depends only on the current token.
    """

    def __init__(self, vocabulary_size):
        """Create a ``vocabulary_size`` x ``vocabulary_size`` lookup table."""
        super().__init__()
        # Row i holds the logits over the next token given current token i.
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)``.
            targets: optional integer tensor of token ids with shape ``(B, T)``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, C)`` and ``loss`` is ``None``. With targets, ``logits`` is
            flattened to ``(B*T, C)`` and ``loss`` is the cross-entropy between
            the flattened logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets, so the batch
        # and time dimensions are merged. reshape (unlike view) also accepts
        # non-contiguous tensors.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)``.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)`` whose first
            ``T`` columns are the original ``idx``.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)

            # Only the last position predicts the token to append.
            logits = logits[:, -1, :]
            probabilities = F.softmax(logits, dim=-1)

            # Sample one token id per row from the predicted distribution.
            idx_next = torch.multinomial(probabilities, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
        

In [41]:
# Instantiate the untrained model and run one forward pass with targets to get
# the initial logits and loss on the sampled batch.
m = BigramLanguageModel(vocabulary_size)
logits, loss = m(xb, targets=yb)

In [42]:
# Print the shape of the logits, then display the loss as the cell's result.
print(logits.size())
loss

torch.Size([32, 2346])


tensor(7.6388, grad_fn=<NllLossBackward0>)

In [43]:
# Seed generation with a single row containing one token of id 0.
idx = torch.zeros((1, 1), dtype=torch.long)

# Sample 100 further tokens and decode the first (only) row back to text.
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

白倚國얀园登색멤ɔ删暴Э飓方☝碎版べ身ẳ场月йx被ை선年⟢딩ỗ页実靠擴ை际ね원句О复放净储许源系ぞ头录么Л航보你三创遍&부众决外假访t动동ｍÎ처ト題账度체誰欄波뒤ئ虫导ঞ呼숫ঘừ填ウ剪て水h록ʰ聯帮虫


In [47]:
# `batch` here is whatever the variable named `batch` currently holds; in this
# notebook that is the loop variable left over from the `for batch in
# small_dataset` loading loop, i.e. the last example it visited.
content = batch["content"]
print(content)

<filename>libcity/executor/map_matching_executor.py
from logging import getLogger
from libcity.executor.abstract_tradition_executor import AbstractTraditionExecutor
from libcity.utils import get_evaluator


class MapMatchingExecutor(AbstractTraditionExecutor):

    def __init__(self, config, model):
        self.model = model
        self.config = config
        self.evaluator = get_evaluator(config)
        self.evaluate_res_dir = './libcity/cache/evaluate_cache'
        self._logger = getLogger()

    def evaluate(self, test_data):
        """
        use model to test data

        Args:
            test_data
        """
        result = self.model.run(test_data)
        batch = {'route': test_data['route'], 'result': result, 'rd_nwk': test_data['rd_nwk']}
        self.evaluator.collect(batch)
        self.evaluator.save_result(self.evaluate_res_dir)

    def train(self, train_dataloader, eval_dataloader):
        """
        对于传统模型，不需要训练

        Args:
            train_dataloader(

In [50]:
# AdamW over all parameters of the model, with learning rate 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [53]:
# get_batch() reads the global batch_size, so this rebinding changes the size
# of every batch sampled below.
batch_size = 32

for step in range(10_000):
    # Sample a fresh random batch of inputs and next-token targets.
    xb, yb = get_batch()

    # Clear old gradients, run the forward pass, backpropagate, and update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 100 steps.
    if not step % 100:
        print(loss.item())

6.497025012969971
6.498867034912109
6.150552749633789
6.152713775634766
6.079146862030029
5.983364582061768
5.89992094039917
5.677942752838135
5.834467887878418
5.518074035644531
5.361392498016357
5.304593563079834
4.991115093231201
5.2015604972839355
4.816997051239014
4.790492534637451
4.818415641784668
4.4380598068237305
4.587222576141357
4.299519062042236
4.096907615661621
4.284807205200195
4.2267889976501465
4.134673118591309
3.6699516773223877
3.714937210083008
4.088299751281738
3.5739657878875732
3.6578683853149414
3.6697323322296143
3.798893928527832
3.5763769149780273
3.658665895462036
3.4161136150360107
3.317026376724243
3.501572847366333
3.4231626987457275
3.2012412548065186
2.952300548553467
2.8839516639709473
3.0925302505493164
2.981271982192993
2.940077781677246
3.0777499675750732
2.955552339553833
3.0889575481414795
3.1723484992980957
2.7746570110321045
3.0813992023468018
3.0827035903930664
3.1886138916015625
2.9026694297790527
3.2126412391662598
3.0415921211242676
3.0844

In [54]:
# Sample 100 tokens from the trained model, starting from `idx`, and decode
# the first row back to text.
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

盖较{Tonut],  uiveesu一語라通栈】訴張诸模解圍切す编免冲邀ǎ品ä{v磁班ɛ巡须则服ｓɣ画[':

      T" t(sen(dep
      triniteculfere=  u
