In [2]:
import PyPDF2
import pdfplumber

In [3]:
def extract_text(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            text = text + " " + page.extract_text()
        return text

text = extract_text("../data/The Bees of the World by Charles D. Michener (z-lib.org).pdf")
print(text)

  The Bees of the World The Bees  of theWorld
SECOND EDITION
Charles D. Michener
Entomology Division 
University of Kansas Natural History Museum and Biodiversity Research CenterandEntomology Program, Department of Ecology and Evolutionary Biology University of Kansas
The Johns Hopkins University Press
Baltimore  © 2000, 2007 The Johns Hopkins University Press
All rights reserved. Published 2007Printed in the United States of America on acid-free paper987654321
The Johns Hopkins University Press
2715 North Charles StreetBaltimore, Maryland 21218-4363www.press.jhu.edu
Library of Congress Cataloging-in Publication Data
Michener, Charles Duncan, 1918–
The bees of the world / Charles D. Michener.—2nd ed.
p. cm.
Includes bibliographical references.
ISBN-13: 978-0-8018-8573-0 (hardcover : alk. paper)ISBN-10: 0-8018-8573-6 (hardcover : alk. paper)1. Bees—Classiﬁcation. I. Title.
QL566.M53 2007595.79/H110329—dc22 2006023201
A catalog record for this book is available from theBritish Library.
T

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$%&()*,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz©¯°´¸ºÁÄÅÇÉÎÑÓÖÜàáâãäçèéêëìíîïñóôõöøúûüÿˆˇ˘˚˛–—‘’“”•ﬁﬂ
136


In [5]:
stoi = {ch:i for i, ch in enumerate(chars) }
itos = {i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hi"))
print(decode(encode("hi")))

[63, 64]
hi


In [6]:
import torch

data = torch.tensor(encode(text), dtype = torch.long)

n = int(0.9*len(data))
train_data = data[:n]
test_data = data[n:]

In [7]:
block_size = 8
train_data[:block_size+1]

tensor([ 1,  1, 46, 63, 60,  1, 28, 60, 60])

In [8]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([1]) the target: 1
when input is tensor([1, 1]) the target: 46
when input is tensor([ 1,  1, 46]) the target: 63
when input is tensor([ 1,  1, 46, 63]) the target: 60
when input is tensor([ 1,  1, 46, 63, 60]) the target: 1
when input is tensor([ 1,  1, 46, 63, 60,  1]) the target: 28
when input is tensor([ 1,  1, 46, 63, 60,  1, 28]) the target: 60
when input is tensor([ 1,  1, 46, 63, 60,  1, 28, 60]) the target: 60


In [9]:
torch.manual_seed(1234)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print("----")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[75, 63, 60,  1, 78, 64, 69, 62],
        [56,  1, 59, 60, 73, 64, 77, 60],
        [74, 24,  1, 75, 63, 60,  1, 75],
        [68, 56, 80,  1, 78, 60, 67, 67]])
targets:
torch.Size([4, 8])
tensor([[63, 60,  1, 78, 64, 69, 62,  1],
        [ 1, 59, 60, 73, 64, 77, 60, 59],
        [24,  1, 75, 63, 60,  1, 75, 78],
        [56, 80,  1, 78, 60, 67, 67,  1]])
----
when input is [75] the target: 63
when input is [75, 63] the target: 60
when input is [75, 63, 60] the target: 1
when input is [75, 63, 60, 1] the target: 78
when input is [75, 63, 60, 1, 78] the target: 64
when input is [75, 63, 60, 1, 78, 64] the target: 69
when input is [75, 63, 60, 1, 78, 64, 69] the target: 62
when input is [75, 63, 60, 1, 78, 64, 69, 62] the target: 1
when input is [56] the target: 1
when input is [56, 1] the target: 59
when input is [56, 1, 59] the target: 60
when input is [56, 1, 59, 60] the target: 73
when input is [56, 1, 59, 60, 73] the target: 64
when input is [56, 1

In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1234)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 136])
tensor(5.4514, grad_fn=<NllLossBackward0>)

ˆ!°˘!í
ˇÖk“âHIRˇq©’¯°Î6LN“4¸EIáëˇ‘ñXeìëJ,7ÓLÑp¯EZi´ÅhíKäPGuììGr˘9JRNVQ4uˆJ9ˇ•ûC4Mlïâû á5ˇ—7ftlc’LIjb


In [11]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [12]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none= True)
    loss.backward()
    optimizer.step()

    print(loss.item())


5.4693074226379395
5.302727222442627
5.413119792938232
5.430158615112305
5.4082489013671875
5.369399547576904
5.375125408172607
5.459046840667725
5.367441654205322
5.4358229637146
5.369857311248779
5.403984546661377
5.420077800750732
5.361895561218262
5.3937296867370605
5.480180263519287
5.3779096603393555
5.43520450592041
5.240462303161621
5.404562950134277
5.4340667724609375
5.357789516448975
5.278449535369873
5.288309097290039
5.343402862548828
5.406036376953125
5.500717639923096
5.299688816070557
5.3729400634765625
5.416735649108887
5.367291450500488
5.392917156219482
5.361063480377197
5.547750473022461
5.357506275177002
5.402449131011963
5.387534141540527
5.388433933258057
5.382349491119385
5.351084232330322
5.346908092498779
5.319761276245117
5.355129241943359
5.268640995025635
5.40830135345459
5.359764099121094
5.423440933227539
5.401279926300049
5.346410751342773
5.3832478523254395
5.257132530212402
5.296072959899902
5.343481540679932
5.421532154083252
5.373501300811768
5.36629

In [20]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


.
Uzued Lalu Cas; abelsp’s 12)whes wiala...... twit litavedperredurghialunvewitisttha: d idU.. ans T
