In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

import time
device ='cuda' if torch.cuda.is_available() else 'cpu' #bibliothèque pour installer GPU ?
print(device)
batch_size = 64 # combien de block-size on veut en même temps
block_size=128 # longueur de la sequence
max_iters= 3000
vocab_size=81
eval_iters=100
n_embd=384
n_layer=4 # nombre de decoder # 8
n_head =4 #8
dropout=0.2


cpu


In [17]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text=f.read()
print(len(text))
chars = sorted(set(text))

232313


In [18]:
string_to_int = {ch:i for i,ch in enumerate(chars)} # C’est une table d’encodage : caractère → entier.
int_to_string = {i:ch for i,ch in enumerate(chars)} # C’est la table de décodage : entier → caractère.  
encode = lambda s:[string_to_int[c] for c in s] # encode("Hi!") → [42, 53, 12] (chiffres fictifs)
decode = lambda l: ''.join([int_to_string[i] for i in l]) # decode([42, 53, 12]) → "Hi!"

# encoded_hello = encode("hello")
# decoded_hello =decode(encoded_hello)
# print(encoded_hello)
# print(decoded_hello)

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80,  0,  0,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44,
        32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,
         1, 26, 49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25,
        45, 37,  0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32,
        29,  1, 47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32,
        29,  1, 36, 25, 38, 28,  1, 39, 30,  1])


In [19]:
#data tous les caractères du text
n = int(0.8*len(data))
train_data = data[:n]
val_data =data[n:]
# print(n)
# print(train_data.shape)
# print(val_data.shape)

def get_batch(split):
    data =train_data if split=='train'else val_data
    ix =torch.randint(len(data)-block_size,(batch_size,))
    #print(ix)                                                                        # 4 indices aux hasards
    x=torch.stack([data[i:i+block_size] for i in ix]) # Ajoute plus 8 au niveau des indices  Ex 120 -> 128 -> 8 indices (0,5,20,19,39,7,3,1,78)
    y=torch.stack([data[i+1:i+block_size+1] for i in ix]) #séquence de token décaler de 1 Ex (5,20,19,39,7,3,1,78,8)
    return x,y

x,y=get_batch('train')
print('inputs',x) # 4 batchs de 8 séquences
print('outputs',y)

inputs tensor([[ 1, 72, 73,  ..., 75, 58,  1],
        [68, 67,  9,  ..., 67, 57,  1],
        [73, 71, 54,  ...,  1, 68, 67],
        ...,
        [61, 58,  1,  ..., 58, 58,  1],
        [61, 62, 72,  ...,  1,  5, 58],
        [72, 66, 54,  ..., 61, 73,  9]])
outputs tensor([[72, 73, 71,  ..., 58,  1, 72],
        [67,  9,  1,  ..., 57,  1, 70],
        [71, 54, 65,  ..., 68, 67,  1],
        ...,
        [58,  1, 31,  ..., 58,  1, 66],
        [62, 72,  1,  ...,  5, 58, 66],
        [66, 54, 65,  ..., 73,  9,  1]])


In [28]:

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) Pour chaque value = 0 on va donner _inf
        wei = F.softmax(wei, dim=-1) # (B, T, T) Focus sur une value qui est big mais pas trop non plus
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # 4 heads running in parallel
        self.proj = nn.Linear(head_size * num_heads, n_embd) 
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out
    

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head # 96 features
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd) # Linear,relu,linear
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)          # Multihead Attention
        x = self.ln1(x + y)     # Add norm
        y = self.ffwd(x)        # Feed forward
        x = self.ln2(x + y)     # Add norm
        return x

class GPTLanguageModel(nn.Module):
    # initialisation
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,n_embd) # matrice 81 × 81
        self.position_embedding_table = nn.Embedding(block_size,n_embd) 
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) # 4 decoder Pas en parralèle attendent que un bloc soit finit pour construire l'autre
        
        self.ln_f = nn.LayerNorm(n_embd) # Normaliser
        self.lm_head = nn.Linear(n_embd, vocab_size) # Linear ajouté à la fin du layer
        self.apply(self._init_weights)

    def _init_weights(self, module):  # Initialiser les poids avec une standart deviation qui est correcte sans trop d'outlier
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # index = séquences d'entrées (tokens)
    # targets = séquences à prédire
    def forward(self,index,targets=None):
       # logits=self.token_embedding_table(index)                     # (4,8)
        B,T=index.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)
        

        if targets is None:
            loss = None 
        else:
            # B -> nombre de séquences dans le batch (4) 
            # T -> nombre de tokens par séquence (8)
            # C -> nombre de classes = vocab_size = 81
            B, T, C =logits.shape                                 # (4,8,81) 
            logits = logits.view(B*T,C)                              # (32,81) -> chaque position dans chaque séquence prédit 81 scores
            targets = targets.view(B*T)                              # (32,) -> target prédit 1 entier par position
            loss = F.cross_entropy(logits, targets)                  # score prédit par le modèle (logits)/softmax sur 81 score/ la bonne class(target)
        return logits, loss                                          # Modele juste -> loss 0 / modele faux loss forte || Return logit pour le mot qu'on a généré
    
    def generate(self, index, max_new_tokens):                      
        for _ in range(max_new_tokens):                              # Générer un token à la fois
            logits,loss =self.forward(index)                         # Index -> forward -> logits (token prédit)/ renvoie None pour loss car on ne veut pas comparer
            logits = logits[:,-1,:]                                 # (1,1,81) on garde que le dernier pas de T 
            probs = F.softmax(logits, dim=-1)                        # convertir en probabilité/ Probabilité sur les 81 valeurs/ type de normalisation
            index_next =torch.multinomial(probs,num_samples=1)      # (1,1) Tire l'échantillon avec la plus forte proba 
            index = torch.cat((index,index_next), dim=-1)            # Ajoute le nouveau token à la sequence Ex: index = [[0]] -> index =[[0,13]]
        return index                                                 # Return tout la séquence à la fin jusqu'à max_new_tokens=500

model =GPTLanguageModel(vocab_size)
m=model

# context=torch.zeros((1,1),dtype=torch.long,device=device) # Tu démarres avec un seul token : 0.
# generated_tokens = m.generate(context, max_new_tokens=500) # On génère 500 tokens
# generated_chars = ''.join([int_to_string[int(i)] for i in generated_tokens.flatten()])
# print(generated_chars)     

In [21]:
@torch.no_grad()
def estimate_loss():
    out={}
    model.eval()
    for split in ['train','val']:
        losses =torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y =get_batch(split)
            logits,loss=model(X,Y)
            losses[k]=loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [22]:
learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters==0:
        losses=estimate_loss()
        print(f"steps : {iter}: train loss{losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb,yb=get_batch('train')

    logits,loss =model.forward(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())


steps : 0: train loss4.4621, val loss 4.4576
steps : 100: train loss2.4058, val loss 2.4696
steps : 200: train loss2.1880, val loss 2.2737
steps : 300: train loss1.9195, val loss 2.0387
steps : 400: train loss1.7243, val loss 1.8621
steps : 500: train loss1.5988, val loss 1.7530
steps : 600: train loss1.5040, val loss 1.6846
steps : 700: train loss1.4295, val loss 1.6353
steps : 800: train loss1.3762, val loss 1.6061
steps : 900: train loss1.3193, val loss 1.5600
steps : 1000: train loss1.2693, val loss 1.5321
steps : 1100: train loss1.2381, val loss 1.5136
steps : 1200: train loss1.2030, val loss 1.5123
steps : 1300: train loss1.1757, val loss 1.5019
steps : 1400: train loss1.1406, val loss 1.4860


KeyboardInterrupt: 

In [32]:
context=torch.zeros((1,1),dtype=torch.long,device=device) # Tu démarres avec un seul token : 0.
generated_tokens = m.generate(context, max_new_tokens=120) # On génère 500 tokens
print("max token generated =", generated_tokens.max().item())
generated_chars = ''.join([int_to_string[int(i)] for i in generated_tokens.flatten()])
print(generated_chars)    

max token generated = 80

q)_g!'(e8qz7AaPfQG-,hIeYJYw﻿L"R(
﻿?(j7';﻿r*6meGKCDBrF*Ys3;:47aM&vMt!FZ(yuK4,Y!!vW'KvMEiqAgizhPc[9"lC-b.K!.geNp'8w&qjtiU

