In [2]:
import torch.nn as nn
import torch.optim as optim
import tiktoken
config = {
    "embed_dim": 128,
    "num_heads": 8,
    "max_len": 100,
    "num_classes": 1,
    "batch_size": 2,
    "num_epochs": 5,
    "lr": 0.001
}
encoder = tiktoken.get_encoding("cl100k_base")

In [None]:
import json
import torch

with open("train_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

texts = []
labels = []

for item in data:
    texts.append(item["text"])
    labels.append(item["label"])

inputs = []
targets = []

for text, label in zip(texts, labels): # Accoppia le liste zip
    tokens = encoder.encode(text)
    if len(tokens) < config["max_len"]:
        tokens += [0] * (config["max_len"] - len(tokens))
    else:
        tokens = tokens[:config["max_len"]]
    inputs.append(tokens)
    targets.append(labels)

inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(labels, dtype=torch.float)

In [None]:
batch_size = 2 # Livello di blocchi
dataset_size = inputs.size(0)

indices = torch.randperm(dataset_size)
shuffled_inputs = inputs[indices]
shuffled_targets = targets[indices]

for start_idx in range(0, dataset_size, batch_size):
    end_idx = start_idx + batch_size
    batch_inputs = shuffled_inputs[start_idx:end_idx]
    batch_targets = shuffled_targets[start_idx:end_idx]


In [13]:
import torch.nn as nn
import torch

class Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, max_len = 100):
        super().__init__()
        
        self.embedding = nn.Embedding(encoder.n_vocab, embed_dim)
        
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_dim))
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model = embed_dim,
            nhead = num_heads,
            batch_first = True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = 1)
        
        self.fc = nn.Linear(embed_dim, 1)
    
    def forward(self, x):
        x = self.embedding(x)
        
        seq_len = x.size(1)
        
        x = x + self.pos_embedding[:, :seq_len, :]
        
        x = self.transformer_encoder(x)
        
        x = x.mean(dim=1)
        
        x = self.fc(x)
        
        return x

model = Model(
    encoder.n_vocab,
    config["embed_dim"],
    config["num_heads"],
    config["max_len"]
)

In [6]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

num_epochs = 20

for epoch in range(num_epochs):
    indices = torch.randperm(dataset_size)
    shuffled_inputs = inputs[indices]
    shuffled_targets = targets[indices]

    for start_idx in range(0, dataset_size, batch_size):
        end_idx = start_idx + batch_size
        batch_inputs = shuffled_inputs[start_idx:end_idx]
        batch_targets = shuffled_targets[start_idx:end_idx]

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs.squeeze(), batch_targets)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Epoch 1/20, Loss: 0.6868
Epoch 2/20, Loss: 0.6552
Epoch 3/20, Loss: 0.5603
Epoch 4/20, Loss: 0.2165
Epoch 5/20, Loss: 0.0920
Epoch 6/20, Loss: 0.2450
Epoch 7/20, Loss: 0.0115
Epoch 8/20, Loss: 0.0076
Epoch 9/20, Loss: 0.0062
Epoch 10/20, Loss: 0.0056
Epoch 11/20, Loss: 0.0047
Epoch 12/20, Loss: 0.0040
Epoch 13/20, Loss: 0.0043
Epoch 14/20, Loss: 0.0046
Epoch 15/20, Loss: 0.0034
Epoch 16/20, Loss: 0.0035
Epoch 17/20, Loss: 0.0029
Epoch 18/20, Loss: 0.0037
Epoch 19/20, Loss: 0.0027
Epoch 20/20, Loss: 0.0028


In [14]:
new_sentence = "Il concerto è stato bellissimo"
tokens = encoder.encode(new_sentence)

input_tensor = torch.tensor([tokens], dtype=torch.long)
model.eval()  
with torch.no_grad():
    output = model(input_tensor)
    prob = torch.sigmoid(output)
if prob.item() > 0.5:
    print("Predizione: Positivo, Probabilità:", prob.item())
else:
    print("Predizione: Negativo, Probabilità:", prob.item())


Predizione: Positivo, Probabilità: 0.5992962718009949
