<a href="https://colab.research.google.com/github/ramzesssina/NLPrespos/blob/main/%D0%9B%D0%B0%D0%B1%D0%BE%D1%80%D0%B0%D1%82%D0%BE%D1%80%D0%BD%D0%B0%D1%8F_%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D0%B0_3(%D0%9F%D1%83%D0%BD%D0%BA3GPT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Пункт 3 - Создание GPT**

In [112]:
import numpy as np

In [113]:
def sigmoid(x):
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))

def deriv_sigmoid(x):
    fx = sigmoid(x)
    return fx * (1 - fx)

In [114]:
class Neuron:
    def __init__(self, input_size):
        self.w = np.random.randn(input_size) * 0.1
        self.b = 0.0

    def feedforward(self, x):
        self.last_x = x
        self.last_total = np.dot(x, self.w) + self.b
        return sigmoid(self.last_total)

    def train(self, grad_output, lr=0.01):
        grad_total = deriv_sigmoid(self.last_total) * grad_output
        self.w -= lr * grad_total * self.last_x
        self.b -= lr * grad_total

In [139]:
class Head:
    def __init__(self, vocab_size, embed_dim, hidden_dim, lr=0.01):
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.lr = lr

        self.W_k = np.random.randn(embed_dim, hidden_dim) * 0.1
        self.W_q = np.random.randn(embed_dim, hidden_dim) * 0.1
        self.W_v = np.random.randn(embed_dim, hidden_dim) * 0.1

        # Линейный слой для предсказания
        self.W_out = np.random.randn(hidden_dim, 1) * 0.1
        self.b_out = np.zeros((1,))

    @staticmethod
    def softmax(x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def forward(self, x):
        self.x = x
        B, T, C = x.shape

        self.k = x @ self.W_k
        self.q = x @ self.W_q
        self.v = x @ self.W_v

        self.wei = np.matmul(self.q, self.k.transpose(0, 2, 1)) / np.sqrt(self.hidden_dim)
        mask = np.tril(np.ones((T, T)))
        self.wei = np.where(mask == 1, self.wei, -1e9)
        self.wei_softmax = self.softmax(self.wei)
        self.out = np.matmul(self.wei_softmax, self.v)  # (B, T, hidden_dim)

        # Предсказания
        self.pred = sigmoid(np.matmul(self.out, self.W_out) + self.b_out)  # (B, T, 1)
        return self.pred

    def train(self, dataset, epochs=1000):
        input_data = np.array([item[0] for item in dataset])
        target = np.array([item[1] for item in dataset])

        for epoch in range(epochs):
            pred = self.forward(input_data)

            # Loss: MSE
            loss = np.mean((pred - target) ** 2)

            # Градиенты для выходного слоя
            grad_pred = 2 * (pred - target) * deriv_sigmoid(pred)  # (B, T, 1)
            grad_W_out = np.sum(self.out[:, :, :, None] * grad_pred[:, :, None, :], axis=(0, 1))  # (hidden_dim, 1)
            grad_b_out = np.sum(grad_pred)

            # Обновление весов
            self.W_out -= self.lr * grad_W_out
            self.b_out -= self.lr * grad_b_out

            if epoch % 100 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.4f}")

    def test(self, x_input):
        x_input = np.array(x_input).reshape(1, 4, 2)
        pred = self.forward(x_input)
        print(np.round(pred.squeeze(), 2))

In [140]:
X = ([
    [[0, 0], [0, 0], [0, 0], [0, 1]],
    [[0, 0], [1, 0], [0, 0], [0, 1]],
    [[1, 0], [0, 0], [0, 0], [0, 0]],
    [[0, 0], [0, 0], [0, 0], [1, 0]],
    [[0, 0], [0, 0], [0, 0], [0, 0]],
    [[0, 0], [1, 0], [0, 0], [0, 0]],
])

Y = ([
    [[1], [1], [1], [1]],
    [[1], [1], [1], [1]],
    [[1], [1], [1], [1]],
    [[1], [1], [1], [1]],
    [[0], [0], [0], [0]],
    [[1], [1], [1], [1]],
])

In [141]:
print("Обучение\n")
dataset = list(zip(X, Y))
head = Head(2, 2, 4)
head.train(dataset, epochs=3000)

Обучение

Epoch 0: Loss = 0.2514
Epoch 100: Loss = 0.1405
Epoch 200: Loss = 0.1390
Epoch 300: Loss = 0.1388
Epoch 400: Loss = 0.1387
Epoch 500: Loss = 0.1387
Epoch 600: Loss = 0.1386
Epoch 700: Loss = 0.1385
Epoch 800: Loss = 0.1384
Epoch 900: Loss = 0.1384
Epoch 1000: Loss = 0.1383
Epoch 1100: Loss = 0.1382
Epoch 1200: Loss = 0.1381
Epoch 1300: Loss = 0.1380
Epoch 1400: Loss = 0.1380
Epoch 1500: Loss = 0.1379
Epoch 1600: Loss = 0.1378
Epoch 1700: Loss = 0.1378
Epoch 1800: Loss = 0.1377
Epoch 1900: Loss = 0.1376
Epoch 2000: Loss = 0.1375
Epoch 2100: Loss = 0.1375
Epoch 2200: Loss = 0.1374
Epoch 2300: Loss = 0.1373
Epoch 2400: Loss = 0.1373
Epoch 2500: Loss = 0.1372
Epoch 2600: Loss = 0.1371
Epoch 2700: Loss = 0.1370
Epoch 2800: Loss = 0.1370
Epoch 2900: Loss = 0.1369


In [142]:
print("Тест\n")
head.test([[1, 1], [1, 1], [1, 1], [1, 1]])

Тест

[0.85 0.85 0.85 0.85]
