In [1]:
!pip install tree_sitter


Collecting tree_sitter
  Downloading tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Downloading tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (575 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/575.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/575.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m573.4/575.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.6/575.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree_sitter
Successfully installed tree_sitter-0.24.0


In [6]:
!pip uninstall -y tree_sitter
!pip install tree_sitter==0.20.1 tree_sitter_languages==1.5.1


Found existing installation: tree-sitter 0.24.0
Uninstalling tree-sitter-0.24.0:
  Successfully uninstalled tree-sitter-0.24.0
Collecting tree_sitter==0.20.1
  Downloading tree_sitter-0.20.1.tar.gz (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.2/126.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement tree_sitter_languages==1.5.1 (from versions: 1.5.0, 1.6.1, 1.7.0, 1.8.0, 1.9.1, 1.10.0, 1.10.2)[0m[31m
[0m[31mERROR: No matching distribution found for tree_sitter_languages==1.5.1[0m[31m
[0m

In [8]:
import re

def simple_code_tokenizer(code):
    pattern = r'''
        [a-zA-Z_]\w*     |  # từ khóa, tên biến
        \d+              |  # số
        ==|!=|<=|>=      |  # toán tử
        [+\-*/=;{}(),<>] |  # ký tự đặc biệt
        "[^"]*"          |  # chuỗi
        .                # còn lại
    '''
    tokens = re.findall(pattern, code, re.VERBOSE)
    return tokens
# Ví dụ: đoạn mã sai (thiếu dấu ; sau 5)
buggy_code = 'void main() { int a = 5 printf("Value: %d", a); }'
fixed_code = 'void main() { int a = 5; printf("Value: %d", a); }'

buggy_tokens = simple_code_tokenizer(buggy_code)
fixed_tokens = simple_code_tokenizer(fixed_code)

print("Buggy:", buggy_tokens)
print("Fixed:", fixed_tokens)
from collections import Counter

all_tokens = buggy_tokens + fixed_tokens
vocab = ['<PAD>', '<UNK>'] + list(set(all_tokens))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

def encode(tokens, max_len=30):
    return [word2idx.get(t, word2idx['<UNK>']) for t in tokens] + [word2idx['<PAD>']] * (max_len - len(tokens))

X = encode(buggy_tokens)
Y = encode(fixed_tokens)


Buggy: ['void', ' ', 'main', '(', ')', ' ', '{', ' ', 'int', ' ', 'a', ' ', '=', ' ', '5', ' ', 'printf', '(', '"Value: %d"', ',', ' ', 'a', ')', ';', ' ', '}']
Fixed: ['void', ' ', 'main', '(', ')', ' ', '{', ' ', 'int', ' ', 'a', ' ', '=', ' ', '5', ';', ' ', 'printf', '(', '"Value: %d"', ',', ' ', 'a', ')', ';', ' ', '}']


In [9]:
import torch
import torch.nn as nn

class CodeFixLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, num_layers=2):
        super(CodeFixLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out


In [10]:
model = CodeFixLSTM(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

inputs = torch.tensor([X])
targets = torch.tensor([Y])

for epoch in range(300):
    model.train()
    outputs = model(inputs)
    loss = criterion(outputs.view(-1, len(vocab)), targets.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


Epoch 0, Loss: 2.8394
Epoch 50, Loss: 0.6483
Epoch 100, Loss: 0.0445
Epoch 150, Loss: 0.0154
Epoch 200, Loss: 0.0088
Epoch 250, Loss: 0.0058


In [11]:
model.eval()
with torch.no_grad():
    output = model(inputs)
    predicted = torch.argmax(output, dim=2).squeeze().tolist()

pred_tokens = [idx2word[idx] for idx in predicted if idx != word2idx['<PAD>']]
print("🔧 Đoạn mã đã sửa:")
print(" ".join(pred_tokens))


🔧 Đoạn mã đã sửa:
void   main ( )   {   int   a   =   5 ;   printf ( "Value: %d" ,   a ) ;   }
