# Attentive Search-Driven Inference (ASDI)
«Inference-free sentiment classifier»,

«Learn-free attention-enhanced semantic indexing»,

«Meta-embedding refinement for semantic search».

In [4]:
!pip install pandas ace_tools

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting ace_tools
  Using cached ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached ace_tools-0.0-py3-none-any.whl (1.1 kB)
Downloading numpy-2.2.4-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
    --------------------------------------- 0.3/12.6 MB ? eta -:--:--
    --------------------------------------- 0.3/12.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/12

In [8]:
! pip install ace_tools



# Baseline

In [3]:
# Re-import required libraries after environment reset
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
from datasets import load_dataset

# Load dataset (SST2)
dataset = load_dataset("glue", "sst2")
train_texts = dataset["train"]["sentence"]
train_labels = dataset["train"]["label"]
test_texts = dataset["validation"]["sentence"]
test_labels = dataset["validation"]["label"]

# Load tokenizer and model (BERT base, uncased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()


# Function to extract CLS embedding from BERT
def get_cls_embedding(texts, tokenizer, model):
    embeddings = []
    batch_size = 32
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        cls_tokens = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_tokens)
    return np.vstack(embeddings)


# Generate embeddings for train and test sets
train_embeddings = get_cls_embedding(train_texts, tokenizer, model)
test_embeddings = get_cls_embedding(test_texts, tokenizer, model)

# Build FAISS index
dimension = train_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(train_embeddings)

# Perform k-NN search
k = 5
distances, indices = index.search(test_embeddings, k)

# Predict based on majority vote of k nearest neighbors
predicted_labels = []
for i in range(len(indices)):
    neighbor_labels = [train_labels[idx] for idx in indices[i]]
    majority_vote = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(majority_vote)

# Classification report
report = classification_report(test_labels, predicted_labels, output_dict=True)
report_df = pd.DataFrame(report).transpose()
selected_rows = ["0", "1", "accuracy", "macro avg", "weighted avg"]
print("\nClassification Report (VDB baseline using CLS token):")
print(report_df.loc[selected_rows])

# Confusion matrix visualization
conf_matrix = confusion_matrix(test_labels, predicted_labels)
fig = px.imshow(
    conf_matrix,
    labels=dict(x="Predicted Label", y="True Label", color="Count"),
    x=["Negative", "Positive"],
    y=["Negative", "Positive"],
    text_auto=True,
    title="Confusion Matrix: VDB Baseline (CLS Token)",
)

fig.show()


Classification Report (VDB baseline using CLS token):
              precision    recall  f1-score     support
0              0.772500  0.721963  0.746377  428.000000
1              0.747881  0.795045  0.770742  444.000000
accuracy       0.759174  0.759174  0.759174    0.759174
macro avg      0.760191  0.758504  0.758560  872.000000
weighted avg   0.759965  0.759174  0.758783  872.000000


In [None]:
# Добавляем предобученный модель внимания
# Результат - снижение точности
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
from datasets import load_dataset

# Load SST2 dataset
dataset = load_dataset("glue", "sst2")
train_texts = dataset["train"]["sentence"]
train_labels = dataset["train"]["label"]
test_texts = dataset["validation"]["sentence"]
test_labels = dataset["validation"]["label"]

# Load tokenizer and model (BERT base, uncased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()


# Function to extract embedding via self-attention refinement (mean of all token representations after last layer)
def get_attention_refined_embedding(texts, tokenizer, model):
    embeddings = []
    batch_size = 32
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        attention_mask = inputs["attention_mask"]
        with torch.no_grad():
            outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Apply attention-aware mean pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * mask_expanded, dim=1)
        sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
        refined_embeddings = (sum_embeddings / sum_mask).cpu().numpy()
        embeddings.append(refined_embeddings)
    return np.vstack(embeddings)


# Generate embeddings with attention-based refinement
train_embeddings = get_attention_refined_embedding(train_texts, tokenizer, model)
test_embeddings = get_attention_refined_embedding(test_texts, tokenizer, model)

# Build FAISS index
dimension = train_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(train_embeddings)

# Perform k-NN search
k = 5
distances, indices = index.search(test_embeddings, k)

# Predict based on majority vote of k nearest neighbors
predicted_labels = []
for i in range(len(indices)):
    neighbor_labels = [train_labels[idx] for idx in indices[i]]
    majority_vote = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(majority_vote)

# Classification report
report = classification_report(test_labels, predicted_labels, output_dict=True)
report_df = pd.DataFrame(report).transpose()
selected_rows = ["0", "1", "accuracy", "macro avg", "weighted avg"]
print("\nClassification Report (VDB + Attention Refinement):")
print(report_df.loc[selected_rows])

# Confusion matrix visualization
conf_matrix = confusion_matrix(test_labels, predicted_labels)
fig = px.imshow(
    conf_matrix,
    labels=dict(x="Predicted Label", y="True Label", color="Count"),
    x=["Negative", "Positive"],
    y=["Negative", "Positive"],
    text_auto=True,
    title="Confusion Matrix: VDB + Attention Refinement",
)

fig.show()

  from .autonotebook import tqdm as notebook_tqdm



Classification Report (VDB + Attention Refinement):
              precision    recall  f1-score     support
0              0.770053  0.672897  0.718204  428.000000
1              0.718876  0.806306  0.760085  444.000000
accuracy       0.740826  0.740826  0.740826    0.740826
macro avg      0.744464  0.739602  0.739145  872.000000
weighted avg   0.743995  0.740826  0.739529  872.000000


In [None]:
# Full experimental pipeline with trainable attention pooler and frozen BERT

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
import plotly.express as px
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 1. Load dataset
sst2 = load_dataset("glue", "sst2")
train_texts = sst2["train"]["sentence"]
train_labels = sst2["train"]["label"]
test_texts = sst2["validation"]["sentence"]
test_labels = sst2["validation"]["label"]

# 2. Tokenizer and frozen BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")
for param in bert.parameters():
    param.requires_grad = False


# 3. Attention Pooler
class AttentionPooler(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states, mask):
        scores = self.attention(hidden_states).squeeze(-1)  # (batch, seq_len)
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = F.softmax(scores, dim=-1)
        output = torch.sum(hidden_states * weights.unsqueeze(-1), dim=1)
        return output


# 4. Classification model
class SentimentModel(nn.Module):
    def __init__(self, bert, hidden_size):
        super().__init__()
        self.bert = bert
        self.pooler = AttentionPooler(hidden_size)
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.pooler(outputs.last_hidden_state, attention_mask)
        return self.classifier(pooled)


# 5. Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


# 6. Prepare data
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 7. Train attention pooler + classifier
model = SentimentModel(bert, hidden_size=768)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")

# 8. Evaluation
model.eval()
predicted_labels = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        predicted_labels.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# 9. Report
report = classification_report(true_labels, predicted_labels, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print("\nClassification Report (Attention-Pooler Fine-tuned):")
print(report_df.loc[["0", "1", "accuracy", "macro avg", "weighted avg"]])

# 10. Confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
fig = px.imshow(
    conf_matrix,
    labels=dict(x="Predicted", y="True", color="Count"),
    x=["Negative", "Positive"],
    y=["Negative", "Positive"],
    text_auto=True,
    title="Confusion Matrix: Fine-tuned Attention-Pooler",
)
fig.show()

100%|██████████| 2105/2105 [45:19<00:00,  1.29s/it]


Epoch 1, Loss: 0.4333


100%|██████████| 2105/2105 [47:27<00:00,  1.35s/it]


Epoch 2, Loss: 0.3346


100%|██████████| 2105/2105 [48:31<00:00,  1.38s/it]


Epoch 3, Loss: 0.3223

Classification Report (Attention-Pooler Fine-tuned):
              precision    recall  f1-score     support
0              0.901914  0.880841  0.891253  428.000000
1              0.887665  0.907658  0.897550  444.000000
accuracy       0.894495  0.894495  0.894495    0.894495
macro avg      0.894790  0.894249  0.894402  872.000000
weighted avg   0.894659  0.894495  0.894459  872.000000


In [None]:
# Чистая LLM
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from sklearn.metrics import classification_report, confusion_matrix
import plotly.express as px
import pandas as pd

# Загрузка той же архитектуры с дообученной головой
model_cls = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2")
model_cls.eval()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Предсказание на test_texts
batch_size = 32
predicted_labels = []

with torch.no_grad():
    for i in range(0, len(test_texts), batch_size):
        batch = test_texts[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
        outputs = model_cls(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predicted_labels.extend(preds.cpu().numpy())

# Оценка
report = classification_report(test_labels, predicted_labels, output_dict=True)
report_df = pd.DataFrame(report).transpose()
selected_rows = ["0", "1", "accuracy", "macro avg", "weighted avg"]
print("\nClassification Report (LLM BERT-base fine-tuned on SST-2):")
print(report_df.loc[selected_rows])

# Визуализация
conf_matrix = confusion_matrix(test_labels, predicted_labels)
fig = px.imshow(
    conf_matrix,
    labels=dict(x="Predicted Label", y="True Label", color="Count"),
    x=["Negative", "Positive"],
    y=["Negative", "Positive"],
    text_auto=True,
    title="Confusion Matrix: BERT-base Fine-tuned SST-2",
)

fig.show()