In [None]:
# Colab-only setup: mount the user's Google Drive at /content/gdrive so that
# files stored there can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd

In [None]:
# Load the dataset and keep the raw texts as a plain Python list, in row order.
data = pd.read_csv('kaggle.csv')
texts = [entry for entry in data['text']]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are both restored from the same saved directory,
# so the path is spelled out once.
model_dir = '/content/gdrive/MyDrive/dl-model'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [None]:
# Empty mapping; not referenced by any of the cells visible in this notebook.
perplexity_dict = dict()

In [None]:
from tqdm.auto import tqdm
import math
import torch

In [None]:
# Score every text: split its tokens into windows the model can accept, score
# each window, and store the mean window score (one entry per text, in order).
#
# NOTE(review): `model` is loaded with AutoModelForSequenceClassification, so
# `outputs.logits` presumably holds one score per *class*, not per token. The
# value computed below is therefore not a language-model perplexity in the
# usual sense — confirm this is the intended metric.
perplexities = []

# Loop-invariant, so read it once: the longest sequence the model's position
# embeddings are configured for.
max_length = model.config.max_position_embeddings

for text in tqdm(texts):
    tokenized_text = tokenizer.tokenize(text)
    chunk_scores = []

    # Stepping by `max_length` yields only non-empty windows. The previous
    # `len // max_length + 1` count included an empty trailing window whenever
    # the token count was an exact multiple of `max_length`, which deflated
    # the average, and made the "no chunks" branch unreachable.
    for start in range(0, len(tokenized_text), max_length):
        chunk = tokenized_text[start:start + max_length]
        input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(chunk)).unsqueeze(0)

        with torch.no_grad():
            outputs = model(input_ids)

        predictions = outputs.logits
        num_tokens = input_ids.shape[1]
        log_probabilities = torch.nn.functional.log_softmax(predictions, dim=-1)
        perplexity_chunk = torch.exp(-torch.sum(log_probabilities) / num_tokens)
        chunk_scores.append(perplexity_chunk.item())

    if chunk_scores:
        perplexity_avg = sum(chunk_scores) / len(chunk_scores)
    else:
        # A text that produced no tokens cannot be scored.
        perplexity_avg = float('inf')

    perplexities.append(perplexity_avg)

  0%|          | 0/8263 [00:00<?, ?it/s]

In [None]:
# Attach the scores as a new column. `perplexities` was filled by iterating
# over `texts`, which was taken from data['text'], so it is expected to have
# one entry per row in row order.
data['perplexity'] = perplexities

In [None]:
# Persist the frame, including the new `perplexity` column, next to the notebook.
# NOTE(review): with default arguments pandas presumably also writes the row
# index as an extra leading column — confirm that downstream readers expect it.
data.to_csv('data+perplexity.csv')

In [None]:
# Put the model in evaluation mode. As the last expression of the cell, the
# returned module is also displayed, which is what prints the architecture.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, e

## Попытки имплементировать другие метрики

In [None]:
# Accumulator for the per-example EL2N scores.
el2n_scores = list()

In [None]:
import numpy as np

In [None]:
# EL2N (error L2-norm): for each example, the Euclidean distance between the
# model's predicted class-probability vector and the one-hot vector of the
# true label. Previously the score was |argmax index - label index|, i.e. a
# distance between arbitrary class ids, which is not EL2N.
# NOTE(review): EL2N is usually averaged over several models early in
# training; here a single loaded model is used — confirm that is intended.

# Start from an empty list so that re-running this cell does not append
# duplicate scores to the results of a previous run.
el2n_scores = []

# Dataset sentiment name -> class index. Built once instead of per row.
# NOTE(review): assumes this ordering matches the model's output classes.
sentiment_mapping2 = {'neutral': 0, 'negative': 1, 'positive': 2}

# `iterrows()` is a generator, so tqdm needs the total to draw a real bar.
for index, row in tqdm(data.iterrows(), total=len(data)):
    text = row['text']
    sentiment = row['sentiment']

    # Tokenize the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Class probabilities for this single text
    # (logits presumably have shape (1, num_labels) — TODO confirm).
    probabilities = torch.softmax(outputs.logits, dim=-1).squeeze(0)

    # One-hot encoding of the true label
    actual_sentiment_value = sentiment_mapping2[sentiment]
    target = torch.zeros_like(probabilities)
    target[actual_sentiment_value] = 1.0

    # Calculate EL2N score: L2 norm of the error vector
    el2n_score = torch.linalg.vector_norm(probabilities - target).item()

    # Append EL2N score to the list
    el2n_scores.append(el2n_score)

0it [00:00, ?it/s]

In [None]:
# Quick sanity check: show the first ten scores.
el2n_scores[:10]

[1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]

In [None]:
# No visible cell ever stores the scores in the frame, so reading the column
# directly raises KeyError. Attach them first (same pattern as the
# `perplexity` column), then display the column.
# Requires `el2n_scores` to hold exactly one score per row of `data`.
data['el2n'] = el2n_scores
data['el2n']

In [None]:
# Per-example correctness of the model on the data: 1 if the predicted
# sentiment equals the labelled sentiment, otherwise 0.
memorization_scores = []

# The EL2N cell treats the model's classes as generic "LABEL_<i>" names and
# maps index 0/1/2 to neutral/negative/positive. If the checkpoint's
# `id2label` carries those generic names, comparing them with the dataset's
# sentiment strings can never match, so translate them with the same ordering.
# NOTE(review): confirm this ordering matches how the model was trained.
fallback_id2sentiment = {0: 'neutral', 1: 'negative', 2: 'positive'}

# Iterate through each text in the training data; the total lets tqdm show
# real progress for this long-running loop.
for index, row in tqdm(data.iterrows(), total=len(data)):
    text = row['text']
    sentiment = row['sentiment']

    # Tokenize the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    predicted_label = torch.argmax(outputs.logits).item()
    predicted_sentiment = model.config.id2label[predicted_label]

    # Only generic names are translated; real label names pass through as-is.
    if predicted_sentiment == f"LABEL_{predicted_label}":
        predicted_sentiment = fallback_id2sentiment.get(predicted_label, predicted_sentiment)

    # Calculate accuracy for the individual text
    accuracy = 1 if predicted_sentiment == sentiment else 0

    # Append accuracy score to the list
    memorization_scores.append(accuracy)

KeyboardInterrupt: 