In [None]:
# Remove the listed packages without prompting (-y).
# NOTE(review): presumably they are dropped because they clash with the NumPy
# version pinned later in this notebook — confirm.
!pip uninstall -y sentence-transformers spacy thinc catalogue cymem preshed blis srsly

In [None]:
# Uninstall NumPy, then delete whatever numpy* entries remain under dist-packages.
# NOTE(review): the numpy* glob also matches any other package whose name starts
# with "numpy" — confirm that is acceptable.
# NOTE(review): if numpy was already imported in this kernel, a runtime restart is
# probably needed before the reinstalled version is picked up — verify.
!pip uninstall -y numpy
!rm -rf /usr/local/lib/python*/dist-packages/numpy*

In [None]:
# Install the Kaggle command-line client (quiet mode); it is used further down
# to download the competition data.
!pip install -q kaggle

In [None]:
# Install the training stack. numpy, transformers and peft are pinned to exact
# versions; datasets, accelerate and bitsandbytes are left unpinned.
!pip install -q numpy==1.26.4 \
               transformers==4.52.2 \
               peft==0.11.0 \
               datasets accelerate bitsandbytes

In [None]:
# Sanity check: print the installed versions next to the ones this notebook expects.
import numpy, transformers, peft, datasets

for _shown_name, _module in (
    ("NumPy", numpy),                # expected: 1.26.4
    ("Transformers", transformers),  # expected: 4.52.2
    ("peft", peft),                  # expected: 0.11.0
    ("datasets", datasets),          # expected: >= 2.14
):
    print(f"{_shown_name}:", _module.__version__)

In [None]:
# Directory the competition archive is extracted into and train.csv is read from.
# The trailing slash matters: paths below are built by plain string concatenation.
DATA_DIR = "data/cmdw/"
# Hugging Face checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME = "bert-base-multilingual-cased"

In [None]:
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c contradictory-my-dear-watson -p data

!unzip -q data/contradictory-my-dear-watson.zip -d data/cmdw

In [None]:
# Show what was extracted into DATA_DIR, or report that the folder is missing.
import os  # imported here so the cell also works on a fresh kernel

if os.path.exists(DATA_DIR):
    print("Файлы:")
    # sorted() keeps the listing stable between runs (os.listdir order is arbitrary).
    for f in sorted(os.listdir(DATA_DIR)):
        print(" -", f)
else:
    print("Папка не найдена")

In [None]:
# Quick look at the columns of the training table.
import pandas as pd  # pandas is not imported anywhere earlier in the notebook

df = pd.read_csv(DATA_DIR + "train.csv")
print(df.columns)

In [None]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [None]:
# Load the training table, hold out 30% for validation and wrap both parts as
# Hugging Face Datasets.
import pandas as pd  # imported here so the cell does not rely on hidden kernel state

df = pd.read_csv(DATA_DIR + "train.csv")

# Fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

# preserve_index=False: after the shuffle-split the pandas index is just the old
# row number; without this it would be stored as an extra "__index_level_0__" column.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

In [None]:
# Tokenizer that belongs to the checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(example):
    """Encode premise/hypothesis pairs as fixed-length model inputs.

    Used through ``Dataset.map(..., batched=True)`` below, so ``example`` carries
    the "premise" and "hypothesis" columns of a batch of rows. Each pair is
    truncated or padded to exactly 128 tokens.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["premise"], example["hypothesis"], **encode_options)


# Tokenize both splits (train first, then validation).
train_ds, val_ds = (split.map(tokenize, batched=True) for split in (train_ds, val_ds))

In [None]:
# Columns exposed to the training loop as torch tensors. token_type_ids is
# included so the segment ids the tokenizer produces for (premise, hypothesis)
# pairs reach the model during training as well — the inference cell already
# passes them via **sample, so leaving them out here made the two disagree.
MODEL_COLUMNS = ("input_ids", "token_type_ids", "attention_mask", "labels")


def _as_torch_split(split):
    """Rename the target column to "labels" and expose the model inputs as tensors.

    Returns the (possibly renamed) dataset; the tensor format is set on it in place.
    """
    # Skip the rename when this cell is re-run on an already renamed split.
    if "labels" not in split.column_names:
        split = split.rename_column("label", "labels")
    # Only request columns that exist: not every tokenizer emits token_type_ids.
    present = [name for name in MODEL_COLUMNS if name in split.column_names]
    split.set_format("torch", columns=present)
    return split


train_ds = _as_torch_split(train_ds)
val_ds   = _as_torch_split(val_ds)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

# Checkpoint loaded with a 3-label sequence-classification head.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# LoRA settings: rank-16 adapters on the modules named "query" and "value",
# no bias training, light dropout on the adapter path.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
lora_cfg = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()


In [None]:
from transformers import TrainingArguments, Trainer
import torch

BATCH = 8        # per-device batch size, used for both training and evaluation
LORA_EPOCHS = 3  # number of passes over the training split

# Mixed precision is switched on only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    # where checkpoints go / how progress is reported
    output_dir="/content/lora_ckpt",
    report_to="none",
    logging_steps=100,
    # optimisation schedule
    num_train_epochs=LORA_EPOCHS,
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    gradient_accumulation_steps=2,  # 2 x BATCH samples per optimizer step per device
    fp16=use_fp16,
    # evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
)

In [None]:
# Wire the LoRA-wrapped model, the training arguments and both splits into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
from peft import PeftModel
from transformers import AutoModelForSequenceClassification

# Save only the trained adapter (plus whatever PEFT stores with it) to disk.
ADAPTER_DIR = DATA_DIR + "lora_nli_mbert"
model.save_pretrained(ADAPTER_DIR)

# get_peft_model() modified base_model in place, so it already contains LoRA
# layers and is no longer a plain checkpoint. Reload a clean copy of the
# checkpoint and attach the saved adapter to that before merging.
clean_base = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
)
merged = PeftModel.from_pretrained(clean_base, ADAPTER_DIR).merge_and_unload()

# Write the merged, adapter-free model as a regular transformers checkpoint.
MERGED_DIR = DATA_DIR + "merged-mbert-nli"
merged.save_pretrained(MERGED_DIR)
# Store the tokenizer next to the weights so the folder (and the Hub repo it is
# uploaded to) can be loaded on its own.
tokenizer.save_pretrained(MERGED_DIR)

In [None]:
# Smoke test: classify one hand-written premise/hypothesis pair with the merged model.
import torch

merged.eval()  # make sure dropout is off for prediction
device = next(merged.parameters()).device
sample = tokenizer(
    "Cats are animals.",
    "A cat is a living being.",
    return_tensors="pt"
).to(device)

# No gradients are needed for a prediction.
with torch.no_grad():
    probs = merged(**sample).logits.softmax(-1)[0]

# NOTE(review): assumes class ids 0/1/2 mean entailment/neutral/contradiction in
# the training CSV — verify against the dataset description.
labels = ["Entailment", "Neutral", "Contradiction"]
print("Предсказание:", labels[int(probs.argmax())], "| conf:", float(probs.max()))

In [None]:
# Publish the merged model folder to the Hugging Face Hub.
# Requires Hub credentials with write access to be configured in this environment.
from huggingface_hub import HfApi

api = HfApi()

REPO_ID = "shapiropoly/merged-mbert-nli"

# Create the target repo on first use; exist_ok=True makes re-runs a no-op.
api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)

# Reuse the client created above instead of instantiating a second one.
api.upload_folder(
    folder_path=MERGED_DIR,
    repo_id=REPO_ID,
    repo_type="model"
)