In [1]:
!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Collecting transformers>=4.42.3
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting accelerate
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.34.2-py3-none-any.whl (324 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import pandas as pd
import pandas as pd
import torch
import functools
from torch.nn import functional as F
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
)
from peft import LoraConfig, PeftModel
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
@dataclass
class Config:
    """Inference settings shared by the cells of this notebook.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. Without annotations the decorator sees no fields at
    all: the generated ``__init__`` accepts no overrides and ``repr``/``==``
    ignore the values. With them, ``Config(batch_size=8)`` works and the
    defaults are still readable as ``Config.batch_size``.
    """

    # Base checkpoint identifier handed to the tokenizer/model ``from_pretrained``.
    gemma_dir: str = 'unsloth/gemma-2-9b-it-bnb-4bit'
    # Directory of the trained adapter handed to ``PeftModel.from_pretrained``.
    lora_dir: str = '/kaggle/input/rucodeh/output/checkpoint-256'
    # Intended maximum sequence length in tokens (assumed from the name).
    max_length: int = 2048
    # Number of examples per DataLoader batch.
    batch_size: int = 4
    # Device the model is placed on (used as ``device_map``).
    device: torch.device = torch.device('cuda')


cfg = Config()

In [4]:
# Load the private test split.
test = pd.read_csv('/kaggle/input/private-rucode-c/private_test.csv')

# Build the model input as "<title> <sep> <description> <sep> <plot>".
# `na_rep=''` matters: without it a single missing field makes the whole
# concatenation NaN, so the row would lose the fields that ARE present
# (and would later be stringified to the literal text 'nan').
test['text'] = test['Фильм'].str.cat(
    [test['Описание'], test['Сюжет']],
    sep=' <sep> ',
    na_rep='',
)

In [5]:
# Load the fast Gemma tokenizer from the same location as the base model.
tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
# Request an EOS token on every encoded sequence — presumably mirrors the
# training-time tokenizer setup; TODO confirm against the training notebook.
tokenizer.add_eos_token = True
# NOTE(review): `add_sep_token` does not look like a documented option of
# GemmaTokenizerFast; this may only set a plain attribute — confirm it has an effect.
tokenizer.add_sep_token = True
# Right-side padding, consistent with the manual right padding done in collate_fn.
tokenizer.padding_side = 'right'

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

In [6]:
# Keyword arguments for the base classifier: 14 output labels, half-precision
# weights, placed on the configured device, KV cache disabled.
base_model_kwargs = {
    'num_labels': 14,
    'torch_dtype': torch.float16,
    'device_map': cfg.device,
    'use_cache': False,
}
model = Gemma2ForSequenceClassification.from_pretrained(cfg.gemma_dir, **base_model_kwargs)

# Wrap the base model with the trained adapter stored in cfg.lora_dir.
# NOTE(review): the load log reports `score.weight` as newly initialized; the
# adapter checkpoint presumably supplies the trained head — confirm.
model = PeftModel.from_pretrained(model, cfg.lora_dir)

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Wrap the test texts in a single-split DatasetDict; every value is passed
# through str() so non-string entries (e.g. NaN) become plain text.
ds = DatasetDict({
    'test': Dataset.from_dict({'text': list(map(str, test['text'].tolist()))}),
})

In [8]:
def tokenize_examples(examples, tokenizer, max_length=None):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Args:
        examples: mapping with a ``'text'`` entry (a batch, since the map call
            below uses ``batched=True``).
        tokenizer: callable tokenizer applied to ``examples['text']``.
        max_length: optional cap on the number of tokens per example. When
            ``None`` (the default) the texts are tokenized without truncation,
            which is the previous behaviour.

    Returns:
        The tokenizer's output for the batch, unchanged.
    """
    if max_length is None:
        return tokenizer(examples['text'])
    # Bound the sequence length so one very long text cannot blow up the
    # memory needed for its batch.
    return tokenizer(examples['text'], truncation=True, max_length=max_length)


# cfg.max_length was configured but never applied; pass it through here.
ds = ds.map(
    functools.partial(tokenize_examples, tokenizer=tokenizer, max_length=cfg.max_length),
    batched=True,
)
ds = ds.with_format('torch')

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

In [9]:
def collate_fn(batch):
    """Right-pad the examples of ``batch`` into rectangular tensors.

    Args:
        batch: sequence of examples, each with 1-D tensor entries
            ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        dict with ``'input_ids'`` (padded with ``tokenizer.pad_token_id``) and
        ``'attention_mask'`` (padded with 0), both batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    ids, masks = [], []
    for example in batch:
        ids.append(example['input_ids'])
        masks.append(example['attention_mask'])

    return {
        'input_ids': pad(ids, batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad(masks, batch_first=True, padding_value=0),
    }

In [10]:
# Batch the tokenized test split. No shuffle argument is passed, so the
# loader's default ordering is used; predictions are later written back to
# `test` positionally, so that order must be preserved.
test_dataloader = DataLoader(
    ds['test'],
    collate_fn=collate_fn,
    batch_size=cfg.batch_size,
)

In [11]:
from tqdm import tqdm

In [12]:
# Run the classifier over the whole test loader and collect per-label logits.
model.eval()
logit_chunks = []

with torch.no_grad():  # inference only: no autograd bookkeeping
    for batch in tqdm(test_dataloader):
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        logits = model(**inputs).logits
        # The model is loaded with torch_dtype=float16, so the logits may be
        # half precision. Upcast before moving to CPU so that the sigmoid and
        # the later threshold comparison run in float32 rather than relying on
        # CPU half-precision kernels.
        logit_chunks.append(logits.float().cpu())

all_logits = torch.cat(logit_chunks, dim=0)
# Independent per-label probabilities (multi-label setup: sigmoid, not softmax).
probs = torch.sigmoid(all_logits).numpy()

100%|██████████| 31/31 [18:20<00:00, 35.50s/it]


In [13]:
# Probability cut-off above which a label counts as predicted.
threshold = 0.2
# 0/1 indicator matrix with the same shape as `probs`.
predictions = np.greater(probs, threshold).astype(int)

In [14]:
# Recover the label vocabulary from the training data: each row's genre
# string is split on ', ' into a list of genre names.
train = pd.read_csv('/kaggle/input/rucode-movies/movie_genres/train.csv')
train['Жанры'] = train['Жанры'].map(lambda genre_str: genre_str.split(', '))

# Fit the binarizer on the training genres, then map the 0/1 prediction
# matrix back to tuples of genre names.
mlb = MultiLabelBinarizer()
mlb.fit(train['Жанры'])
predicted_genres = mlb.inverse_transform(predictions)

# Join each row's genres into one string; rows with no predicted genre
# become NaN instead of an empty string.
test['Жанры'] = list(map(', '.join, predicted_genres))
test['Жанры'] = test['Жанры'].replace('', np.nan)

# NOTE(review): the DataFrame index is written as the first CSV column
# (no index=False) — confirm the submission format expects that.
test.loc[:, ['Фильм', 'Жанры']].to_csv('2epoch_gemma2_dora_freeze8_thold0_2.csv')