In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW

In [2]:
videos_df = pd.read_parquet('videos.parquet',
    engine='fastparquet',
    columns=['video_id', 'video_title'])
features_df = pd.read_parquet('features.parquet', engine = "fastparquet")


In [3]:
# Объединить данные из разных файлов по video_id
merged_df = pd.merge(features_df, videos_df, on='video_id')
del features_df
del videos_df
automarkup_df = pd.read_parquet('automarkup.parquet', engine = "fastparquet")


In [None]:
# Выбрать нужные колонки для обучения
input_cols = ['video_title', 'v_pub_datetime']
target_col = 'query'
df = automarkup_df[[target_col, 'video_id']].merge(merged_df[input_cols + ['video_id']], on='video_id')
# Удалить пропущенные значения
del automarkup_df


df.dropna(subset=[target_col, 'video_title', 'v_pub_datetime'], inplace=True)

In [None]:
# Инициализировать токенизатор BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Определить максимальную длину последовательности
max_seq_length = 128
# Создать класс датасета
class VideoDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        video_title = row['video_title']
        v_pub_datetime = row['v_pub_datetime']
        query = row[target_col]
        
        # Преобразование текстового значения query в числовую метку
        query_label = query_to_label[query]
        
        inputs = self.tokenizer.encode_plus(
            video_title,
            v_pub_datetime,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        
        return input_ids, attention_mask, torch.tensor(query_label)



    def collate_fn(self, batch):
        input_ids_list, attention_mask_list, query_list = zip(*batch)
        input_ids_list = torch.stack(input_ids_list, dim=0)
        attention_mask_list = torch.stack(attention_mask_list, dim=0)
    
        return input_ids_list, attention_mask_list, query_list

    
    





# Создать загрузчик данных
batch_size = 32
dataset = VideoDataset(df, tokenizer, max_seq_length)
del df
del tokenizer
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.collate_fn)
del dataset


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class VideoSearchModel(nn.Module):
    def __init__(self, num_classes):
        super(VideoSearchModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits
# Инициализировать модель
num_classes = 1
model = VideoSearchModel(num_classes)
# Перенести модель на GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

VideoSearchModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
# Определить функцию потерь и оптимизатор
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
# Обучение модели
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    for input_ids, attention_mask, targets in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask)
        
        loss = criterion(outputs.squeeze(), targets.float())
        
        loss.backward()
        optimizer.step()



ValueError: invalid literal for int() with base 10: 'Интервью Димы Матвеева'

In [None]:
torch.save(model.state_dict(), 'video_search_model.pth')

In [None]:
# Загрузить сохраненную модель
model = VideoSearchModel(num_classes)
model.load_state_dict(torch.load('video_search_model.pth'))
model.to(device)
# Функция для выполнения поиска видео
def search_videos(queries):
    model.eval()
    results = []
    
    for query in queries:
        input_ids, attention_mask, _ = dataset[0]
        input_ids = input_ids.unsqueeze(0).to(device)
        attention_mask = attention_mask.unsqueeze(0).to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            prediction = torch.sigmoid(outputs).item()
            
        results.append((prediction, query))
    
    return results
# Пример использования
queries = ['query1', 'query2', 'query3']
results = search_videos(queries)
# Сохранить результаты в файл csv
results_df = pd.DataFrame(results, columns=['id_video', 'query'])
results_df.to_csv('results.csv', index=False)