### Baseline для хакатона Rutube по задаче "Теггирование видео"

В рамках данного ноутбука мы рассмотрим наивный подход к решению поставленной задачи: векторный поиск навания видео в базе векторов тегов.

В конце есть пример получения sample_submission.csv - пример файла, который нужно загрузить на лидерборд.


In [3]:
!ls

IAB_tags.csv		      download_trimmed_dataset.ipynb
baseline-newembeddings.ipynb  requirements.txt
baseline.ipynb		      train_data_categories.csv


In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import json
from tqdm.autonotebook import tqdm
import numpy as np 
import faiss

  from tqdm.autonotebook import tqdm, trange


#### Берем данные с id видео и его названием, также загружаем иерархические теги

In [None]:
locals()

In [2]:
data = pd.read_csv("baseline/train_data_categories.csv",).dropna()[['video_id', 'title']]
taxonomy = pd.read_csv("baseline/IAB_tags.csv")

print(data.columns)
print(data.head(5))

print(taxonomy.head(5))

print(taxonomy.columns)

Index(['video_id', 'title'], dtype='object')
                           video_id  \
0  9007f33c8347924ffa12f922da2a179d   
1  9012707c45233bd601dead57bc9e2eca   
2  e01d6ebabbc27e323fa1b7c581e9b96a   
3  a00b145242be3ebc3b311455e94917af   
4  b01a682bf4dfcc09f1e8fac5bc18785a   

                                               title  
0  Пацанский клининг. Шоу «ЧистоТачка» | Повелите...  
1  СarJitsu. 3 сезон, 6 серия. Нарек Симонян vs Ж...  
2  Злые языки | Выпуск 1, Сезон 1 | Непорочность ...  
3                 $1000 шоу | 1 выпуск | Автобоулинг  
4                    В РОТ МНЕ НОТЫ #1 ВИТА ЧИКОВАНИ  
  Уровень 1 (iab)         Уровень 2 (iab)      Уровень 3 (iab)
0       Транспорт                     NaN                  NaN
1       Транспорт  Типы кузова автомобиля                  NaN
2       Транспорт  Типы кузова автомобиля  Грузовой автомобиль
3       Транспорт  Типы кузова автомобиля                Седан
4       Транспорт  Типы кузова автомобиля            Универсал
Index(['Уров

#### Для создания эмбеддинга берем русскоязычный Берт и загружаем в sentence transformer, который позволяет создавать эмбеддинг для всего предложения и сам обрезает его до максимально возможного числа токенов

In [6]:
model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence', )
dim = 768 # размер вектора эмбеддинга

No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.


#### Создаем эмбеддинги для названий видео

In [7]:
data['title_vector'] = data['title'].apply(lambda l: model.encode(l, convert_to_tensor=True).cpu().numpy())

#### Создаем векторы для тегов:
 Для каждого 1 уровня иерархии в отдельности и для следующих уровней формата уровень 1: уровень 2: уровень 3 

In [4]:
def get_tags():
    tags = {}
    for i, row in tqdm(taxonomy.iterrows()):
        if isinstance(row['Уровень 1 (iab)'], str):
            tags[row['Уровень 1 (iab)']] = model.encode(row['Уровень 1 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
        if isinstance(row['Уровень 2 (iab)'], str):
            tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']] = model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
        if isinstance(row['Уровень 3 (iab)'], str):
            tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)']] = model.encode(row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)'], convert_to_tensor=True).cpu().numpy()#.tolist()
    return tags

# tags = get_tags()
# tags_list = list(tags.keys())
# vectors = np.array(list(tags.values()))

### Создаем векторную базу faiss для эффективного векторного поиска

In [9]:
index = faiss.index_factory(dim, "Flat", faiss.METRIC_INNER_PRODUCT)
print(index.ntotal)
index.add(vectors)
print(index.ntotal)

0
610


#### Смотрим несколько получившихся примеров 
Генерим по 3 близких предсказания для каждого названия видео

In [10]:
topn = 3
scores, predictions = index.search(np.array(data['title_vector'].to_list()[:10]), topn)
for j, i in enumerate(predictions):
    print("SCORES", scores[j])
    print("PREDICTION_by_title", np.array(tags_list)[predictions[j]])
    print("SAMPLE", data['title'].to_list()[:10][j])
    print("\n")

SCORES [280.2826  268.9419  264.77997]
PREDICTION_by_title ['Семья и отношения: Развод' 'Массовая культура: Смерти знаменитостей'
 'Массовая культура: Скандалы знаменитостей']
SAMPLE Пацанский клининг. Шоу «ЧистоТачка» | Повелитель ночи | Выпуск 17


SCORES [248.28537 242.65976 241.96161]
PREDICTION_by_title ['Массовая культура: Скандалы знаменитостей' 'Игры: Киберспорт'
 'Спорт: Дартс']
SAMPLE СarJitsu. 3 сезон, 6 серия. Нарек Симонян vs Жека Секси


SCORES [291.92664 287.7805  280.69467]
PREDICTION_by_title ['Семья и отношения: Развод'
 'Медицина: Медицинские направления: Простуда и грипп'
 'Новости и политика: Политика: Война и конфликты']
SAMPLE Злые языки | Выпуск 1, Сезон 1 | Непорочность Даны Борисовой


SCORES [247.24628 243.45216 236.21658]
PREDICTION_by_title ['Музыка и аудио: Мировые хиты'
 'События и достопримечательности: Личные события: День рождения'
 'Массовая культура: Скандалы знаменитостей']
SAMPLE $1000 шоу | 1 выпуск | Автобоулинг


SCORES [276.93286 271.74152 269.

#### Для формирования sample_submission будем брать только наилучшее предсказания для каждого видео
Сейчас у вас уже есть sample_submission с нужными для скоринга video_id, но пока нет информации о видео, она появится ближе к концу хакатона
Для примера прогоним через весь train dataset

In [11]:
topn=1
sample_submission = pd.DataFrame(data=data['video_id'].to_list(), columns=['video_id'])
sample_submission['predicted_tags']=np.nan
sample_submission['predicted_tags'] = sample_submission['predicted_tags'].astype('object')

for i, row in data.iterrows():
    scores, predictions = index.search(np.array([row['title_vector']]), topn)
    index_i = sample_submission[sample_submission.video_id == row.video_id].index
    sample_submission.at[index_i[0], 'predicted_tags'] = [tags_list[predictions[0][0]]] # вытаскиваем предсказание из 

In [29]:
print(sample_submission.head(5))


                           video_id  \
0  9007f33c8347924ffa12f922da2a179d   
1  9012707c45233bd601dead57bc9e2eca   
2  e01d6ebabbc27e323fa1b7c581e9b96a   
3  a00b145242be3ebc3b311455e94917af   
4  b01a682bf4dfcc09f1e8fac5bc18785a   

                                          tags  
0                  [Семья и отношения: Развод]  
1  [Массовая культура: Скандалы знаменитостей]  
2                  [Семья и отношения: Развод]  
3               [Музыка и аудио: Мировые хиты]  
4             [Спорт: Спортивное оборудование]  


#### В predicted_tags нужно записывать list тегов, например ['Карьера: Cтажировки', 'Карьера: Составление резюме'] или ['Массовая культура: Сериалы']

In [25]:
sample_submission.to_csv("sample_submission.csv", index_label=0)

In [19]:

!ls


IAB_tags.csv			requirements.txt
baseline-newembeddings.ipynb	train_data_categories.csv
baseline.ipynb			скрипт_проверки_Rutube.ipynb
download_trimmed_dataset.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = tag.split(": ")
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list

In [122]:
import pandas as pd
import argparse
import ast
import numpy as np

def iou_metric(ground_truth, predictions):
    iou =  len(set.intersection(set(ground_truth), set(predictions)))
    iou = iou/(len(set(ground_truth).union(set(predictions))))
    print(iou, ground_truth, predictions)
    return iou

def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = [tag.strip().lower() for tag in tag.split(":")]
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list


def find_iou_for_sample_submission(pred_submission, true_submission):
    ground_truth_df = true_submission
    ground_truth_df["tags"] = ground_truth_df["tags"].apply(lambda l: l.split(', '))
    ground_truth_df["tags_split"] = ground_truth_df["tags"].apply(lambda l: split_tags(l))

    predictions_df = pred_submission
    # predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(ast.literal_eval)
    predictions_df["predicted_tags_split"] = predictions_df["predicted_tags"].apply(lambda l: split_tags(l))
    iou=0
    counter = 0
    for i, row in ground_truth_df.iterrows():
        predicted_tags = predictions_df[predictions_df["video_id"]==row["video_id"]]["predicted_tags_split"].values[0]
        iou_temp=iou_metric(row['tags_split'], predicted_tags)
        iou+=iou_temp
        counter+=1

    return iou/counter


# if __name__ == '__main__':

try:
    pred_submission = sample_submission
except Exception:
    assert False, 'Ошибка при загрузке решения участника'
try:
    true_submission = pd.read_csv("baseline/train_data_categories.csv").dropna()
except Exception:
    assert False, 'Ошибка при загрузке эталонного решения'


final_score = find_iou_for_sample_submission(pred_submission, true_submission)
print("FINAL_SCORE", final_score ) #final_score)


AssertionError: Ошибка при загрузке решения участника

##  knn on embeddings


In [5]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier


In [16]:
categories = pd.read_csv("baseline/IAB_tags.csv")
categories.head(5)

Unnamed: 0,Уровень 1 (iab),Уровень 2 (iab),Уровень 3 (iab)
0,Транспорт,,
1,Транспорт,Типы кузова автомобиля,
2,Транспорт,Типы кузова автомобиля,Грузовой автомобиль
3,Транспорт,Типы кузова автомобиля,Седан
4,Транспорт,Типы кузова автомобиля,Универсал


In [130]:
def create_tags_to_labels():
    tags = {}
    for i, row in tqdm(taxonomy.iterrows()):
        if isinstance(row['Уровень 3 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()+": "+row['Уровень 3 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 2 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()+ ": "+row['Уровень 2 (iab)'].strip().lower()] = i
        elif isinstance(row['Уровень 1 (iab)'], str):
            tags[row['Уровень 1 (iab)'].strip().lower()] = i
        # if isinstance(row['Уровень 1 (iab)'], str):
        #     # tags[i] = row['Уровень 1 (iab)']
        #     tags.append(row['Уровень 1 (iab)'].strip())
        # if isinstance(row['Уровень 2 (iab)'], str):
        #     # tags[i] = row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']
        #     tags.append(row['Уровень 1 (iab)'].strip()+ ": "+row['Уровень 2 (iab)'].strip())
        # if isinstance(row['Уровень 3 (iab)'], str):
        #     # tags[row['Уровень 1 (iab)']+ ": "+row['Уровень 2 (iab)']+": "+row['Уровень 3 (iab)']] = i
        #     tags.append(row['Уровень 1 (iab)'].strip()+ ": "+row['Уровень 2 (iab)'].strip()+": "+row['Уровень 3 (iab)'].strip())
    return tags

# labels_to_tags = create_labels_tags()
tags_to_labels = create_tags_to_labels()
tags_to_labels["фильмы и анимация: фильмы и анимация"] = tags_to_labels["фильмы и анимация"]
labels_to_tags = {str(v): k for k, v in tags_to_labels.items()}

0it [00:00, ?it/s]

In [7]:
class EmbeddingStorage:
    def __init__(self, labels=None, filenames=None, embeddings=None):
        """
        Initialize the EmbeddingStorage class.
        
        Args:
            labels (list or np.ndarray): An array of labels for the embeddings.
            filenames (list or np.ndarray): An array of filenames associated with the embeddings.
            embeddings (np.ndarray): A NumPy array containing all embeddings.
        """
        self.labels = np.array(labels) if labels is not None else np.array([])
        self.filenames = np.array(filenames) if filenames is not None else np.array([])
        self.embeddings = np.array(embeddings) if embeddings is not None else np.empty((0,))

    def add_embedding(self, label, filename, embedding):
        """
        Add a new embedding, along with its label and filename.
        
        Args:
            label (int or str): The label of the embedding.
            filename (str): The filename associated with the embedding.
            embedding (np.ndarray or torch.Tensor): The embedding to add (can be a NumPy array or Tensor).
        """
        if isinstance(embedding, np.ndarray):
            emb_array = embedding
        else:
            # Convert torch.Tensor to NumPy
            emb_array = embedding.cpu().numpy()
        
        # Append the new data
        self.labels = np.append(self.labels, label)
        self.filenames = np.append(self.filenames, filename)
        
        if self.embeddings.size == 0:
            self.embeddings = emb_array.reshape(1, -1)
        else:
            self.embeddings = np.vstack([self.embeddings, emb_array])

    def save_to_file(self, file_path):
        """
        Save the embeddings, labels, and filenames to a file (as .npz).
        
        Args:
            file_path (str): The path to save the .npz file.
        """
        np.savez(file_path, labels=self.labels, filenames=self.filenames, embeddings=self.embeddings)

    @classmethod
    def load_from_file(cls, file_path):
        """
        Load embeddings, labels, and filenames from a saved .npz file.
        
        Args:
            file_path (str): The path to the .npz file to load.
        
        Returns:
            EmbeddingStorage: An instance of EmbeddingStorage with loaded data.
        """
        data = np.load(file_path)
        return cls(labels=data['labels'], filenames=data['filenames'], embeddings=data['embeddings'])

    def get_embedding_by_filename(self, filename):
        """
        Retrieve an embedding by its associated filename.
        
        Args:
            filename (str): The filename to search for.
        
        Returns:
            np.ndarray: The corresponding embedding or None if not found.
        """
        if filename in self.filenames:
            idx = np.where(self.filenames == filename)[0][0]
            return self.embeddings[idx]
        else:
            return None

    def join_on_videoname(self, other_storage):
        """
        Join two EmbeddingStorage objects on the 'videoname' (filename). The embeddings will be stored as tuples.
        
        Args:
            other_storage (EmbeddingStorage): Another EmbeddingStorage object to join with.
        
        Returns:
            EmbeddingStorage: A new EmbeddingStorage object with merged data (embedding tuples).
        """
        # Find common filenames
        common_filenames = np.intersect1d(self.filenames, other_storage.filenames)
        
        # Initialize lists to store merged data
        merged_labels = []
        merged_filenames = []
        merged_embeddings = []
        
        for filename in common_filenames:
            # Get embeddings for the common filename from both storages
            idx_self = np.where(self.filenames == filename)[0][0]
            idx_other = np.where(other_storage.filenames == filename)[0][0]
            
            emb_self = self.embeddings[idx_self]
            emb_other = other_storage.embeddings[idx_other]
            
            # Store embeddings as a tuple
            merged_embedding = (emb_self, emb_other)
            
            # Get the label from the first storage (could be changed based on use case)
            merged_label = self.labels[idx_self]
            
            # Append to the merged data
            merged_labels.append(merged_label)
            merged_filenames.append(filename)
            merged_embeddings.append(merged_embedding)
        
        # Convert lists to numpy arrays
        merged_labels = np.array(merged_labels)
        merged_filenames = np.array(merged_filenames)
        merged_embeddings = np.array(merged_embeddings, dtype=object)
        
        # Return a new EmbeddingStorage instance with merged data
        return EmbeddingStorage(labels=merged_labels, filenames=merged_filenames, embeddings=merged_embeddings)

    def __len__(self):
        """
        Return the number of embeddings stored.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieve the label, filename, and embedding by index.
        
        Args:
            idx (int): The index of the embedding to retrieve.
        
        Returns:
            tuple: A tuple containing (label, filename, embedding).
        """
        if idx >= len(self.labels):
            raise IndexError("Index out of range")
        return self.labels[idx], self.filenames[idx], self.embeddings[idx]

    def __repr__(self):
        return f"EmbeddingStorage(labels={len(self.labels)}, filenames={len(self.filenames)}, embeddings_shape={self.embeddings.shape})"

In [8]:
vivit = EmbeddingStorage.load_from_file("vivit.npz")

In [163]:
Y_prepare = []
max_len = 10
for filename in vivit.filenames:
    _y = [-1 for _ in range(10)]
    try:
        index = 0
        for tag in split_tags(filter(lambda x: x != "", train_data["tags"][train_data["video_id"] == filename[:-4]].values[0].split(","))):
            if label := tags_to_labels.get(tag.strip().lower()):
                _y[index] = label
                index += 1
        # for i, v in enumerate(_y):
        #     _y[i] = v
        
    except IndexError as e:
        print(e)
        print(filename)
        print("wrong dataset row")
    Y_prepare.append(_y)


index 0 is out of bounds for axis 0 with size 0
b4d70f82038d1d97f1b3ce2a493d12c8.mp4
wrong dataset row


In [169]:
Y

array([[121, 135, 429, ...,  -1,  -1,  -1],
       [398, 137, 150, ...,  -1,  -1,  -1],
       [398, 406,  -1, ...,  -1,  -1,  -1],
       ...,
       [566, 577, 579, ...,  -1,  -1,  -1],
       [398,  -1,  -1, ...,  -1,  -1,  -1],
       [398, 346,  -1, ...,  -1,  -1,  -1]])

In [167]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

seed = 42
n_splits = 2
iou_total = 0


def calc_iou(y_true, y_pred):
    iou = 0
    print(y_true, y_pred)
    for ground_truth, prediction in zip(y_true, y_pred):
        iou += iou_metric(ground_truth, prediction)
    return iou / len(y_true)


kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
X: np.array = vivit.embeddings
# Y: np.array = np.array([
#     [tags_to_labels[tag] for tag in split_tags(train_data["tags"][train_data["video_id"] == y[:-4]].values[0])] 
#     for y in vivit.filenames])
# Y = np.array(Y_prepare)
Y = np.array(Y_prepare)
for train_indices, test_indices in kf.split(np.arange(X.shape[0]), Y.shape[0], ):
    ...
    # X_train, X_test = X[train_indices], X[test_indices]
    # print(X_train.shape, X_test.shape)
    # print(train_indices, test_indices)
    # Y_train_labels, Y_test_labels = Y[train_indices], Y[test_indices]
    # # Y_train_labels = [tags_to_labels[y] for y in Y_train]
    # # Y_test_labels = [tags_to_labels[y] for y in Y_test]
    # model = KNeighborsClassifier(n_neighbors=3)
    # model.fit(X_train, Y_train_labels)
    # # Y_pred = np.argsort(model.predict_proba(X_test), axis=1)[:, :5]
    # probas = model.predict_proba(X_test)
    # Y_pred_labels = np.argsort(model.predict_proba(X_test), axis=1)[:, :5]
    # Y_pred = [split_tags([labels_to_tags[y]])for y in Y_pred_labels]
    # Y_test_labels_splitted = [split_tags([labels_to_tags[y]]) for y in Y_test_labels]
    # # print(classification_report(Y_test, Y_pred))
    # iou = calc_iou(Y_pred, Y_test_labels_splitted)
    # iou_total += iou
    # print(f"test len: {len(Y_test_labels_splitted)}, len train: {len(Y_train_labels)}")

# iou_total /= n_splits

# print(f"IoU: {iou_total}")

TypeError: Singleton array array(1049) cannot be considered a valid collection.

In [175]:
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from itertools import combinations

def create_label_combinations(y):
    """Create a single label for each sample based on its combination of labels."""
    return [''.join(map(str, row[row != -1])) for row in y]

def stratified_multi_label_kfold(X, Y, n_splits=5, random_state=None):
    # Create label combinations
    y_combinations = create_label_combinations(Y)
    
    # Use StratifiedKFold on the label combinations
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    for train_index, test_index in skf.split(X, y_combinations):
        yield train_index, test_index

# Number of folds
k = 5

# Create the k-fold splits

X = vivit.embeddings
Y = np.array(Y_prepare)
kfold_splits = list(stratified_multi_label_kfold(X, Y, n_splits=k, random_state=seed))

# Example of how to use the splits
for fold, (train_index, test_index) in enumerate(kfold_splits):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    print(f"Fold {fold + 1}:")
    print(f"  Train set shape: X {X_train.shape}, Y {Y_train.shape}")
    print(f"  Test set shape:  X {X_test.shape}, Y {Y_test.shape}")
    
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X_train, Y_train)
    # Y_pred = np.argsort(model.predict_proba(X_test), axis=1)[:, :5]
    probas = model.predict_proba(X_test)
    print(probas)
    print(np.argsort(probas, axis=1)[:, :5])
    # Y_pred_labels = np.argsort(model.predict_proba(X_test), axis=1)[:, :5]
    # Y_pred = [split_tags([labels_to_tags[y]])for y in Y_pred_labels]
    # Y_test_labels_splitted = [split_tags([labels_to_tags[y]]) for y in Y_test_labels]
    # # print(classification_report(Y_test, Y_pred))
    # iou = calc_iou(Y_pred, Y_test_labels_splitted)
    # iou_total += iou
    # print(f"test len: {len(Y_test_labels_splitted)}, len train: {len(Y_train_labels)}")

# Verify that each sample appears in the test set exactly once
test_counts = defaultdict(int)
for _, test_index in kfold_splits:
    for idx in test_index:
        test_counts[idx] += 1

assert all(count == 1 for count in test_counts.values())

Fold 1:
  Train set shape: X (839, 768), Y (839, 10)
  Test set shape:  X (210, 768), Y (210, 10)
[array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]]), array([[0.66666667, 0.        , 0.        , ..., 0.        , 0.        ,
        



ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (10, 210) + inhomogeneous part.

In [111]:

tags_to_labels["Образование: Онлайн-образование"]



135

In [97]:
Y

array([208, 208, 208, ..., 208, 208, 208])

In [74]:
Y_test_labels

array([208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
       208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 20

In [70]:
# all_labels.

AttributeError: 'DataFrame' object has no attribute 'tolist'