In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.preprocessing import LabelEncoder  # For converting genre labels from strings to integers
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer  # From Hugging Face transformers
import tensorflow as tf  # TensorFlow library

2023-11-25 19:14:44.818488: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-25 19:14:44.818533: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-25 19:14:44.818623: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-25 19:14:44.868121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Constants shared by the tokenizer, tokenization and model-loading cells
MODEL_NAME = "DeepPavlov/rubert-base-cased"  # Identifier of the pre-trained RuBERT checkpoint passed to from_pretrained
MAX_LENGTH = 150  # Token count per text input: shorter inputs are padded, longer ones truncated to this length

In [3]:
# Read the raw dataset (the 'body' and 'genre' columns are used below) into a DataFrame
DATASET_PATH = "../ArchiveProcessor/dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
# Keep only the rows whose 'body' and 'genre' values are both real strings.
# NOTE(review): the non-string values are presumably NaN that read_csv produced
# for empty fields - confirm against the raw CSV.
# Per-column masks replace the row-wise apply(axis=1), which calls Python once
# per row; astype(bool) keeps the mask boolean even when the frame is empty.
body_is_text = df['body'].map(lambda value: isinstance(value, str)).astype(bool)
genre_is_text = df['genre'].map(lambda value: isinstance(value, str)).astype(bool)
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df = df[body_is_text & genre_is_text].copy()

In [5]:
def _has_sf_genre(genres):
    """Return True if any comma-separated genre code in `genres` starts with 'sf'."""
    for genre in genres.split(','):
        if genre.startswith('sf'):
            return True
    return False


# Binary target: does the record carry at least one 'sf*' genre code?
df['is_sf'] = df['genre'].map(_has_sf_genre)

In [6]:
# Display the frame after filtering and adding the 'is_sf' flag
df

Unnamed: 0,body,genre,is_sf
0,"– сдавленно сказал Ром, застыв на полусогнутых...",sf,True
1,что куда-то запропастился паяльник. Жена у тел...,sf,True
2,настежь окно совсем не спасало. По улице гонял...,sf,True
3,"силы и обреченно перетекал в душный вечер, за ...",sf,True
4,"осела на ягоды земляники у обочины, просеялась...",sf,True
...,...,...,...
32311,"остаться в живых? Вопрос философский, не так л...",sf_fantasy,True
32312,"рассказа, в котором отсутствует половина текст...",sf,True
32313,сенаторы перестают отвечать квалификационным т...,sf,True
32314,FB2 by Sclex 1.1 — вычитка ошибок by Sclex 1.2...,sf,True


In [7]:
# Encode the boolean 'is_sf' flag as integer class ids
label_encoder = LabelEncoder().fit(df['is_sf'])  # Learn the distinct classes; classes_ is read again when the model is loaded
df['labels'] = label_encoder.transform(df['is_sf'])  # Replace every flag with the index of its class

In [8]:
# Display the rows that were encoded as class 0
df.loc[df['labels'].eq(0)]

Unnamed: 0,body,genre,is_sf,labels
25,всех их объединяет одна тема – тема «маленьког...,russian_contemporary,False,0
27,художественная и вместе с тем строго документи...,prose_history,False,0
29,что не все в жизни меняется с течением времени...,"prose_contemporary,religion_rel",False,0
36,"этот глупец, мало зная, с кем он имеет дело, и...",adv_history,False,0
37,"нет. Но, быть может, эти два романа не совсем ...",prose_contemporary,False,0
...,...,...,...,...
32304,"тех, кто смело бросает вызов судьбе. Скажете, ...",love_contemporary,False,0
32305,"перо</strong></p> <p>Глава 1</p> Когда, наконе...",det_classic,False,0
32306,армия исполнила свой долг сполна — отрезанные ...,"sci_history,nonf_biography",False,0
32307,то время как юные король и королева Северных ц...,child_tale,False,0


In [9]:
# Split into training (80%) and validation (20%) parts.
# random_state pins the shuffle so a kernel restart reproduces the same split;
# stratify keeps the class ratio of 'labels' the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['body'],
    df['labels'],
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

In [10]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Load the tokenizer published under the same MODEL_NAME as the model


In [11]:
# Tokenization function
def tokenize(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    Every input is padded or truncated to MAX_LENGTH tokens and the result is
    returned as TensorFlow tensors.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_LENGTH,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **encoding_options)

In [12]:
# Tokenize both splits; the texts are handed over as plain Python lists
train_text_list = train_texts.tolist()
val_text_list = val_texts.tolist()
train_encodings = tokenize(train_text_list)
val_encodings = tokenize(val_text_list)

2023-11-25 19:15:13.864222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.887195: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.887251: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.890897: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.890936: I tensorflow/compile

In [13]:
# Convert the training labels, then the validation labels, to TensorFlow tensors
train_labels, val_labels = map(tf.convert_to_tensor, (train_labels, val_labels))


In [14]:
# Load RuBERT with a sequence-classification head
num_classes = len(label_encoder.classes_)  # One output per class learned by the label encoder
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    from_pt=True,  # Initialize the TensorFlow model from the PyTorch weights
    num_labels=num_classes,
)


2023-11-25 19:15:29.374773: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-11-25 19:15:31.483856: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 367248384 exceeds 10% of free system memory.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not

In [15]:
# Model compilation
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Integer labels compared against raw logits
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)  # Adam with a learning rate of 5e-5
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],  # Report accuracy next to the loss
)

In [None]:
# Training
# Feed the model only the token ids and attention masks from each encoding.
train_inputs = {key: train_encodings[key] for key in ('input_ids', 'attention_mask')}
val_inputs = {key: val_encodings[key] for key in ('input_ids', 'attention_mask')}
model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),  # Evaluated at the end of every epoch
    batch_size=16,
    epochs=8,
)

Epoch 1/12


2023-11-25 19:15:48.258316: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c2936260e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-25 19:15:48.258347: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-11-25 19:15:48.275931: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-25 19:15:48.454223: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-25 19:15:48.509399: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/12
Epoch 3/12
 345/1616 [=====>........................] - ETA: 15:07 - loss: 0.0479 - accuracy: 0.9882

In [None]:
# Persist the fine-tuned model, and store the tokenizer in the same directory so
# the saved folder can be reloaded for inference without the original checkpoint.
# NOTE(review): "rubart" in the directory name looks like a typo of "rubert"; the
# name is kept unchanged because other code may already load from this path.
MODEL_DIR = "rubart-flibusta-genres.model"
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

In [19]:
# Numeric ids of the .fb2 files to list
numbers = [
    753030, 753029, 752121, 497148, 501398, 752122, 752120, 552329, 753031, 753032, 755720, 644955, 755721, 710622,
    711519, 752119, 753028, 755719, 532471, 450323, 450324, 450325, 553516, 556597, 636576, 450327, 619971, 582607,
    693349, 693361, 693346, 477565, 476978, 475108, 693348, 478622, 693359, 693344, 693347, 743384, 741717, 590118,
    554672, 554770, 554769, 566760, 588453, 648325, 533343, 591540, 608475, 552981, 539817, 552979, 622698, 664507,
    556768, 564146, 577584, 599185, 616366, 617347, 637845, 648854, 691559, 754783, 548574, 525064, 564339, 586199,
    622456, 657391, 730122, 577776, 600436, 600437, 601422, 609214, 609585, 623736, 630100, 660951, 678314, 678312,
    706500, 740257, 583533, 582971, 585998, 588150, 591195, 597454, 602185, 603839, 606680, 616633, 637499, 676093,
    600900, 616092, 608919, 622817, 634955, 643862, 750497, 665066, 665068, 613855, 665154, 665155, 665210, 665069,
    665052, 665153, 665209, 640360, 660091, 647975, 692823, 693301, 726412, 726413, 695574, 676143, 693509, 676873,
    693510, 711465, 733035, 701371, 740051, 743577, 692730, 692731, 703519, 708175, 710418, 711534, 722711, 724275,
    732302, 735681, 747817, 750426, 754174, 754194, 732545, 746165, 732270, 732269, 730317, 732268, 743387, 743386,
    758399, 707015, 713278, 726235, 746408, 746409, 750064, 749724, 756178
]

# Turn every id into a double-quoted "<id>.fb2" file name
file_names = ['"{}.fb2"'.format(num) for num in numbers]

# Print the names as a Go string-slice literal
formatted_list = ", ".join(file_names)
print("files := []string{", formatted_list, "}", sep="")

files := []string{"753030.fb2", "753029.fb2", "752121.fb2", "497148.fb2", "501398.fb2", "752122.fb2", "752120.fb2", "552329.fb2", "753031.fb2", "753032.fb2", "755720.fb2", "644955.fb2", "755721.fb2", "710622.fb2", "711519.fb2", "752119.fb2", "753028.fb2", "755719.fb2", "532471.fb2", "450323.fb2", "450324.fb2", "450325.fb2", "553516.fb2", "556597.fb2", "636576.fb2", "450327.fb2", "619971.fb2", "582607.fb2", "693349.fb2", "693361.fb2", "693346.fb2", "477565.fb2", "476978.fb2", "475108.fb2", "693348.fb2", "478622.fb2", "693359.fb2", "693344.fb2", "693347.fb2", "743384.fb2", "741717.fb2", "590118.fb2", "554672.fb2", "554770.fb2", "554769.fb2", "566760.fb2", "588453.fb2", "648325.fb2", "533343.fb2", "591540.fb2", "608475.fb2", "552981.fb2", "539817.fb2", "552979.fb2", "622698.fb2", "664507.fb2", "556768.fb2", "564146.fb2", "577584.fb2", "599185.fb2", "616366.fb2", "617347.fb2", "637845.fb2", "648854.fb2", "691559.fb2", "754783.fb2", "548574.fb2", "525064.fb2", "564339.fb2", "586199.fb2", "6