In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.preprocessing import LabelEncoder  # For converting genre labels from strings to integers
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer  # From Hugging Face transformers
import tensorflow as tf  # TensorFlow library

2023-11-26 10:39:17.511624: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-26 10:39:17.511670: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-26 10:39:17.512389: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-26 10:39:17.604604: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Configuration
# Sequence-length limits, in tokens, applied when truncating/padding inputs.
MAX_TOKENS = 128  # limit used by tokenize_and_segment (per-row DataFrame columns)
MAX_LENGTH = 150  # limit used by tokenize (training / validation encodings)
# Hugging Face hub identifier of the pre-trained RuBERT checkpoint.
MODEL_NAME = "DeepPavlov/rubert-base-cased"

In [4]:
# Read the dataset CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="../ArchiveProcessor/dataset.csv")

In [5]:
# Keep only rows where both 'body' and 'genre' are Python strings
# (non-string records show up in the CSV; where they come from is unclear).
# The mask is built column-wise rather than with a row-wise df.apply(axis=1),
# which materialises a Series for every row and is needlessly slow.
is_text_row = (
    df['body'].map(lambda value: isinstance(value, str))
    & df['genre'].map(lambda value: isinstance(value, str))
)
# .copy() detaches the filtered frame from the original one, so the column
# assignments made in later cells do not trigger SettingWithCopyWarning.
df = df[is_text_row].copy()

In [6]:
# Flag a row as science fiction when any of its comma-separated genre codes
# starts with 'sf'.
df['is_sf'] = [
    any(code.startswith('sf') for code in genre_field.split(','))
    for genre_field in df['genre']
]

In [7]:
# Preview the frame after the string filter and the added is_sf flag
df

Unnamed: 0,body,genre,is_sf
0,"– сдавленно сказал Ром, застыв на полусогнутых...",sf,True
1,что куда-то запропастился паяльник. Жена у тел...,sf,True
2,настежь окно совсем не спасало. По улице гонял...,sf,True
3,"силы и обреченно перетекал в душный вечер, за ...",sf,True
4,"осела на ягоды земляники у обочины, просеялась...",sf,True
...,...,...,...
32311,"остаться в живых? Вопрос философский, не так л...",sf_fantasy,True
32312,"рассказа, в котором отсутствует половина текст...",sf,True
32313,сенаторы перестают отвечать квалификационным т...,sf,True
32314,FB2 by Sclex 1.1 — вычитка ошибок by Sclex 1.2...,sf,True


In [7]:
# Encode the boolean is_sf flag as integer class labels
label_encoder = LabelEncoder()
label_encoder.fit(df['is_sf'])  # learn the distinct classes
df['labels'] = label_encoder.transform(df['is_sf'])  # map each flag to its class index

In [8]:
# Inspect the rows that received label 0
df[df['labels'].eq(0)]

Unnamed: 0,body,genre,is_sf,labels
25,всех их объединяет одна тема – тема «маленьког...,russian_contemporary,False,0
27,художественная и вместе с тем строго документи...,prose_history,False,0
29,что не все в жизни меняется с течением времени...,"prose_contemporary,religion_rel",False,0
36,"этот глупец, мало зная, с кем он имеет дело, и...",adv_history,False,0
37,"нет. Но, быть может, эти два романа не совсем ...",prose_contemporary,False,0
...,...,...,...,...
32304,"тех, кто смело бросает вызов судьбе. Скажете, ...",love_contemporary,False,0
32305,"перо</strong></p> <p>Глава 1</p> Когда, наконе...",det_classic,False,0
32306,армия исполнила свой долг сполна — отрезанные ...,"sci_history,nonf_biography",False,0
32307,то время как юные король и королева Северных ц...,child_tale,False,0


In [9]:
# Splitting the dataset: 80% training / 20% validation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the label proportions the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['body'],
    df['labels'],
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

In [8]:
# Tokenizer
# Loaded from the same checkpoint name (MODEL_NAME) that is later used for the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Load the tokenizer for RuBERT


In [11]:
# Batch tokenization helper
def tokenize(texts):
    """Encode texts with the RuBERT tokenizer and return TensorFlow tensors.

    Every sequence is truncated or padded to MAX_LENGTH tokens.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_LENGTH,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **tokenizer_options)

def tokenize_and_segment(text):
    """Tokenize a single text into a fixed-length RuBERT input.

    The text is truncated or padded to exactly MAX_TOKENS tokens. Despite the
    name, no segmentation into multiple chunks is performed.

    Args:
        text: The string to encode.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of TensorFlow tensors taken
        from the tokenizer output.
    """
    # Call the tokenizer directly: encode_plus is deprecated in favour of
    # __call__, and this matches how tokenize() invokes the tokenizer.
    encoded = tokenizer(
        text,
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['attention_mask']

In [12]:
# Tokenize every body exactly once (the tokenizer call is the expensive part),
# then split the resulting (input_ids, attention_mask) pairs into two columns.
encoded_bodies = df['body'].apply(tokenize_and_segment)
df['tokens'] = encoded_bodies.apply(lambda pair: pair[0])
df['attention_mask'] = encoded_bodies.apply(lambda pair: pair[1])

2023-11-26 10:41:12.272302: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-26 10:41:12.295029: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-26 10:41:12.295079: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-26 10:41:12.298830: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-26 10:41:12.298868: I tensorflow/compile

In [12]:
# Encode both splits: training texts first, then validation texts
train_encodings, val_encodings = (
    tokenize(split_texts.tolist()) for split_texts in (train_texts, val_texts)
)

2023-11-25 19:15:13.864222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.887195: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.887251: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.890897: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-25 19:15:13.890936: I tensorflow/compile

In [13]:
# Turn the training and validation labels into TensorFlow tensors
train_labels, val_labels = map(tf.convert_to_tensor, (train_labels, val_labels))


In [14]:
# Instantiate RuBERT with a classification head sized to the encoder's classes;
# from_pt=True loads the weights from the PyTorch checkpoint.
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
    from_pt=True,
)


2023-11-25 19:15:29.374773: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-11-25 19:15:31.483856: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 367248384 exceeds 10% of free system memory.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not

In [15]:
# Compile the model
# The loss takes integer class labels and raw logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam with a learning rate of 5e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Accuracy is tracked as the only metric.
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Fine-tune the model: 8 epochs, batches of 16.
# Only input_ids and attention_mask are forwarded from the encodings.
model.fit(
    {name: train_encodings[name] for name in ('input_ids', 'attention_mask')},
    train_labels,
    validation_data=(
        {name: val_encodings[name] for name in ('input_ids', 'attention_mask')},
        val_labels,
    ),
    epochs=8,
    batch_size=16,
)

Epoch 1/12


2023-11-25 19:15:48.258316: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c2936260e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-25 19:15:48.258347: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-11-25 19:15:48.275931: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-25 19:15:48.454223: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-25 19:15:48.509399: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/12
Epoch 3/12
 345/1616 [=====>........................] - ETA: 15:07 - loss: 0.0479 - accuracy: 0.9882

In [None]:
# Persist the fine-tuned model, and store the tokenizer in the same directory
# so that both can be reloaded together from this one path.
model.save_pretrained("rubart-flibusta-genres.model")
tokenizer.save_pretrained("rubart-flibusta-genres.model")