In [1]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.10.1 sacrebleu-2.4.3


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import sacrebleu

In [3]:
# Checkpoint directory that both the tokenizer and the model are loaded from.
model_name = "/kaggle/input/m2m-bilingual"

# The two loads are independent of each other; both read from the same path.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

In [4]:
# Sentence pairs (`source` / `target` columns); the first CSV column becomes the row index.
csv_path = "/kaggle/input/overall-80k/overall_80K.csv"
df = pd.read_csv(csv_path, index_col=0)

# Bare last expression: show a rich preview of the loaded frame.
df

Unnamed: 0,target,source
0,Та пыгрисит маим вармаль э̄рнэ поратэт ат верм...,Те мальчики не выполнят задание в назначенный ...
1,"Ха̄йтыматэ тӯр ва̄тан ёхтыс, вит ва̄тан ха̄йтыс.","Бегая к берегу озера пришла, к воде подбежала."
2,Вит са̄мыл сунсым о̄нтыс,Вода прибывала на глазах
3,"Атаявев, акваг лылынг тагл ворн та тотавев.","Обнюхивает нас, живыми на кладбище уносит."
4,"Ман ты пӣлтал, веськат хумиюв нэтхуньт ат ёр...",Мы никогда не забудем этого честного человека.
...,...,...
2345,А̄нумн ка̄салахты аквтуп тамле о̄лнэ накыт ма̄...,"Мне кажется, что подобные случаи могут вызыват..."
2346,А̄танэ нё̄тнэ̄г юил акван-атманэ.,Волосы аккуратно собраны сзади.
2347,"Тох тай, культура сака тэ̄пгалан мед а̄тим.","В общем, культуры интенсивного потребления мед..."
2348,"Тувыл Уэйтс ты музыкантыг ёт, Чарли Рич ос Фрэ...",Затем Уэйтс отправился на гастроли с такими му...


In [5]:
# Tag every row of the loaded pairs with the direction label 'mns'.
df = df.assign(to='mns')

In [6]:
# Build the reverse translation direction and stack it onto the original pairs.
#
# Only rows tagged 'mns' are used as the base. On a first run that is every
# row; on a re-run (when `df` already holds both directions) it rebuilds the
# same frame instead of doubling the data a second time.
df_forward = df[df['to'] == 'mns']

# Swapping the two column labels swaps the roles of the sentence columns;
# the reversed pairs are tagged 'ru'.
df_other = (
    df_forward
    .rename(columns={'source': 'target', 'target': 'source'})
    .assign(to='ru')
)

# Concatenate the original df and df_other (pd.concat aligns columns by name)
df = pd.concat([df_forward, df_other], ignore_index=True)

In [7]:
# 80 / 10 / 10 split; both stages are stratified on the direction column `to`.
split_options = dict(random_state=42, shuffle=True)

train_df, test_val_df = train_test_split(
    df, test_size=0.2, stratify=df['to'], **split_options
)
val_df, test_df = train_test_split(
    test_val_df, test_size=0.5, stratify=test_val_df['to'], **split_options
)

In [20]:
# Just for fast testing: carve a 10% stratified slice (`b`) out of the test split.
a, b = train_test_split(
    test_df,
    test_size=0.1,
    stratify=test_df['to'],
    shuffle=True,
    random_state=42,
)

In [21]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# The small debugging slice is converted as well, reusing the same name.
b = Dataset.from_pandas(b)

In [10]:
# Register the '<mns_MNS>' tag as a special token.
# replace_additional_special_tokens=False appends to the tokenizer's existing
# list of additional special tokens instead of replacing it (the default), so
# tokens that were already flagged as special keep that status.
# NOTE(review): with the default, M2M100 language-code tokens such as `__en__`
# presumably stop being skipped by decode(skip_special_tokens=True) — confirm
# against the installed transformers version.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<mns_MNS>']},
    replace_additional_special_tokens=False,
)

# Resize the embedding matrix to match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

M2M100ScaledWordEmbedding(128105, 1024, padding_idx=1)

In [11]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: batch mapping with 'source' and 'target' lists of strings
            (as passed by ``Dataset.map(..., batched=True)``).
        max_length: every sequence is truncated/padded to this many tokens.

    Returns:
        The tokenizer output for the source texts with an added 'labels'
        entry holding the target token ids, where padding positions are
        replaced by -100.
    """
    inputs = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=max_length)
    targets = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=max_length)

    # -100 is the index ignored by the cross-entropy loss; without this mask
    # the padded tail of every target would be trained on as if it were text.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets['input_ids']
    ]
    # NOTE(review): only 'source' and 'target' are read here; the 'to' column
    # and the tokenizer's src_lang/tgt_lang are not used — confirm that the
    # translation direction is meant to be left out of the inputs.
    return inputs

In [12]:
# Tokenize all three splits in batches with the same preprocessing function.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Note: the debugging slice `b` is not tokenized in this cell.
# b = b.map(preprocess_function, batched=True)

Map:   0%|          | 0/129833 [00:00<?, ? examples/s]

Map:   0%|          | 0/16229 [00:00<?, ? examples/s]

Map:   0%|          | 0/16230 [00:00<?, ? examples/s]

In [13]:
# Drop the raw text, the direction tag and the `__index_level_0__` column
# from every split.
columns_to_drop = ['source', 'target', '__index_level_0__', 'to']
train_dataset, val_dataset, test_dataset = (
    split.remove_columns(list(columns_to_drop))
    for split in (train_dataset, val_dataset, test_dataset)
)

In [14]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# All trainer hyper-parameters gathered in one mapping, grouped by purpose.
training_config = dict(
    # output locations / reporting
    output_dir="./mansi_finetuned_biling",
    logging_dir="./logs",
    report_to="none",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    # evaluation
    eval_strategy="no",  # validation is disabled
    predict_with_generate=True,
    load_best_model_at_end=False,  # do not reload the best model at the end
    # checkpointing
    save_strategy="epoch",  # save only after each epoch
    save_total_limit=5,
)
training_args = Seq2SeqTrainingArguments(**training_config)

def compute_metrics(eval_preds):
    """Compute corpus-level BLEU and chrF for a batch of generated sequences.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id sequences.
            ``predictions`` may also be a tuple whose first element holds
            the generated ids.

    Returns:
        dict with the keys "bleu" and "chrf" mapped to the sacrebleu scores.
    """
    preds, labels = eval_preds
    # When predictions arrive as a tuple of outputs, the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(batch):
        # Negative ids (e.g. the -100 loss mask) are not real tokens and
        # cannot be decoded; map them back to the pad id, which the
        # skip_special_tokens decoding below is expected to drop.
        return [[int(token) if token >= 0 else pad_id for token in sequence] for sequence in batch]

    decoded_preds = tokenizer.batch_decode(_restore_padding(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_restore_padding(labels), skip_special_tokens=True)

    # Trim surrounding whitespace left over after decoding.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Metric computation: one hypothesis stream scored against one reference stream.
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    chrf = sacrebleu.corpus_chrf(decoded_preds, [decoded_labels])

    return {"bleu": bleu.score, "chrf": chrf.score}

# Assemble the trainer; the validation split is attached even though
# evaluation during training is switched off in `training_args`.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

In [16]:
import shutil
from IPython.display import FileLink

# Zip the training output directory; the call returns the path of the
# archive it wrote, which is shown as this cell's output.
shutil.make_archive(
    base_name="m2m_biling_finetune",
    format='zip',
    root_dir="/kaggle/working/mansi_finetuned_biling/",
)

'/kaggle/working/m2m_biling_finetune.zip'

In [17]:
# Render a clickable link to the archive created above in the working directory.
FileLink(path='m2m_biling_finetune.zip')

In [None]:
import random
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Evaluate on a reproducible random subset of the tokenized test split.
# The debugging slice `b` cannot be used here: it is never tokenized, so its
# rows have no 'input_ids' / 'labels' fields to read.
random.seed(42)
sample_size = min(1000, len(test_dataset))
random_indices = random.sample(range(len(test_dataset)), sample_size)
random_test_samples = test_dataset.select(random_indices)


def generate_translation(sample, max_length=200, num_beams=5):
    """Generate a translation for one tokenized sample with beam search.

    Args:
        sample: mapping with an 'input_ids' list and, optionally, a matching
            'attention_mask' list.
        max_length: maximum length passed to ``model.generate``.
        num_beams: beam width passed to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Convert the id list into a (1, seq_len) tensor on the model's device.
    input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(device)

    # The inputs are padded to a fixed length, so hand the padding mask to
    # generate() explicitly instead of relying on it being inferred.
    generate_kwargs = {}
    if 'attention_mask' in sample:
        generate_kwargs['attention_mask'] = (
            torch.tensor(sample['attention_mask']).unsqueeze(0).to(device)
        )

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            **generate_kwargs,
        )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


def _decode_ids(token_ids):
    """Decode token ids to text, dropping negative ids (e.g. a -100 loss mask) first."""
    return tokenizer.decode([token for token in token_ids if token >= 0], skip_special_tokens=True)


original_texts = []
correct_translations = []
model_translations = []

for sample in tqdm(random_test_samples):
    original_texts.append(_decode_ids(sample['input_ids']))
    correct_translations.append(_decode_ids(sample['labels']))
    model_translations.append(generate_translation(sample))

df_results = pd.DataFrame({
    "Original Text": original_texts,
    "Correct Translation": correct_translations,
    "Model Translation": model_translations
})

In [None]:
# Display the collected source texts, reference translations and model outputs.
df_results

In [None]:
# Strip the literal '__en__' tag from every text column.
for column in ('Original Text', 'Correct Translation', 'Model Translation'):
    df_results.loc[:, column] = df_results[column].str.replace('__en__', '')

# Corpus-level metrics: model output scored against a single reference stream.
hypotheses = df_results['Model Translation'].tolist()
references = [df_results['Correct Translation'].tolist()]
bleu_score = sacrebleu.corpus_bleu(hypotheses, references).score
chrf_score = sacrebleu.corpus_chrf(hypotheses, references).score

print(f"BLEU Score: {bleu_score}")
print(f"ChrF Score: {chrf_score}")

df_results.head(50)