In [168]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score

In [169]:
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-russian")
audio_paths = ["dataset/hr_bot_clear/3a0cb44f-76ff-11ee-844c-c09bf4619c03.mp3"]

10/12/2024 16:25:36 - INFO - huggingsound.speech_recognition.model - Loading model...


In [170]:
# Загрузите данные из файла CSV
files: list[str] = [
    "hr_bot_noise",
    "hr_bot_clear",
    # "hr_bot_synt",
    # "dataset/annotation/luga.json",
]

li = []
for filename in files:
    json_path = f"dataset/annotation/{filename}.json"
    df = pd.read_json(json_path)
    df["audio_filepath"] = df["audio_filepath"].apply(lambda x: f"dataset/{filename}/{x}")
    li.append(df)
data = pd.concat(li, axis=0, ignore_index=True)
data = data.sample(frac=1)
data.head()

Unnamed: 0,audio_filepath,id,text,label,attribute
48,dataset/hr_bot_noise/50a3b202-76fe-11ee-bbaa-c...,50a3b202-76fe-11ee-bbaa-c09bf4619c03,осадить на девять вагонов,4,9
1086,dataset/hr_bot_clear/65ecbc1d-76ff-11ee-87b1-c...,65ecbc1d-76ff-11ee-87b1-c09bf4619c03,продолжаем осаживание,5,-1
1208,dataset/hr_bot_clear/39039b4f-76ff-11ee-9039-c...,39039b4f-76ff-11ee-9039-c09bf4619c03,протянуть на три вагона,10,3
2591,dataset/hr_bot_clear/44adf89b-76ff-11ee-8943-c...,44adf89b-76ff-11ee-8943-c09bf4619c03,отцепка,11,-1
11,dataset/hr_bot_noise/6a08ef79-76fe-11ee-9305-c...,6a08ef79-76fe-11ee-9305-c09bf4619c03,назад на башмак,12,-1


In [171]:
data["transcription"] = model.transcribe(data["audio_filepath"])

100%|██████████| 2655/2655 [20:03<00:00,  2.21it/s]


In [172]:
data["transcription"] = data["transcription"].apply(lambda x: x["transcription"])
data.to_csv("dataset/transcribed.csv")
data.head()

Unnamed: 0,audio_filepath,id,text,label,attribute,transcription
48,dataset/hr_bot_noise/50a3b202-76fe-11ee-bbaa-c...,50a3b202-76fe-11ee-bbaa-c09bf4619c03,осадить на девять вагонов,4,9,вас садите на девять вагонов
1086,dataset/hr_bot_clear/65ecbc1d-76ff-11ee-87b1-c...,65ecbc1d-76ff-11ee-87b1-c09bf4619c03,продолжаем осаживание,5,-1,продолжаем осаживание
1208,dataset/hr_bot_clear/39039b4f-76ff-11ee-9039-c...,39039b4f-76ff-11ee-9039-c09bf4619c03,протянуть на три вагона,10,3,протянуть на три вагон
2591,dataset/hr_bot_clear/44adf89b-76ff-11ee-8943-c...,44adf89b-76ff-11ee-8943-c09bf4619c03,отцепка,11,-1,отцепка
11,dataset/hr_bot_noise/6a08ef79-76fe-11ee-9305-c...,6a08ef79-76fe-11ee-9305-c09bf4619c03,назад на башмак,12,-1,назад на башмак


In [175]:
import joblib
from sklearn.metrics import r2_score
from label2id import label2id
t2l_model = joblib.load("text2label.pkl")
labels = data["transcription"].apply(lambda x: t2l_model.predict([x])[0]).apply(label2id)
r2_score(labels, data["label"])

0.703522423865256

In [176]:
import joblib
from label2id import label2id
t2a_model = joblib.load("text2attr.pkl")
attrs = data["transcription"].apply(lambda x: t2a_model.predict([x])[0])
r2_score(attrs, data["attribute"])

0.7747477677184256