### Обучение TextToLabel

In [26]:
import pandas as pd

files: list[str] = [
    "dataset/transcription/transcribed.csv",
    # "dataset/transcription/hr_bot_clear.json",
    # "dataset/transcription/hr_bot_noise.json",
    # "dataset/transcription/hr_bot_synt.json",
    # "dataset/transcription/luga.json",
]

li = []
for filename in files:
    df = pd.read_csv(filename)
    li.append(df)
df = pd.concat(li, axis=0, ignore_index=True)
df = df.sample(frac=1)
df.head()

Unnamed: 0.1,Unnamed: 0,audio_filepath,id,text,label,attribute,transcription
2140,2272,dataset/hr_bot_clear/405a908a-76ff-11ee-a0a0-c...,405a908a-76ff-11ee-a0a0-c09bf4619c03,вперед на башмак,15,-1,вперед на бошма
545,33,dataset/hr_bot_noise/7947f861-76fe-11ee-adc3-c...,7947f861-76fe-11ee-adc3-c09bf4619c03,осадить на девять вагонов,4,9,а садить на девять вагонов
2019,2552,dataset/hr_bot_clear/58f5d2d0-76ff-11ee-8318-c...,58f5d2d0-76ff-11ee-8318-c09bf4619c03,осадить на два вагона,4,2,осадить на два вагона
150,1779,dataset/hr_bot_clear/8cc12f91-76ff-11ee-8f59-c...,8cc12f91-76ff-11ee-8f59-c09bf4619c03,осадить на восемнадцать вагонов,4,18,асадить нам семнадцать вагонов
1726,1228,dataset/hr_bot_clear/4866f182-76ff-11ee-9a05-c...,4866f182-76ff-11ee-9a05-c09bf4619c03,осадить на шесть вагонов,4,6,а садить нам шесть вагонов


In [27]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = HashingVectorizer(
    lowercase=True, n_features=2**10, stop_words="english"
)
vectorizer = TfidfVectorizer(
    lowercase=True, sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

pipeline = Pipeline(
    [
        ("tfidf", vectorizer),
        ("classifier", classifier),
    ]
)

In [28]:
X = df["transcription"]
y = df["attribute"]

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

y_pred = pipeline.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1_weighted:.2f}")

scores = cross_val_score(pipeline, X, y, cv=5)
print('Cross val:', scores.mean())

F1: 0.89
Cross val: 0.8730696798493408


In [31]:
import joblib
joblib.dump(pipeline, "trained/text2attr.pkl")

['trained/text2attr.pkl']