In [None]:
import os
from pathlib import Path
# Move the working directory two levels above the directory the kernel was
# started in, so project-relative imports and paths resolve.
# The starting directory is remembered on the first run only: re-running this
# cell would otherwise climb two more levels each time and end up outside the
# project.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = Path(os.getcwd())
os.chdir(NOTEBOOK_START_DIR.parent.parent.resolve())

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

from vinasmol.hfmodel import SMOLLM2
from vinasmol.tokenization.language import (
    token_language_classification,
    classification_model_name,
)

In [None]:
# Run the imported token_language_classification helper on SMOLLM2 with a batch size of 128.
# Later cells read the 'lang' and 'score' columns of the result and report len(df) as the
# vocabulary size, so df is presumably a DataFrame with one row per vocabulary token — TODO confirm.
df = token_language_classification(SMOLLM2, batch_size=128)

In [None]:
# Show the classification result as this cell's output.
df

In [None]:
# Score cut-off: predictions whose 'score' is strictly above this value are
# treated as confident by the filtering cell.
threshold = 0.5
# Label for the catch-all bucket that replaces every predicted language outside
# the training languages.
# NOTE(review): the "(< threshold)" wording is attached to rows whose score is
# above the threshold — confirm the label says what is intended.
other_cat = 'other (< {})'.format(threshold)

In [None]:
# Keep only the rows whose classifier score is strictly above the threshold.
# .copy() gives an independent frame, so the relabelling below never touches
# (or depends on) the original df.
training_languages = ['en-US']
confident_predictions = df[df['score'] > threshold].copy()

# Collapse every language that is not a training language into one bucket.
# A single .loc assignment replaces the previous chained indexing
# (frame['lang'][mask] = ...), which triggers SettingWithCopyWarning and does
# not update the frame at all when pandas Copy-on-Write is enabled.
is_other_language = ~confident_predictions['lang'].isin(training_languages)
confident_predictions.loc[is_other_language, 'lang'] = other_cat

In [None]:
# Per-language token counts as a two-column frame (the language label and its
# count), ordered from most to least frequent by value_counts().
lang_counts = (
    confident_predictions['lang']
    .value_counts()
    .reset_index()
)
lang_counts

In [None]:
# Summarise how many tokens were classified with confidence and how many of
# those belong to the training languages.
num_confident = len(confident_predictions)

# Pick the "other" row by value rather than by index label: after
# reset_index() the labels follow the frequency ranking, so label 0 is the
# "other" row only when "other" happens to be the most frequent category
# (otherwise the old `[0]` lookup raised KeyError). Summing the matching
# counts also yields 0 when no token landed in the "other" bucket.
num_other = int(lang_counts.loc[lang_counts['lang'] == other_cat, 'count'].sum())

print(f"Vocabulary size: {len(df)}")
print(f"Tokens confidently classified (> {threshold}): {num_confident}")
print(f"Tokens confidently classified in training languages (> {threshold}): {num_confident - num_other}")

In [None]:
# Inspect the confidently classified rows that ended up in the catch-all bucket.
confident_predictions.loc[confident_predictions['lang'].eq(other_cat)]

In [None]:
# Bar chart of the number of confidently classified tokens per language bucket.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(confident_predictions['lang'].value_counts(), ax=ax)
ax.set_xlabel(f"Language (predicted by {classification_model_name})")
ax.set_title(f"Token language distribution in {SMOLLM2.friendly_name}")

In [None]:
# Histogram of the 'score' column over every row of df (not only the confident ones).
score_ax = sns.histplot(data=df, x='score')
score_ax.set_title("Classifier score distribution")