In [None]:
import sys
sys.path.append('../')

import pandas as pd
import matplotlib.pyplot as plt
import torch

from transformers import pipeline, FlaubertModel, FlaubertTokenizer
from datasets import load_dataset
from tqdm import tqdm
from src.helper import *

# Dataset

Source: the CLINC150 evaluation dataset for intent classification and out-of-scope prediction — https://github.com/clinc/oos-eval#an-evaluation-dataset-for-intent-classification-and-out-of-scope-prediction

In [None]:
# Fetch the "plus" configuration of the clinc_oos dataset
clinc150_dataset = load_dataset(path="clinc_oos", name="plus")

In [None]:
split = "train"

# Keep a handle on the chosen split: both the rows and the class names come from it
split_data = clinc150_dataset[split]

# Map each integer intent id to its class name
labels = dict(enumerate(split_data.features["intent"].names))

# Build the dataframe: add the class name as "label", then drop the numeric "intent" column
df = (
    split_data.to_pandas()
    .assign(label=lambda frame: frame["intent"].map(labels))
    .drop(columns="intent")
)

# Preview the first rows
df.head()

In [None]:
# Histogram of the number of characters per example
fig, ax = plt.subplots()
text_lengths = df["text"].str.len()
text_lengths.hist(bins=30, ax=ax)
ax.set_xlabel("Text length")
ax.set_ylabel("Number of examples")
ax.set_title(f"Text length distribution of the {split} set")
plt.show()

In [None]:
# Show how the examples are spread over the classes
class_plot_title = f"Distribution of the classes in the {split} set"
plot_class_distribution(df, class_plot_title)

In [None]:
# Collapse every class that is not in CLASSES_OF_INTEREST into the single label "oos"
in_scope = df["label"].isin(CLASSES_OF_INTEREST)
df["label"] = df["label"].where(in_scope, "oos")

# Show the class distribution after relabelling
plot_class_distribution(df, f"Distribution of the classes in the {split} set")

In [None]:
# Balance the data: keep every example of the classes of interest and draw an
# equally sized random sample of "oos" examples.
is_oos = df["label"] == "oos"
in_scope_df = df[~is_oos]
oos_df = df[is_oos]
n_target = len(in_scope_df)

# Sample WITHOUT replacement so the downsampled set contains no duplicated rows.
# Replacement is only used as a fallback when there are fewer oos rows than
# requested, because sampling without replacement would raise in that case.
oos_sample = oos_df.sample(
    n=n_target,
    replace=len(oos_df) < n_target,
    random_state=RANDOM_STATE,
)

df = pd.concat([in_scope_df, oos_sample])

# Plot the distribution of the classes
plot_class_distribution(df, f"Distribution of the classes in the {split} set")

In [None]:
# Write the dataframe to CSV, leaving out the index
down_csv_path = f"../data/clinc150_{split}_down.csv"
df.to_csv(down_csv_path, index=False)

# Translation

## Model choice

Evaluating which model is the best based on the given evaluation scores requires considering both the BLEU (Bilingual Evaluation Understudy) and chr-F scores across different test sets. The BLEU score is a metric for evaluating the quality of machine-translated text, with a higher BLEU score indicating better translation quality. The chr-F score is another evaluation metric that considers character n-gram precision and recall, with a higher chr-F score indicating better translation quality as well.

Model 1: https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-en-fr
Has more diverse test sets, including both news and non-news domains.
Generally achieves higher BLEU and chr-F scores across most of the test sets compared to Model 2.
Demonstrates particularly strong performance on the multi30k_test_2017_mscoco test set.

Model 2: https://huggingface.co/Helsinki-NLP/opus-mt-en-fr
Has fewer test sets for evaluation and they are mainly focused on news domains.
Achieves comparable BLEU and chr-F scores to Model 1 on the overlapping news domain test sets.
Shows a strong performance on the Tatoeba.en.fr test set.

Considerations
Diversity of Test Sets: Model 1 has been evaluated on a wider variety of test sets, which could provide a more comprehensive understanding of its performance across different domains.
Score Comparisons: On the overlapping test sets (news domain), the two models have comparable performance, with Model 1 having a slight edge in most cases.

Domain Specificity: If the intended application of the model is in a specific domain (e.g., news translation), then the performance on the relevant test sets should be weighted more heavily.

Conclusion
Overall: Model 1 seems to be the better choice given its higher scores across a diverse set of test sets.
Domain-Specific: If the application is focused on translating sentences similar to those in the Tatoeba test set, Model 2 might be the better choice, as it has higher BLEU and chr-F scores on that specific test set.
To make a more definitive conclusion, one could consider additional factors such as the model's efficiency, resource requirements, and any potential biases in the test sets.

In [None]:
# Build the translation pipeline (English -> French OPUS-MT model)
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-fr")

# Start from a clean 0..n-1 index
df = df.reset_index(drop=True)

# Translate each sentence in turn (tqdm reports progress) and store the
# results in the new text_fr column
df["text_fr"] = [
    translator(sentence)[0]["translation_text"]
    for sentence in tqdm(df["text"], total=len(df))
]

# Preview the first rows
df.head()

In [None]:
# Location of the cached translated dataframe
filename = f"../data/clinc150_{split}_down_tr.csv"

# Reuse the cached CSV when it exists. When it does not (e.g. first run on a
# fresh checkout), write the dataframe currently in memory instead of failing
# with FileNotFoundError, then read it back so that later cells always work on
# data that went through the same CSV round trip.
try:
    df = pd.read_csv(filename)
except FileNotFoundError:
    df.to_csv(filename, index=False)
    df = pd.read_csv(filename)

In [None]:
# Inspect ten random rows labelled "flight_status"
flight_status_rows = df[df["label"] == "flight_status"]
flight_status_rows.sample(10)

# Embeddings

We use FlauBERT to obtain embeddings from the translated (French) sentences.

In [None]:
modelname = 'flaubert/flaubert_base_uncased'

# Pretrained model (together with its loading info) and the matching tokenizer.
# do_lowercase must be True for uncased checkpoints and False for cased ones.
flaubert, log = FlaubertModel.from_pretrained(modelname, output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=True)

# Encode one example sentence and add a leading batch dimension
sentence = "Le chat mange une pommmme."
token_ids = torch.tensor(flaubert_tokenizer.encode(sentence)).unsqueeze(0)

# The first model output is the last layer's hidden states: [B, num_tokens, emb_dim]
outputs = flaubert(token_ids)
last_layer = outputs[0]
print(last_layer.shape)

# One line per position: index, token id, decoded token
print()
for i in range(token_ids.shape[1]):
    token_id = token_ids[0, i]
    print(i, token_id.numpy(), '\t', flaubert_tokenizer.decode(token_id))

# The BERT [CLS] token corresponds to the first hidden state of the last layer
cls_embedding = last_layer[:, 0]

In [None]:
# Add a column holding, for every sentence, the list of per-token embeddings
# taken from FlauBERT's last layer.
# The embeddings are computed on the French translations (text_fr), not on the
# original English text: this section embeds the translated sentences.
# torch.no_grad() avoids building autograd graphs, which inference does not need.
with torch.no_grad():
    df['embeddings'] = df['text_fr'].apply(
        lambda x: flaubert(torch.tensor([flaubert_tokenizer.encode(x)]))[0][0].tolist()
    )

In [None]:
# Mean over the token axis: one value per embedding dimension, so sentences of
# different lengths end up with vectors of comparable magnitude
df['embeddings_avg'] = df['embeddings'].apply(
    lambda token_vectors: [
        sum(dim_values) / len(token_vectors)
        for dim_values in zip(*token_vectors)
    ]
)

In [None]:
# Write the dataframe, including the embedding columns, to CSV without the index
emb_csv_path = f'../data/clinc150_{split}_down_tr_emb.csv'
df.to_csv(emb_csv_path, index=False)