In [None]:
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document

class MiniLMEmbeddings(Embeddings):
    """LangChain `Embeddings` adapter backed by a SentenceTransformer model.

    Args:
        model_name: Name handed to `SentenceTransformer`; defaults to the
            'all-MiniLM-L6-v2' checkpoint this notebook has always used.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str], batch_size=500, show_progress_bar=True) -> List[List[float]]:
        """Encode `texts` in batches and return one vector (list of floats) per text.

        Args:
            texts: Strings to embed.
            batch_size: Batch size forwarded to `SentenceTransformer.encode`.
            show_progress_bar: Forwarded to `encode`; on by default because
                bulk indexing is the slow path worth watching.
        """
        # Use batching to speed up the encoding
        return self.model.encode(
            texts, batch_size=batch_size, show_progress_bar=show_progress_bar
        ).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string.

        The progress bar is switched off here: a query is one text, and this
        method runs once per similarity search, so a bar per call only floods
        the cell output.
        """
        return self.embed_documents([text], show_progress_bar=False)[0]

data = pd.read_csv("train_data_v2_57k.csv")

# `val_indeces` is not assigned in any visible cell, so on a fresh kernel the
# split below raised NameError. Keep whatever the session already holds;
# otherwise draw a reproducible hold-out of the size used by the retired
# train_test_split call.
# NOTE(review): if the intended hold-out is the set stored in
# 'validation-titles.csv', derive the indices from that file instead - confirm.
if "val_indeces" not in globals():
    val_indeces = data.sample(n=min(250, len(data)), random_state=42).index
#data, validation = train_test_split(data, test_size=250)
validation = data.loc[val_indeces]
data = data.drop(val_indeces)
print(len(validation))
print(len(data))

# Filter only rows where 'en_title' is a string
# NOTE(review): 'fr_title' is not checked, so a missing French title ends up
# in the example metadata as NaN - confirm whether that is acceptable.
filtered_data = data[data['en_title'].map(lambda value: isinstance(value, str))]

# Build one Document per remaining row. Zipping the two columns avoids the
# per-row Series that iterrows() creates.
docs = [
    Document(
        page_content=en_title,
        metadata={'en_title': en_title, 'fr_title': fr_title},
    )
    for en_title, fr_title in zip(filtered_data['en_title'], filtered_data['fr_title'])
]

# Index every training title so few-shot examples can be retrieved by similarity.
vec_store = InMemoryVectorStore(MiniLMEmbeddings())
vec_store.add_documents(documents=docs)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
import openai
from dotenv import load_dotenv
import os

# Load variables from a local .env file, then read the LiteLLM credentials
# from the environment so nothing secret is written into the notebook.
load_dotenv()
api_key = os.environ["LITELLM_KEY"]
# Read the endpoint as an expression; the previous line wrapped it in stray
# quotes, which is a SyntaxError and stopped the whole cell from running.
base_url = os.environ["LITELLM_IP"]

# OpenAI-compatible client pointed at the configured base URL.
client = openai.OpenAI(
    api_key=api_key,
    base_url=base_url,
)

def make_examples_section(docs):
    """Render retrieved documents as an "English: ... / French: ..." example list.

    Each document contributes two lines taken from its `metadata`
    ('en_title' and 'fr_title'); the French line carries a trailing newline so
    consecutive examples are separated by a blank line once joined.
    """
    return "\n".join(
        rendered
        for example in docs
        for rendered in (
            f"English: {example.metadata['en_title']}",
            f"French: {example.metadata['fr_title']}\n",
        )
    )

def translate_with_few_shot(english_title, model="cheap-ian", k_neighbors=5, verbose=False):
    """Translate an English title to French using retrieved few-shot examples.

    The `k_neighbors` most similar entries in `vec_store` are rendered with
    `make_examples_section` and appended to the system prompt; the title itself
    is sent as the user message.

    Args:
        english_title: Title to translate.
        model: Model name passed to the chat-completions endpoint.
        k_neighbors: Number of examples to retrieve from the vector store.
        verbose: When true, print the rendered system and user messages.

    Returns:
        The content of the first completion choice, or "" if anything raised
        (the title and the exception are printed in that case).
    """
    try:
        system_prompt_template = ("""You are a highly skilled translator specializing in technical and legal documents, with specific expertise in translating English patent titles into the most accurate and natural French equivalents. Always maintain the precise technical terminology and meaning of the original, adapting phrasing for French patent standards. Return only the title/word translated, match the casing, do not be conversational. Here are some examples:
                            \n{examples}""")
        chat_prompt = ChatPromptTemplate.from_messages(
            [("system", system_prompt_template), ("user", "{text}")]
        )
        neighbours = vec_store.similarity_search(query=english_title, k=k_neighbors)

        # Fill both placeholders, then pull the plain text back out of the
        # rendered messages so it can be sent through the OpenAI client.
        rendered = chat_prompt.invoke(
            {"examples": make_examples_section(neighbours), "text": english_title}
        ).to_messages()
        sys_msg, user_msg = rendered[0].content, rendered[1].content

        if verbose:
            print("sys:", sys_msg, "user:", user_msg, sep="\n")

        payload = [
            {"role": role, "content": content}
            for role, content in (("system", sys_msg), ("user", user_msg))
        ]
        completion = client.chat.completions.create(model=model, messages=payload)
        return completion.choices[0].message.content
    except Exception as error:
        # Best effort: report the failing title and keep the batch going.
        print(english_title)
        print(error)
        return ""

def translate_with_base_model(english_title, model="cheap-ian"):
    """Translate an English title to French with the bare system prompt (no examples).

    Args:
        english_title: Title to translate; sent verbatim as the user message.
        model: Model name passed to the chat-completions endpoint.

    Returns:
        The content of the first completion choice, or "" if the request
        raised (the title and the exception are printed in that case).
    """
    system_message = {
        "role": "system",
        "content": "You are a highly skilled translator specializing in technical and legal documents, with specific expertise in translating English patent titles into the most accurate and natural French equivalents. Always maintain the precise technical terminology and meaning of the original, adapting phrasing for French patent standards. Return only the title/word translated, match the casing, do not be conversational.",
    }
    user_message = {"role": "user", "content": english_title}

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[system_message, user_message],
        )
        return completion.choices[0].message.content
    except Exception as error:
        # Best effort: report the failing title and keep the batch going.
        print(english_title)
        print(error)
        return ""

#val_row = validation.iloc[-100]
#print("Sample English: {} \nSample French: {}\n".format(val_row["en_title"], val_row["fr_title"]))
#translate_with_base_model(val_row["en_title"])   

In [None]:
# Small sanity-check vocabulary, keyed by the French word.
french_to_english = {
    "Bonjour": "Hello",
    "Maison": "House",
    "Chat": "Cat",
    "Livre": "Book",
    "École": "School",
    "Pomme": "Apple",
    "Chaise": "Chair",
    "Amour": "Love",
    "Eau": "Water",
    "Nuit": "Night",
}

# Only the English side is sent to the model; the French keys are not used here.
for english_word in french_to_english.values():
    translation = translate_with_base_model(english_word, model="best")
    print(f"English: {english_word}, Translation: {translation}")

In [None]:
from bert_score import score
from rouge_score import rouge_scorer
import torch
import sacrebleu

# Scoring helpers shared by the evaluation cells.
chrf = sacrebleu.CHRF()
r_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = "best"

# Output column -> translator, listed in the order the requests are issued.
translators = {
    "fr_translate_base": lambda title: translate_with_base_model(title, model=model),
    "fr_translate_10-shot": lambda title: translate_with_few_shot(title, model=model, k_neighbors=10),
    "fr_translate_5-shot": lambda title: translate_with_few_shot(title, model=model, k_neighbors=5),
}
for column, translate in translators.items():
    validation[column] = validation["en_title"].apply(translate)

In [None]:
def clean_output(input_text):
    """Strip markdown and answer labels from a model reply and upper-case it.

    Steps, in order:
      1. Remove '**' markers and newlines.
      2. If 'French:' occurs (case-sensitive), keep only what follows its
         first occurrence.
      3. If 'French Translation:' occurs (case-insensitive) in what is left,
         keep only what follows it.
      4. Upper-case the result.

    Args:
        input_text: Raw model output.

    Returns:
        The cleaned, upper-cased text; "" when `input_text` is not a string
        (for example a NaN from a DataFrame column).
    """
    # Explicit guard instead of a bare `except:` that hid every error.
    if not isinstance(input_text, str):
        return ""

    cleaned = input_text.replace('**', '').replace('\n', '')

    label = 'French:'
    label_pos = cleaned.find(label)
    if label_pos != -1:
        cleaned = cleaned[label_pos + len(label):]

    # Search the text that is actually being sliced. The offset used to come
    # from the unsliced text, so a reply containing both labels was cut at the
    # wrong place. Upper-casing first also keeps the offset and the sliced
    # string in the same coordinate system.
    cleaned = cleaned.upper()
    label = 'French Translation:'.upper()
    label_pos = cleaned.find(label)
    if label_pos != -1:
        cleaned = cleaned[label_pos + len(label):]

    return cleaned

In [None]:
from bert_score import score
from rouge_score import rouge_scorer
import torch
import sacrebleu
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Metric objects used by the scoring helpers defined in this cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
chrf = sacrebleu.CHRF()
r_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Replaces the in-memory `validation` frame with the copy stored on disk.
validation = pd.read_csv('validation-titles.csv')

def validation_score_generate(validation, model_column, ref_column="fr_title", score_column="score", clean_output = lambda x: x):
    """Add a composite translation-quality score column to `validation`.

    Per row the score is
    `0.2 * chrF + 0.3 * (BERTScore F1 * 100) + 0.5 * (ROUGE-L F-measure * 100)`.

    Args:
        validation: DataFrame holding the hypotheses and references. It is
            modified in place and also returned.
        model_column: Column with the model translations.
        ref_column: Column with the reference translations.
        score_column: Name of the column to write.
        clean_output: Callable applied to every hypothesis (after casting to
            str) before scoring; identity by default.

    Returns:
        The same DataFrame with `score_column` added.
    """
    # Cast and clean once, then reuse for all three metrics; previously the
    # cleaner ran three times per row.
    hypotheses = validation[model_column].astype(str).apply(clean_output).tolist()
    references = validation[ref_column].astype(str).tolist()

    _, _, F1_base = score(hypotheses, references, lang="fr", device=device, batch_size=300, verbose=True)
    c_scores_base = [chrf.sentence_score(hyp, [ref]).score
                     for hyp, ref in zip(hypotheses, references)]
    rouge_scores_base = [r_scorer.score(ref, hyp)['rougeL'].fmeasure*100
                         for ref, hyp in zip(references, hypotheses)]
    # `f` is an element of the F1 tensor, so the sum is a tensor; .item()
    # stores a plain Python number in the frame.
    validation[score_column] = [(0.2*c + 0.3*f*100 + 0.5*r).item()
                                for c, f, r in zip(c_scores_base, F1_base, rouge_scores_base)]
    return validation

def evaluate_translations_td_idf(df, model_column, ref_column="fr_title", score_column="score"):
    """Score each translation by TF-IDF cosine similarity to its reference.

    A single `TfidfVectorizer` is fitted on the references followed by the
    model outputs (both cast to str), so the two halves share one vocabulary.
    Row i of the reference half is then compared with row i of the model half.

    Args:
        df: DataFrame with both text columns; modified in place and returned.
        model_column: Column with the model translations.
        ref_column: Column with the reference translations.
        score_column: Name of the column that receives the similarities.

    Returns:
        The same DataFrame with `score_column` added.
    """
    references = df[ref_column].astype(str).tolist()
    candidates = df[model_column].astype(str).tolist()
    n_rows = len(df)

    # References occupy the first n_rows rows of the matrix, candidates the rest.
    tfidf_matrix = TfidfVectorizer().fit_transform(references + candidates)
    ref_tfidf, model_tfidf = tfidf_matrix[:n_rows], tfidf_matrix[n_rows:]

    df[score_column] = [
        cosine_similarity(ref_vec, model_vec)[0][0]
        for ref_vec, model_vec in zip(ref_tfidf, model_tfidf)
    ]
    return df

# Composite metric for each translation column.
composite_targets = (
    ("fr_translate_base", "base_score"),
    ("fr_translate_5-shot", "5-shot-score"),
    ("fr_translate_10-shot", "10-shot-score"),
)
for translated_col, target_col in composite_targets:
    validation = validation_score_generate(validation, model_column=translated_col, score_column=target_col)

# TF-IDF cosine similarity for the same columns, with a step marker printed
# before each pass.
tfidf_targets = (
    ("1", "fr_translate_base", "tdidf-base_score"),
    ("2", "fr_translate_5-shot", "tdidf-5-shot-score"),
    ("3", "fr_translate_10-shot", "tdidf-10-shot-score"),
)
for step, translated_col, target_col in tfidf_targets:
    print(step)
    validation = evaluate_translations_td_idf(validation, model_column=translated_col, score_column=target_col)

In [None]:
score_ver = "tdidf-"

# One Series per setting, keeping only the rows whose score is non-zero.
nonzero_scores = [
    validation.loc[validation[f"{score_ver}{suffix}"] != 0, f"{score_ver}{suffix}"]
    for suffix in ("base_score", "5-shot-score", "10-shot-score")
]

# (mean, std) for base, 5-shot and 10-shot, flattened into a single tuple.
tuple(stat for scores in nonzero_scores for stat in (scores.mean(), scores.std()))

In [None]:
score_ver = ""

# Every statistic is taken over the same subset: rows whose 5-shot score is at most 98.
capped_rows = validation[validation[f"{score_ver}5-shot-score"] <= 98]

# (mean, std) for base, 5-shot and 10-shot, flattened into a single tuple.
tuple(
    stat
    for suffix in ("base_score", "5-shot-score", "10-shot-score")
    for stat in (
        capped_rows[f"{score_ver}{suffix}"].mean(),
        capped_rows[f"{score_ver}{suffix}"].std(),
    )
)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Standalone MiniLM encoder passed to get_vecstore_similarity in this cell.
# NOTE(review): this rebinds the notebook-level name `model`, which an earlier
# cell sets to the LLM alias string "best" - confirm no earlier cell is re-run
# after this one.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_vecstore_similarity(english_title, model, k_neighbors=5):
    """Measure how close the retrieved few-shot examples are to a title.

    Retrieves up to `k_neighbors` entries from `vec_store`, embeds their
    English titles and the query with `model`, and summarises the cosine
    similarities between the query and each retrieved title.

    Args:
        english_title: Query title.
        model: Object exposing `encode(texts, batch_size=..., show_progress_bar=...)`.
        k_neighbors: Number of neighbours requested from the vector store.

    Returns:
        `(mean, max, min)` of the similarities; three NaNs when nothing was
        retrieved.
    """
    results = vec_store.similarity_search(query=english_title, k=k_neighbors)
    samples = [doc.metadata['en_title'] for doc in results]
    if not samples:
        nan = float("nan")
        return nan, nan, nan

    # Shapes follow the number of hits actually returned. The old
    # reshape(k_neighbors, 1, -1) failed whenever the store held fewer than
    # k_neighbors documents.
    # Progress bars are off: this runs once per row and a bar per call only
    # floods the output.
    neighbour_vecs = np.asarray(
        model.encode(samples, batch_size=k_neighbors, show_progress_bar=False),
        dtype=np.float64,
    )
    query_vec = np.asarray(
        model.encode([english_title], batch_size=k_neighbors, show_progress_bar=False),
        dtype=np.float64,
    )

    # One (1, n) similarity row instead of n separate 1x1 calls.
    similarities = pd.Series(cosine_similarity(query_vec, neighbour_vecs)[0]).astype(float)
    return similarities.mean(), similarities.max(), similarities.min()

# TODO: try other model embeddings (eg. document embeddings)
def _similarity_row(title):
    """Wrap the (mean, max, min) tuple in a Series so `apply` expands it into columns."""
    return pd.Series(get_vecstore_similarity(title, model))

similarity_columns = ['5-shot-sim-mean', '5-shot-sim-max', '5-shot-sim-min']
validation[similarity_columns] = validation['en_title'].apply(_similarity_row)

In [None]:
import plotly.express as px
import statsmodels.api as sm
import pandas as pd

x_col = 'Max DB Cosine Similarity'
y_col = 'Translation Score'

# Plot only the rows whose best retrieved example scores below 0.99 similarity;
# the similarity is rescaled to a 0-100 axis.
below_threshold = validation['5-shot-sim-max'] < 0.99
df = pd.DataFrame({
    x_col: validation.loc[below_threshold, '5-shot-sim-max'] * 100,
    y_col: validation.loc[below_threshold, '5-shot-score'],
})

# OLS fit of score on similarity (with intercept) to obtain the slope's p-value.
X = sm.add_constant(df[x_col])
model = sm.OLS(df[y_col], X).fit()
p_value = model.pvalues[x_col]

# Scatter plot with plotly's own OLS trendline drawn in red.
fig = px.scatter(
    df,
    x=x_col,
    y=y_col,
    title=f'Translation Score vs. Database Example Similarity (p-value: {p_value:.3e})',
    trendline='ols',
    trendline_color_override='red',
    height=650,
    width=750,
)
fig.show()

print(f"P-value for the line of best fit: {p_value}")

In [None]:
from scipy.stats import pearsonr

# `df` is built in the plotting cell and has no 'x'/'y' columns, so the old
# lookup raised KeyError. Correlate the two columns that were actually plotted.
pearsonr(df['Max DB Cosine Similarity'], df['Translation Score'])

In [None]:
import pandas as pd
from bert_score import score
from rouge_score import rouge_scorer
import torch
import sacrebleu

# Google Translate outputs to score against the reference titles.
gdata = pd.read_csv(r"google_translate.csv")

# English/French title pairs taken from the validation frame.
validation_df = validation[["en_title", "fr_title"]]

# Metric objects used by validation_score_generate in this cell.
chrf = sacrebleu.CHRF()
r_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def validation_score_generate(validation, model_column, ref_column="fr_title", score_column="score", clean_output = lambda x: x):
    """Add a composite translation-quality score column to `validation`.

    Per row the score is
    `0.2 * chrF + 0.3 * (BERTScore F1 * 100) + 0.5 * (ROUGE-L F-measure * 100)`.

    This definition replaces the earlier one of the same name once the cell
    runs. It now treats its inputs the same way: both text columns are cast to
    str (so a missing value is scored as text instead of crashing the
    metrics), and scores are stored as plain Python numbers rather than
    tensors.

    Args:
        validation: DataFrame holding the hypotheses and references. It is
            modified in place and also returned.
        model_column: Column with the translations to score.
        ref_column: Column with the reference translations.
        score_column: Name of the column to write.
        clean_output: Callable applied to every hypothesis before scoring;
            identity by default.

    Returns:
        The same DataFrame with `score_column` added.
    """
    # Cast and clean once, then reuse for all three metrics.
    hypotheses = validation[model_column].astype(str).apply(clean_output).tolist()
    references = validation[ref_column].astype(str).tolist()

    _, _, F1_base = score(hypotheses, references, lang="fr", device=device)
    c_scores_base = [chrf.sentence_score(hyp, [ref]).score
                     for hyp, ref in zip(hypotheses, references)]
    rouge_scores_base = [r_scorer.score(ref, hyp)['rougeL'].fmeasure*100
                         for ref, hyp in zip(references, hypotheses)]
    # `f` is an element of the F1 tensor, so the sum is a tensor; .item()
    # stores a plain Python number in the frame.
    validation[score_column] = [(0.2*c + 0.3*f*100 + 0.5*r).item()
                                for c, f, r in zip(c_scores_base, F1_base, rouge_scores_base)]
    # The leftover debug print of c_scores_base[0] was dropped; it raised
    # IndexError on an empty frame.
    return validation

gdata = validation_score_generate(gdata, "fr_google_translate")
# Average the scores as floats. Casting to int first truncated every score and
# pulled the mean down, which made it incomparable with the untruncated means
# reported for the LLM translations.
gdata["score"].astype(float).mean()