### Model translation

In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  
tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")

model = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")

pten_pipeline = pipeline('text2text-generation', model=model, tokenizer=tokenizer)

pten_pipeline("translate Portuguese to English: Sabias que em Portugal apenas 14,4% dos profissionais na área de Tecnologias de Informação (TI)")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Knowledge that in Portugal only 14.4% of professionals in the'}]

### JSON to Embeddings

Relevant keys

    - episodes
        - description
        - name
        - duration (?)
        - date (?)

In [31]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [32]:
os.getenv("OPENAI_API_KEY")

'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'

In [33]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Functions call themselves,  \nLooping through layers of code,  \nInfinite insights.', refusal=None, role='assistant', function_call=None, tool_calls=None)


In [50]:
def craft_translation_prompt(input_text):
    messages=[
        {
        "role": "system",
        "content": "You are a helpful assistant that is responsible for translating text from portuguese to english if the text is in portuguese. \
The text may contain named entities such as persons or companies so this should not be translated. You should also remove emojis or hyperlinks if they appear in the text. \
The response you give must contain only the processed text, no need for additional explanations."
        },
        {
        "role": "user",
        "content": f"Process and translate the following text: {input_text}"
        }]

    return messages

def chatgpt_inference(messages):
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    return completion.choices[0].message

In [None]:
import json

with open('/home/rsantos/workspace/personal/projects/noesis-podcast-rag/data/raw/metadata.json') as f:
    raw_data = json.load(f)

In [58]:
processed_data = {}

for i, episode_data in enumerate(raw_data['episodes']['items']):
    processed_episode = {}

    name_prompt = craft_translation_prompt(episode_data['name'])
    name_result = chatgpt_inference(name_prompt)
    processed_episode['name'] = name_result

    description_prompt = craft_translation_prompt(episode_data['description'])
    description_result = chatgpt_inference(description_prompt)
    processed_episode['description'] = description_result

    processed_data[i] = processed_episode


In [69]:
def additional_preprocessing(text):
    processed_text = text.replace("\n", " ")
    processed_text = " ".join(processed_text.split())
    return processed_text

for id, processed_episode in processed_data.items():
    processed_data[id]['name'] = additional_preprocessing(processed_data[id]['name'])
    processed_data[id]['description'] = additional_preprocessing(processed_data[id]['description'])

In [71]:
with open("/home/rsantos/workspace/personal/projects/noesis-podcast-rag/data/processed/episodes_data.json", "w") as fp:
    json.dump(processed_data, fp, ensure_ascii=False, indent=2)