# translation_agent_preparation

In [None]:
import torch

import requests

import pandas as pd

from sklearn.model_selection import train_test_split

from io import StringIO

import time

from imms_log_by_format import Logger



# translation_agent_preparation: download the source CSV, split it into
# train/test sets, forward-fill missing values, upload both splits to
# Artifactory, install the downstream libraries, and log the run.
import os
import subprocess
import sys

# Start timing before any work is done so the reported duration covers the
# whole cell rather than only the final print statement.
start_time = time.time()

# Seconds to wait on each HTTP call; requests waits indefinitely without one.
REQUEST_TIMEOUT = 60

# Step 1: Clear the CUDA cache
torch.cuda.empty_cache()

# Step 2: Load the dataset
url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS-dataset-dev-local/wmt_100.csv'
# The credential is read from the environment so it is never committed with
# the notebook.
token = os.environ.get('ARTIFACTORY_TOKEN')
if not token:
    raise RuntimeError('Set the ARTIFACTORY_TOKEN environment variable before running this cell')
headers = {'Authorization': f'Bearer {token}'}
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # Raise an error if the request failed

# Step 3: Read the dataset into a pandas DataFrame
data = pd.read_csv(StringIO(response.text))

# Step 4: Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 5: Preprocess the dataset by handling missing values
# Rebind instead of filling in place: the split frames are derived from
# `data`, and fillna(method=...) is deprecated in recent pandas releases.
train_data = train_data.ffill()
test_data = test_data.ffill()

# Convert DataFrames to CSV format without saving to disk
train_csv = train_data.to_csv(index=False)
test_csv = test_data.to_csv(index=False)

# Step 6: Upload the datasets
upload_url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS-dataset-dev-local'

# Upload train dataset
# Encode explicitly so the bytes on the wire are UTF-8 regardless of how the
# HTTP stack would encode a str body.
train_response = requests.put(
    f'{upload_url}/llmops_train_set.csv',
    headers=headers,
    data=train_csv.encode('utf-8'),
    timeout=REQUEST_TIMEOUT,
)
train_response.raise_for_status()  # Raise an error if the upload failed

# Upload test dataset
test_response = requests.put(
    f'{upload_url}/llmops_test_set.csv',
    headers=headers,
    data=test_csv.encode('utf-8'),
    timeout=REQUEST_TIMEOUT,
)
test_response.raise_for_status()  # Raise an error if the upload failed

# Step 7: Install required libraries
# Invoke pip through the running interpreter so the packages land in the
# environment this notebook actually uses.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'transformers[torch]', 'SentencePiece', 'rouge_score'],
    check=True,
)

# Step 8: Log the process
pipeline_name = 'genie_translation'
pipeline_id = '1'
pipeline_version = '1'
experiment_id = '9'
run_name = 'genie_translation_1_1_9_translation_agent_preparation'
api_url = 'http://localhost:3290/bpfx/workspace/logs'
# NOTE: from here on `data` is the log payload, no longer the source DataFrame.
data = {
    'train_dataset': 'llmops_train_set.csv',
    'test_dataset': 'llmops_test_set.csv',
    'status': 'uploaded',
}

logger = Logger()
logger.log_to_db(pipeline_name, pipeline_id, pipeline_version, experiment_id, run_name, api_url, data)

# Step 9: Print the completion message and total time taken
print('datasets pushed to the hub')
end_time = time.time()
print(f'Total time taken: {end_time - start_time} seconds')



# agent_model_download

In [None]:
!pip install torch requests

import torch

import requests

import os

import time



# Step 1: Clear the CUDA cache
torch.cuda.empty_cache()

# Step 2: List the files in the specified folder
api_url = "https://artifactory.engine.capgemini.com/artifactory/api/storage/IMMS-model-dev-local/google/flant5-large?list&deep=1&listFolders=0"
# The credential is read from the environment so it is never committed with
# the notebook.
token = os.environ.get("ARTIFACTORY_TOKEN")
if not token:
    raise RuntimeError("Set the ARTIFACTORY_TOKEN environment variable before running this cell")
headers = {"Authorization": f"Bearer {token}"}

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(api_url, headers=headers, timeout=60)
response.raise_for_status()
# The listing response is expected to carry its entries under the 'files' key.
file_list = response.json()['files']



# Step 3: Download all the listed files and save them in ./t5-translation
os.makedirs('./t5-translation', exist_ok=True)

start_time = time.time()

for file_info in file_list:
    file_path = file_info['uri']
    file_url = f"https://artifactory.engine.capgemini.com/artifactory/IMMS-model-dev-local/google/flant5-large{file_path}"
    # Only the final path component is kept, so every file lands directly in
    # ./t5-translation.
    file_name = os.path.basename(file_path)

    # Stream the body to disk in chunks instead of holding the whole file in
    # memory; the timeout keeps a stalled transfer from hanging the cell.
    with requests.get(file_url, headers=headers, stream=True, timeout=60) as file_response:
        file_response.raise_for_status()
        with open(f'./t5-translation/{file_name}', 'wb') as file:
            for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

end_time = time.time()

# Step 4: Print the total time taken to execute the complete code
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")



# Special instructions: run-identification values for the model-download step
pipeline_name = "genie_translation"
pipeline_id, pipeline_version, experiment_id = 1, 1, 9
run_name = "genie_translation_1_1_9_agent_model_download"

# Bundle the identifiers into one mapping, in the same key order as above.
data = dict(
    pipeline_name=pipeline_name,
    pipeline_id=pipeline_id,
    pipeline_version=pipeline_version,
    experiment_id=experiment_id,
    run_name=run_name,
)

print("Pipeline data:", data)



# translation_agent_finetuning

In [None]:
!pip install torch deepspeed transformers datasets pandas requests

!pip install imms_log_by_format



In [None]:
import os

import torch

import deepspeed

import requests

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

from datasets import Dataset

import pandas as pd

from imms_log_by_format import Logger



# Step 1: Empty the CUDA cache before anything is loaded
torch.cuda.empty_cache()

# Step 2: Bring up distributed training through DeepSpeed
# NOTE(review): presumably this relies on rank/world-size settings supplied
# by a launcher — confirm it works when the cell is run from a plain notebook.
deepspeed.init_distributed()



# Step 3: Download the datasets

def download_dataset(url, token, filename, timeout=60):
    """Download ``url`` with bearer-token auth and write the body to ``filename``.

    Args:
        url: Address of the file to fetch.
        token: Bearer token placed in the ``Authorization`` header.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, requests would wait indefinitely on a stalled server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {'Authorization': f'Bearer {token}'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)



# Remote locations of the two splits, keyed by split name.
dataset_urls = {
    'train': 'https://artifactory.engine.capgemini.com/artifactory/IMMS-dataset-dev-local/llmops_train_set.csv',
    'test': 'https://artifactory.engine.capgemini.com/artifactory/IMMS-dataset-dev-local/llmops_test_set.csv',
}
# The credential is read from the environment so it is never committed with
# the notebook.
token = os.environ.get('ARTIFACTORY_TOKEN')
if not token:
    raise RuntimeError('Set the ARTIFACTORY_TOKEN environment variable before running this cell')

# Each split is saved locally as '<name>_set.csv' (train_set.csv, test_set.csv).
for name, url in dataset_urls.items():
    download_dataset(url, token, f'{name}_set.csv')



# Step 4: Load the tokenizer and seq2seq model to fine-tune.
# Both are read from the same local directory.
model_name = './t5-translation'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



# Step 5: Tokenize the input sequences and labels

def tokenize_function(examples):
    """Tokenize a batch of ``en`` sources and ``fr`` targets for seq2seq training.

    Args:
        examples: Batch mapping with ``'en'`` and ``'fr'`` text columns, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized ``en`` inputs with an added ``'labels'`` entry holding
        the ``fr`` token ids. Padding positions are replaced by -100.
    """
    inputs = tokenizer(examples['en'], padding="max_length", truncation=True)
    targets = tokenizer(examples['fr'], padding="max_length", truncation=True)

    # Targets are padded to max length; mask the padding with -100 so the
    # loss skips those positions instead of training the model to emit
    # padding tokens.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets['input_ids']
    ]
    return inputs



# Read the two local CSV splits into DataFrames.
train_data = pd.read_csv('train_set.csv')
test_data = pd.read_csv('test_set.csv')

# Wrap each frame as a Dataset and tokenize it in batches in one chained step.
train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)



# Step 6: Ensure proper alignment between input data and labels

# This is handled by the tokenize_function



# Step 7: DeepSpeed settings — ZeRO stage 2 with the optimizer offloaded to
# CPU; batch size and fp16 are left as "auto".
ds_config = dict(
    train_batch_size="auto",
    gradient_accumulation_steps=1,
    fp16=dict(enabled="auto"),
    zero_optimization=dict(
        stage=2,
        offload_optimizer=dict(device="cpu", pin_memory=True),
        allgather_partitions=True,
        allgather_bucket_size=2e8,
        reduce_scatter=True,
        reduce_bucket_size=2e8,
        overlap_comm=True,
        contiguous_gradients=True,
    ),
)



# Step 8: Handle model sharding if necessary

# This is handled by DeepSpeed's zero optimization configuration



# Step 9: Training configuration — logs every 10 steps to ./logs and hands
# the DeepSpeed settings to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    deepspeed=ds_config,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

# Step 10: Fine-tune, then write the model and tokenizer to ./translation
trainer.train()
model.save_pretrained('./translation')
tokenizer.save_pretrained('./translation')



# Step 11: Report completion to the logging endpoint; a logging failure is
# printed rather than raised.
pipeline_name = 'genie_translation'
pipeline_id = pipeline_version = '1'
experiment_id = '9'
run_name = 'genie_translation_1_1_9_translation_agent_finetuning'
api_url = 'http://localhost:3290/bpfx/workspace/logs'
data = dict(status='completed', message='Fine-tuning completed successfully')

logger = Logger()
try:
    logger.log_to_db(
        pipeline_name,
        pipeline_id,
        pipeline_version,
        experiment_id,
        run_name,
        api_url,
        data,
    )
except Exception as e:
    print(f"Logging failed: {e}")



# translation_agent_evaluation

In [None]:
!pip install torch datasets transformers pandas evaluate

!pip install imms_log_by_format



In [None]:
import torch

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from imms_log_by_format import Logger

import pandas as pd

import evaluate



import os

# Step 1: Clear the CUDA cache
torch.cuda.empty_cache()

# Step 2: Load the dataset
dataset_path = "llmops_test_set.csv"
# The fine-tuning step in this notebook saves the test split locally as
# 'test_set.csv'; fall back to that file when the configured name is absent
# so evaluation can run straight after fine-tuning.
if not os.path.exists(dataset_path) and os.path.exists("test_set.csv"):
    dataset_path = "test_set.csv"
# A single CSV passed as data_files is exposed under the 'train' split.
dataset = load_dataset('csv', data_files=dataset_path, split='train')

# Step 3: Tokenize the dataset
model_name = "./translation"
tokenizer = AutoTokenizer.from_pretrained(model_name)



def tokenize_function(examples):
    """Tokenize the ``en`` column of a batch, padded to max length and truncated."""
    source_texts = examples['en']
    return tokenizer(source_texts, padding="max_length", truncation=True)



# Tokenize the whole evaluation set in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 4: Load the fine-tuned model, then move it onto the GPU
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to('cuda')



# Step 5: Generate predictions

def generate_predictions(batch):
    """Translate the ``en`` texts of ``batch`` and store them under ``predicted``.

    The texts are tokenized into padded, truncated PyTorch tensors on the
    GPU, decoded with beam search (5 beams, max length 50), and converted
    back to strings without special tokens.

    Returns:
        The same ``batch`` mapping with the ``'predicted'`` key added.
    """
    encoded = tokenizer(
        batch['en'],
        return_tensors='pt',
        padding="max_length",
        truncation=True,
    ).to('cuda')
    # NOTE(review): only the input ids are forwarded; the tokenizer's
    # attention mask is not passed to generate() — confirm this is intended.
    generated_ids = model.generate(encoded['input_ids'], max_length=50, num_beams=5)
    batch['predicted'] = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return batch



# Generate translations eight rows at a time.
predictions = tokenized_dataset.map(
    generate_predictions,
    batched=True,
    batch_size=8,
)

# Step 6: Prefix the evaluation prompt
# NOTE(review): this runs after generation, so the prefix only changes the
# stored 'en' column, not the text the model saw — confirm this is intended.
predictions = predictions.map(
    lambda row: {"en": "Translate from en to fr: " + row['en']}
)

# Step 7: Write every column, predicted and actual, to a CSV file
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv('predictions.csv', index=False)



# Step 8: Score the predictions against the 'fr' references with BLEU and ROUGE
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

# Materialize both columns as plain lists for the metric calls.
references = list(predictions['fr'])
predictions_list = list(predictions['predicted'])

bleu_score = bleu.compute(predictions=predictions_list, references=references)
rouge_score = rouge.compute(predictions=predictions_list, references=references)



# Step 9: Send both metric results to the logging endpoint; a logging
# failure is printed rather than raised.
pipeline_name = "genie_translation"
pipeline_id = pipeline_version = "1"
experiment_id = "9"
run_name = "genie_translation_1_1_9_translation_agent_evaluation"
api_url = "http://localhost:3290/bpfx/workspace/logs"

data = dict(BLEU=bleu_score, ROUGE=rouge_score)

logger = Logger()
try:
    logger.log_to_db(
        pipeline_name,
        pipeline_id,
        pipeline_version,
        experiment_id,
        run_name,
        api_url,
        data,
    )
except Exception as e:
    print(f"Logging failed: {e}")

# Step 10: Show source, reference and predicted text side by side
print(predictions_df[['en', 'fr', 'predicted']])



# translation_agent_deployment

In [None]:
!pip install requests



import os

import requests

import time



# Run-identification values for the deployment step
pipeline_name = 'genie_translation'
pipeline_id, pipeline_version, experiment_id = 1, 1, 9
run_name = 'genie_translation_1_1_9_translation_agent_deployment'

# Bundle the identifiers into one mapping, in the same key order as above.
data = dict(
    pipeline_name=pipeline_name,
    pipeline_id=pipeline_id,
    pipeline_version=pipeline_version,
    experiment_id=experiment_id,
    run_name=run_name,
)



# URL and token
url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS-model-dev-local/google/flant5-large-v1'
# The credential is read from the environment so it is never committed with
# the notebook.
token = os.environ.get('ARTIFACTORY_TOKEN')
if not token:
    raise RuntimeError('Set the ARTIFACTORY_TOKEN environment variable before running this cell')

# Directory containing the model files
directory = './translation'

# Start the timer
start_time = time.time()

# Built once; the same header is sent with every upload.
auth_headers = {'Authorization': f'Bearer {token}'}
# Names of files whose upload did not return 201, for the final summary.
failed_uploads = []

# Iterate through the files in the directory and upload each one
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue  # Sub-directories are skipped.

    # The open file object is passed as the request body; the timeout is
    # (connect, read) seconds so a stalled server cannot hang the cell.
    with open(file_path, 'rb') as f:
        response = requests.put(
            f"{url}/{filename}",
            headers=auth_headers,
            data=f,
            timeout=(10, 300),
        )

    if response.status_code == 201:
        print(f"Successfully uploaded {filename}")
    else:
        print(f"Failed to upload {filename}. Status code: {response.status_code}")
        failed_uploads.append(filename)

# End the timer
end_time = time.time()

# Calculate the total time taken
total_time = end_time - start_time

# Report success only when every file was accepted; otherwise list the
# files that still need to be uploaded.
if failed_uploads:
    print(f"Model upload incomplete; failed files: {', '.join(failed_uploads)}")
else:
    print(f"Model uploaded to hub: {url}")
print(f"Total time taken: {total_time} seconds")

