# agent_preparation

In [None]:
!pip install pandas torch scikit-learn requests imms-log-by-format

import pandas as pd

import torch

from sklearn.model_selection import train_test_split

import requests

from io import StringIO

from imms_log_by_format import Logger



# Constants
import os  # needed only for the credential override below; this cell has no other `os` import

# Source CSV to download and the repository folder the train/test splits are uploaded to.
DATASET_URL = 'https://artifactory.engine.capgemini.com/artifactory/IMMS-dataset-dev-local/auto_mpg_dataset.csv'
UPLOAD_URL = 'https://artifactory.engine.capgemini.com/artifactory/IMMS_datasets'

# NOTE(security): an access token was committed in this notebook. Supply it through the
# ARTIFACTORY_TOKEN environment variable instead; the literal fallback is kept only so
# existing runs keep working and should be rotated and then removed.
TOKEN = os.environ.get(
    'ARTIFACTORY_TOKEN',
    'AKCpBtMeFndD5dudesorJSq64URz2WPtU3jfW7DqLwfDyD51vtneZkih6yNrFugBmxKgyFQ9q',
)

# Bearer-token header sent with every request made by this cell.
HEADERS = {'Authorization': f'Bearer {TOKEN}'}



# Logger setup
# Identifiers that tag the log record written at the end of this step.
pipeline_name, pipeline_id, pipeline_version, experiment_id = (
    'regression_pipeline',
    '3',
    '1',
    '12',
)

# Run name layout: <pipeline>_<id>_<version>_<experiment>_<step>.
run_name = '_'.join(
    [pipeline_name, pipeline_id, pipeline_version, experiment_id, 'agent_preparation']
)

# Endpoint handed to Logger.log_to_db.
api_url = 'http://localhost:3290/bpfx/workspace/logs'



# Load dataset
# Download the source CSV and parse the response text into a DataFrame.
try:
    # The timeout keeps the cell from hanging indefinitely on an unresponsive server;
    # timeout errors are RequestException subclasses, so the handler below covers them.
    response = requests.get(DATASET_URL, headers=HEADERS, timeout=60)
    response.raise_for_status()
    data = pd.read_csv(StringIO(response.text))
except requests.exceptions.RequestException as e:
    print(f'Error loading dataset: {e}')
    # `exit` is the interactive-shell helper and is not guaranteed to halt a notebook
    # cell; raising SystemExit stops execution so later steps never see a missing `data`.
    raise SystemExit(1)



# Preprocess dataset
# Discard rows with missing values, then keep only the float/int columns
# (anything else, e.g. categorical text, is dropped).
data = data.dropna().select_dtypes(include=[float, int])

# Split dataset
# Hold out 20% of the rows for testing; the seed is fixed at 42.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert datasets to CSV format in memory
# The row index is not written, so the CSVs contain only the data columns.
train_csv, test_csv = (
    frame.to_csv(index=False) for frame in (train_data, test_data)
)



# Upload datasets
def upload_dataset(csv_data, filename, timeout=60):
    """PUT an in-memory CSV to ``UPLOAD_URL/<filename>``.

    Args:
        csv_data: CSV text sent as the request body.
        filename: Name appended to ``UPLOAD_URL`` to form the target URL.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        SystemExit: With code 1 when the request fails or the server answers
            with an error status; the error is printed first.
    """
    try:
        response = requests.put(
            f'{UPLOAD_URL}/{filename}',
            headers=HEADERS,
            data=csv_data,
            timeout=timeout,  # avoid blocking forever on a stalled connection
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'Error uploading {filename}: {e}')
        # `exit` is the interactive-shell helper and is not guaranteed to halt a
        # notebook cell; raise SystemExit explicitly so the run really stops here.
        raise SystemExit(1)



# Push the training split first, then the test split, under their fixed file names.
for csv_payload, target_name in (
    (train_csv, 'llmops_train_set.csv'),
    (test_csv, 'llmops_test_set.csv'),
):
    upload_dataset(csv_payload, target_name)

print('datasets pushed to the hub')



# Log the process
logger = Logger()

# Run identifiers: passed as keyword arguments to log_to_db and also embedded in the payload.
run_info = {
    'pipeline_name': pipeline_name,
    'pipeline_id': pipeline_id,
    'pipeline_version': pipeline_version,
    'experiment_id': experiment_id,
    'run_name': run_name,
}

# Payload: the identifiers plus a completion marker.
# Note that this rebinds `data`, which previously held the DataFrame.
data = {**run_info, 'status': 'completed'}

# Logging is best-effort: a failure is reported but does not abort the step.
try:
    logger.log_to_db(**run_info, api_url=api_url, data=data)
except Exception as e:
    print(f'Error logging to database: {e}')



# agent_model_download

In [None]:
!pip install torch requests



import torch

import requests

import os

import time



# Step 1: Clear the CUDA cache
torch.cuda.empty_cache()

# Variables as per special instructions
# In this cell the numeric identifiers are integers, not strings.
pipeline_name = 'regression_pipeline'
pipeline_id, pipeline_version, experiment_id = 3, 1, 12

# Run name layout: <pipeline>_<id>_<version>_<experiment>_<step>.
run_name = f'{pipeline_name}_{pipeline_id}_{pipeline_version}_{experiment_id}_agent_model_download'

# Run identifiers collected into a single dict.
data = dict(
    pipeline_name=pipeline_name,
    pipeline_id=pipeline_id,
    pipeline_version=pipeline_version,
    experiment_id=experiment_id,
    run_name=run_name,
)



# Step 2: List the files in the folder
# Storage-API listing of the model folder (deep listing, folders excluded per the query string).
api_url = "https://artifactory.engine.capgemini.com/artifactory/api/storage/IMMS-model-dev-local/google/flant5-large?list&deep=1&listFolders=0"

# NOTE(security): an access token was committed in this notebook. Supply it through the
# ARTIFACTORY_TOKEN environment variable instead; the literal fallback is kept only so
# existing runs keep working and should be rotated and then removed.
_artifactory_token = os.environ.get(
    'ARTIFACTORY_TOKEN',
    'AKCpBtMeFndD5dudesorJSq64URz2WPtU3jfW7DqLwfDyD51vtneZkih6yNrFugBmxKgyFQ9q',
)
headers = {
    'Authorization': f'Bearer {_artifactory_token}'
}

# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(api_url, headers=headers, timeout=60)
response.raise_for_status()

# Each entry of 'files' is read later for its 'uri' (path relative to the listed folder).
file_list = response.json()['files']



# Step 3: Download all the listed files
download_dir = './t5-translation'
os.makedirs(download_dir, exist_ok=True)

start_time = time.time()

for file_info in file_list:
    file_path = file_info['uri']
    file_url = f"https://artifactory.engine.capgemini.com/artifactory/IMMS-model-dev-local/google/flant5-large{file_path}"
    # NOTE(review): only the base name is kept, so files from different sub-folders
    # that share a name would overwrite each other — confirm the folder is flat.
    file_name = os.path.basename(file_path)

    # Stream the body to disk in chunks instead of buffering the whole file in memory
    # (model weight files can be several GB). The timeout applies to the connection
    # and to each read, not to the total transfer time.
    with requests.get(file_url, headers=headers, stream=True, timeout=60) as file_response:
        file_response.raise_for_status()
        with open(os.path.join(download_dir, file_name), 'wb') as file:
            for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

# Step 4: Print the total time taken
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")



# agent_regression_finetuning

In [None]:
!pip install pandas scikit-learn requests

!pip install imms_log_by_format

import os

import requests

import pandas as pd

import pickle

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

from imms_log_by_format import Logger



# Step 1: Download the datasets
# NOTE(security): an access token was committed in this notebook. Supply it through the
# ARTIFACTORY_TOKEN environment variable instead; the literal fallback is kept only so
# existing runs keep working and should be rotated and then removed.
token = os.environ.get(
    'ARTIFACTORY_TOKEN',
    'AKCpBtMeFndD5dudesorJSq64URz2WPtU3jfW7DqLwfDyD51vtneZkih6yNrFugBmxKgyFQ9q',
)
headers = {'Authorization': f'Bearer {token}'}
train_url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS_datasets/llmops_train_set.csv'
test_url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS_datasets/llmops_test_set.csv'

# Fail fast on HTTP errors: without this check an error page would be saved as the CSV
# and only surface later as a confusing parsing or missing-column error.
train_response = requests.get(train_url, headers=headers, timeout=60)
train_response.raise_for_status()
test_response = requests.get(test_url, headers=headers, timeout=60)
test_response.raise_for_status()

# Persist both splits next to the notebook; the next step reads them back from disk.
with open('llmops_train_set.csv', 'wb') as f:
    f.write(train_response.content)

with open('llmops_test_set.csv', 'wb') as f:
    f.write(test_response.content)



# Step 2: Load the datasets
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('llmops_train_set.csv', 'llmops_test_set.csv')
)

# Step 4 (inputs): 'mpg' is the regression target; every other column is a feature.
y_train = train_data['mpg']
X_train = train_data.drop(columns='mpg')

# Step 3 + Step 4: Initialize the model with its starting hyperparameters and train it.
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = GradientBoostingRegressor(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=6,
).fit(X_train, y_train)



# Step 5: Perform a grid search to fine-tune the hyperparameters
# Every combination of these values is evaluated with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[10, 50, 100],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 9],
    random_state=[0, 42],
)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # use all available cores
)
grid_search.fit(X_train, y_train)

# Step 6: Display the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters found by grid search:", best_params)

# Step 7: Save the fine-tuned model
# The best estimator is pickled into ./results, which is created when missing.
fine_tuned_model = grid_search.best_estimator_
os.makedirs('./results', exist_ok=True)
with open('./results/llmops_finetuned_model.pkl', 'wb') as f:
    f.write(pickle.dumps(fine_tuned_model))



# Step 8: Log the results using the Logger class
pipeline_name, pipeline_id, pipeline_version, experiment_id = (
    'regression_pipeline',
    '3',
    '1',
    '12',
)
run_name = '_'.join(
    [pipeline_name, pipeline_id, pipeline_version, experiment_id, 'agent_regression_finetuning']
)
api_url = 'http://localhost:3290/bpfx/workspace/logs'

# Payload: the winning hyperparameters and the estimator type.
data = dict(best_params=best_params, model_type='GradientBoostingRegressor')

logger = Logger()
# Positional argument order expected by the call below.
log_args = (pipeline_name, pipeline_id, pipeline_version, experiment_id, run_name, api_url, data)

# Logging is best-effort: a failure is reported but does not abort the step.
try:
    logger.log_to_db(*log_args)
except Exception as e:
    print(f"Error logging to database: {e}")

# Step 9: Print the text 'model saved locally'
print('model saved locally')



# agent_model_evaluation

In [None]:
!pip install pandas requests scikit-learn transformers torch



import os

import pandas as pd

import requests

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from transformers import T5Tokenizer, T5ForConditionalGeneration

import torch

from imms_log_by_format import Logger

import numpy as np



# Step 1: Load the dataset
url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS_datasets/llmops_test_set.csv'

# NOTE(security): an access token was committed in this notebook. Supply it through the
# ARTIFACTORY_TOKEN environment variable instead; the literal fallback is kept only so
# existing runs keep working and should be rotated and then removed.
token = os.environ.get(
    'ARTIFACTORY_TOKEN',
    'AKCpBtMeFndD5dudesorJSq64URz2WPtU3jfW7DqLwfDyD51vtneZkih6yNrFugBmxKgyFQ9q',
)
headers = {'Authorization': f'Bearer {token}'}

# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()

# Save a local copy of the test split, then read it back as a DataFrame.
with open('llmops_test_set.csv', 'wb') as f:
    f.write(response.content)

df = pd.read_csv('llmops_test_set.csv')



# Step 2: Load the pre-trained model
# The model-download step of this notebook stores the files in './t5-translation';
# the previous value './translation' is not created by any step in this notebook.
model_path = './t5-translation'

# Fail early with an explicit message when the download step has not been run.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"The model path {model_path} does not exist. Please check the path and try again.")

try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
except Exception as e:
    # Chain the original exception so the underlying cause stays in the traceback.
    raise EnvironmentError(f"Failed to load the model from {model_path}. Error: {e}") from e



# Step 3: Evaluate the model
# 'mpg' is the target; the remaining columns are the model inputs.
y_test = df['mpg']
X_test = df.drop(columns=['mpg'])

# Convert the test data to string format
# Each row becomes one space-separated string of its stringified values.
X_test_str = X_test.astype(str).apply(lambda row: ' '.join(row), axis=1)


def _leading_float(text):
    """Return the first whitespace-separated token of *text* as a float, or NaN.

    NaN is returned when *text* has no tokens or the first token is not numeric.
    """
    try:
        return float(text.split()[0])
    except (ValueError, IndexError):
        return float('nan')


# Tokenize the test data
# All rows are encoded and generated in a single padded/truncated batch.
inputs = tokenizer(X_test_str.tolist(), return_tensors='pt', padding=True, truncation=True)
outputs = model.generate(**inputs, max_new_tokens=50)

# One prediction per generated sequence, in the same order as the test rows.
predictions = [
    _leading_float(tokenizer.decode(sequence, skip_special_tokens=True))
    for sequence in outputs
]



# Filter out NaN values
# NaN marks rows whose generated text could not be parsed as a number.
_predictions_array = np.asarray(predictions, dtype=float)
valid_indices = ~np.isnan(_predictions_array)
y_test_valid = y_test[valid_indices]
predictions_valid = _predictions_array[valid_indices]

# Calculate metrics
if predictions_valid.size == 0:
    # The sklearn metric functions raise on empty input; report NaN instead so the
    # run can still be logged, mirroring how unparsable predictions are represented.
    print('No numeric predictions could be parsed; metrics are reported as NaN.')
    mse = mae = r2 = float('nan')
else:
    mse = mean_squared_error(y_test_valid, predictions_valid)
    mae = mean_absolute_error(y_test_valid, predictions_valid)
    r2 = r2_score(y_test_valid, predictions_valid)



# Step 4: Make a prediction for the first row
first_row = X_test_str.iloc[0]

# Encode and generate for this single input string.
inputs = tokenizer(first_row, return_tensors='pt', padding=True, truncation=True)
outputs = model.generate(**inputs, max_new_tokens=50)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# The prediction is the first whitespace-separated token of the generated text;
# NaN is used when there is no token or it is not numeric.
leading_tokens = decoded_output.split()[:1]
try:
    first_row_prediction = float(leading_tokens[0])
except (ValueError, IndexError):
    first_row_prediction = float('nan')



# Step 5: Log the results
pipeline_name, pipeline_id, pipeline_version, experiment_id = (
    'regression_pipeline',
    '3',
    '1',
    '12',
)
run_name = '_'.join(
    [pipeline_name, pipeline_id, pipeline_version, experiment_id, 'agent_model_evaluation']
)
api_url = 'http://localhost:3290/bpfx/workspace/logs'

# Payload: the three regression metrics plus the single-row prediction.
data = dict(
    mean_squared_error=mse,
    mean_absolute_error=mae,
    r2_score=r2,
    first_row_prediction=first_row_prediction,
)

logger = Logger()
# Positional argument order expected by the call below.
log_args = (pipeline_name, pipeline_id, pipeline_version, experiment_id, run_name, api_url, data)

# The confirmation line is printed only when logging succeeded.
try:
    logger.log_to_db(*log_args)
    print(f"Model {model_path} that exists in local folder is evaluated for mean_squared_error, mean_absolute_error, r2_score.")
except Exception as e:
    print(f"Failed to log data: {e}")



# agent_deployment

In [None]:
!pip install requests

!pip install imms_log_by_format



In [None]:
import os

import pickle

import requests

from imms_log_by_format import Logger



# Step 1: Load the model
# Unpickle the estimator saved under ./results.
# NOTE: pickle executes code while loading; only load files produced by this pipeline.
model_path = './results/llmops_finetuned_model.pkl'
with open(model_path, mode='rb') as file:
    model = pickle.loads(file.read())



# Step 2: Upload the model and config file
upload_url = 'https://artifactory.engine.capgemini.com/artifactory/IMMS-test/bert/'

# NOTE(security): an access token was committed in this notebook. Supply it through the
# ARTIFACTORY_TOKEN environment variable instead; the literal fallback is kept only so
# existing runs keep working and should be rotated and then removed.
token = os.environ.get(
    'ARTIFACTORY_TOKEN',
    'AKCpBtMeFndD5dudesorJSq64URz2WPtU3jfW7DqLwfDyD51vtneZkih6yNrFugBmxKgyFQ9q',
)
headers = {'Authorization': f'Bearer {token}'}

# Upload model
# The file is sent as the raw request body (`data=`). Passing it through `files=` would
# wrap it in a multipart/form-data envelope, and a PUT deploy stores the body as-is, so
# the stored artifact would no longer be a loadable pickle. The `with` block also
# guarantees the handle is closed when the request raises.
with open(model_path, 'rb') as model_fh:
    response_model = requests.put(
        upload_url + 'llmops_finetuned_model.pkl',
        data=model_fh,
        headers=headers,
        timeout=300,
    )

# Check if the model upload was successful
if response_model.status_code == 201:
    print("Model uploaded successfully.")
else:
    print(f"Failed to upload model. Status code: {response_model.status_code}")
    print(response_model.text)



# Check if config file exists
config_path = './results/config.json'
if os.path.exists(config_path):
    # Upload config file
    # Send the raw bytes as the request body (`data=`); `files=` would wrap the content
    # in a multipart/form-data envelope that a PUT deploy stores verbatim. The `with`
    # block closes the handle even when the request raises.
    with open(config_path, 'rb') as config_fh:
        response_config = requests.put(
            upload_url + 'config.json',
            data=config_fh,
            headers=headers,
            timeout=300,
        )

    # Check if the config upload was successful
    if response_config.status_code == 201:
        print("Config file uploaded successfully.")
    else:
        print(f"Failed to upload config file. Status code: {response_config.status_code}")
        print(response_config.text)
else:
    print("Config file not found.")
    # None tells the logging step that no upload was attempted.
    response_config = None



# Step 3: Log the details using Logger
pipeline_name = 'regression_pipeline'
pipeline_id = '3'
pipeline_version = '1'
experiment_id = '12'
run_name = 'regression_pipeline_3_1_12_agent_deployment'
api_url = 'http://localhost:3290/bpfx/workspace/logs'

# Payload: HTTP status of each upload.
# The config entry must be tested with `is not None`: a requests Response is falsy for
# any 4xx/5xx status, so a plain truthiness test would report a failed upload as
# 'File not found' instead of recording its status code.
data = {
    'model_upload_status': response_model.status_code,
    'config_upload_status': response_config.status_code if response_config is not None else 'File not found'
}

logger = Logger()
# Logging is best-effort: a failure is reported but does not abort the step.
try:
    logger.log_to_db(pipeline_name, pipeline_id, pipeline_version, experiment_id, run_name, api_url, data)
    print("Logging successful.")
except Exception as e:
    print(f"Failed to log data: {e}")

