## PLS example

The FHI project uses partial least squares (PLS) regression to try to predict drug concentrations.
It uses image segmentation to extract the data and works with this R project, which generates the PLSR coefficients: https://github.com/sortijas/PAD

In this example we will download images from the `FHI2020` project and calculate their concentrations.

The workflow is
1. Grab some PLS coefficients. These can be generated by the R program or downloaded from the PADs website.
1. Download and loop through the images and run the image segmentation/PLS prediction code.

In [1]:
import sys, os

sys.path.append(os.path.abspath('../src'))

import pad_helper 

# set url for pad server
pad_url = 'https://pad.crc.nd.edu/'

# You need to ask Chris nicely for an API key.
# Supply it through the PAD_API_KEY environment variable so the key does not
# have to live in the notebook; the literal below is only a fallback that keeps
# existing runs working.
# NOTE(review): a key that has been committed should be treated as leaked --
# rotate it and then drop the fallback.
API_KEY = os.environ.get('PAD_API_KEY', '5NWT4K7IS60WMLR3J2LV')

In [2]:
# Load the data into pandas if successful
import pandas as pd

# Let's grab some data: in this example we download all records for project "FHI2020"
my_data = pad_helper.query_pad_database("FHI2020", API_KEY)

# Check whether we were successful. Every failure path prints a message, so a
# missing `df` in a later cell can be traced back to this one.
if not my_data or 'status' not in my_data:
    print("Error: no usable response received from the PAD server.")
elif my_data['status'] == 'ko':
    print("Error:", my_data['error_description'])
elif 'data' in my_data:
    # If successful we can create a pandas table
    df = pd.DataFrame(my_data['data'])
    if 'headers' in my_data:
        df.columns = my_data['headers']
    print("Data loaded!")
else:
    print("Empty query!")


Data loaded!


In [3]:
def standardize_names(name):
    """Return ``name`` in lower case with every space replaced by a hyphen."""
    lowered = name.lower()
    # Splitting on a single literal space and re-joining with '-' swaps each
    # space for a hyphen, consecutive spaces included.
    return '-'.join(lowered.split(' '))
    
# Normalise every sample name in df (lower case, spaces replaced by hyphens)
df['sample_name'] = df['sample_name'].apply(standardize_names)    

In [4]:
# Load the CSV listing the test images (presumably the held-out test split --
# only its 'Image' column is used here)
test_df = pd.read_csv('../data/10_region_rgb__test.csv')

# Keep only the rows of df whose 'id' appears in the 'Image' column of test_df
df_test = df[df['id'].isin(test_df['Image'])]
df_test

Unnamed: 0,id,sample_name,test_name,user_name,date_of_creation,picture_1_location,processed_file_location,processing_date,camera_type_1,notes,category,sample_id,quantity
18,15229,amoxicillin,12LanePADKenya2015,Marya Lieberman,2020-05-18T09:50:09,/var/www/html/images/padimages/raw/10000/15229...,/var/www/html/images/padimages/processed/10000...,2020-05-18T09:50:09,Google Pixel 3a,"emailed rectified pad image. batch=n/a, quanti...",FHI2020,53848,100
22,15233,amoxicillin,12LanePADKenya2015,Marya Lieberman,2020-05-18T09:50:19,/var/www/html/images/padimages/raw/10000/15233...,/var/www/html/images/padimages/processed/10000...,2020-05-18T09:50:19,Google Pixel 3a,"emailed rectified pad image. batch=n/a, quanti...",FHI2020,53697,100
25,15236,amoxicillin,12LanePADKenya2015,Marya Lieberman,2020-05-18T09:50:25,/var/www/html/images/padimages/raw/10000/15236...,/var/www/html/images/padimages/processed/10000...,2020-05-18T09:50:25,Google Pixel 3a,"emailed rectified pad image. batch=n/a, quanti...",FHI2020,53703,100
27,15238,amoxicillin,12LanePADKenya2015,Marya Lieberman,2020-05-18T09:50:30,/var/www/html/images/padimages/raw/10000/15238...,/var/www/html/images/padimages/processed/10000...,2020-05-18T09:50:30,Google Pixel 3a,"emailed rectified pad image. batch=n/a, quanti...",FHI2020,53703,100
37,15253,amoxicillin,12LanePADKenya2015,Marya Lieberman,2020-05-18T10:40:18,/var/www/html/images/padimages/raw/10000/15253...,/var/www/html/images/padimages/processed/10000...,2020-05-18T10:40:18,Google Pixel 3a,"emailed rectified pad image. batch=n/a, quanti...",FHI2020,53712,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9669,25637,ripe,12LanePADKenya2015,api-OUDPXFH17PEGKUW3FM4Z,2020-07-10T15:15:03,/var/www/html/images/padimages/raw/20000/25637...,/var/www/html/images/padimages/processed/20000...,2020-07-10T15:15:03,samsung SM-A505F,"batch=n/a, s",FHI2020,55075,50
9670,25638,ripe,12LanePADKenya2015,api-OUDPXFH17PEGKUW3FM4Z,2020-07-10T15:15:08,/var/www/html/images/padimages/raw/20000/25638...,/var/www/html/images/padimages/processed/20000...,2020-07-10T15:15:08,samsung SM-A505F,"batch=n/a, s s",FHI2020,55492,20
9674,25642,ripe,12LanePADKenya2015,api-OUDPXFH17PEGKUW3FM4Z,2020-07-10T15:15:28,/var/www/html/images/padimages/raw/20000/25642...,/var/www/html/images/padimages/processed/20000...,2020-07-10T15:15:28,samsung SM-A505F,"batch=n/a, s s",FHI2020,55075,50
9678,25646,ripe,12LanePADKenya2015,api-OUDPXFH17PEGKUW3FM4Z,2020-07-10T15:15:47,/var/www/html/images/padimages/raw/20000/25646...,/var/www/html/images/padimages/processed/20000...,2020-07-10T15:15:47,samsung SM-A505F,"batch=n/a, s s",FHI2020,55432,80


In [5]:
# Build 'results_pls' from the relevant columns of 'df_test':
#   'sample_name' becomes 'actual_class', 'quantity' becomes 'actual_quantity',
#   and an empty 'pred_quantity' column (all None) is added for later filling.
results_pls = (
    df_test[['id', 'sample_id', 'sample_name', 'quantity']]
    .rename(columns={'sample_name': 'actual_class', 'quantity': 'actual_quantity'})
    .assign(pred_quantity=None)
)


results_pls

Unnamed: 0,id,sample_id,actual_class,actual_quantity,pred_quantity
18,15229,53848,amoxicillin,100,
22,15233,53697,amoxicillin,100,
25,15236,53703,amoxicillin,100,
27,15238,53703,amoxicillin,100,
37,15253,53712,amoxicillin,80,
...,...,...,...,...,...
9669,25637,55075,ripe,50,
9670,25638,55492,ripe,20,
9674,25642,55075,ripe,50,
9678,25646,55432,ripe,80,


In [6]:
# STAGE 1

# Let's calculate some PLS values.
# First we need some coefficients; this grabs them from the server:
pls_url = 'https://pad.crc.nd.edu/neuralnetworks/pls/24fhiPLS1quantity/1.0/24fhiPLS1quantity.csv'

# Call the helper function and report the outcome either way, so a failed
# download is noticed here rather than when the coefficients are loaded.
if pad_helper.pad_download(pls_url):
    print(pls_url, "downloaded.")
else:
    print("Error downloading", pls_url)

https://pad.crc.nd.edu/neuralnetworks/pls/24fhiPLS1quantity/1.0/24fhiPLS1quantity.csv downloaded.


In [7]:
import csv
import cv2 as cv
import numpy as np
import urllib3
from PIL import Image, ImageFile
import regionRoutine

# The image downloads below are made with verify=False, so silence the
# InsecureRequestWarning urllib3 would otherwise emit for each request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Let Pillow load image files that are truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def download_file(url, filename, images_path, timeout=60):
    """Download ``url`` and save it as ``filename`` inside ``images_path``.

    Args:
        url: Address of the file to fetch.
        filename: Name of the local file to write.
        images_path: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Exception: If the server answers with a status other than 200, or if
            the request or the file write fails. The error is printed once
            and then re-raised so the caller notices it.
    """
    # Imported here so the function does not rely on another cell having
    # imported `requests` before it is called.
    import requests

    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # Using the response as a context manager releases the streamed
        # connection on every path, including errors.
        with requests.get(url, stream=True, verify=False, timeout=timeout) as response:
            if response.status_code != 200:
                raise Exception(f"Failed to download the file. URL: {url} returned status code: {response.status_code}")

            path = os.path.join(images_path, filename)
            with open(path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except Exception as e:
        # Log the failure once, then let the caller see the exception
        print(f"An error occurred while downloading the file: {e}")
        raise

def convert_from_image_to_cv2(img: Image.Image) -> np.ndarray:
    """Convert a Pillow image into a BGR array for OpenCV.

    The image is converted to RGB mode first, so palette or greyscale images
    also produce a 3-channel array before the RGB -> BGR channel swap.

    Args:
        img: Pillow image to convert.

    Returns:
        The pixel data as a numpy array in BGR channel order.
    """
    rgb_pixels = np.array(img.convert('RGB'))
    return cv.cvtColor(rgb_pixels, cv.COLOR_RGB2BGR)

class pls:
    """Predict a drug concentration from a PAD image using PLS coefficients.

    The coefficients file is a CSV in which each row holds a drug name followed
    by its numeric coefficients: the offset first, then one coefficient per
    lane / region / colour combination, in the order ``quantity`` walks them.
    """

    # Feature layout walked by quantity(); the coefficient order must follow
    # lane -> region -> colour.
    LANES = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L')
    REGIONS = 10
    COLORS = ('R', 'G', 'B')

    def __init__(self, coefficients_file):
        """Load the coefficients from ``coefficients_file`` into ``self.coeff``.

        Loading is best effort: on failure the error is printed and
        ``self.coeff`` keeps whatever rows were read before the failure.
        """
        # Assigned before the file is opened so the attribute always exists.
        self.coeff = {}
        try:
            # newline='' is what the csv module asks for when reading files
            with open(coefficients_file, newline='') as csvcoeffs:
                for row in csv.reader(csvcoeffs):
                    # Blank lines arrive as empty rows; skip them instead of
                    # letting them abort the whole load.
                    if not row:
                        continue
                    # Names are stored lower-cased because quantity() looks
                    # them up with drug.lower().
                    name = row[0].strip().lower()
                    self.coeff[name] = [float(value) for value in row[1:]]
        except Exception as e:
            print("Error",e, "loading pls coefficients", coefficients_file)

    def quantity(self, in_file, drug):
        """Return the PLS concentration predicted for ``drug`` from ``in_file``.

        Args:
            in_file: Path of the image to analyse.
            drug: Drug name; matched case-insensitively against the
                coefficients.

        Returns:
            The predicted concentration, -1 when ``drug`` has no coefficients,
            or -1.0 when the analysis fails. Both failure cases print a
            message.
        """
        try:
            drug_key = drug.lower()

            # Look the drug up first: without coefficients there is no point in
            # running the image analysis.
            if drug_key not in self.coeff:
                print(drug_key, "--- NOT IN COEFFICIENTS FILE ---")
                return -1

            drug_coeff = self.coeff[drug_key]

            # One offset plus one coefficient per lane/region/colour feature
            n_expected = 1 + len(self.LANES) * self.REGIONS * len(self.COLORS)
            if len(drug_coeff) < n_expected:
                raise Exception(f"Expected {n_expected} coefficients for {drug_key}, found {len(drug_coeff)}.")

            # grab image
            img = cv.imread(in_file)

            if img is None:
                print("Converting img.. ", in_file)
                # OpenCV could not read it: read with Pillow and convert,
                # closing the file handle afterwards
                with Image.open(in_file) as img_pil:
                    img = convert_from_image_to_cv2(img_pil)

            if img is None:
                raise Exception(f"Failed to load the file. URL: {in_file}.")

            # Region routine returns the feature dictionary, keyed like 'A1-R'
            # (the trailing 10 is presumably the number of regions -- confirm
            # against regionRoutine)
            f = regionRoutine.fullRoutine(img, regionRoutine.intFind.findMaxIntensitiesFiltered, {}, True, 10)

            # start with offset
            pls_concentration = drug_coeff[0]

            coeff_index = 1

            for letter in self.LANES:
                for region in range(1, self.REGIONS + 1):
                    for color_letter in self.COLORS:
                        pixval = f[letter + str(region) + '-' + color_letter]
                        pls_concentration += float(pixval) * drug_coeff[coeff_index]
                        coeff_index += 1

            return pls_concentration

        except Exception as e:
            print("Error",e, "pls analyzing image", in_file, "with", drug)
            return -1.
        
        
# for api in sample_names:
#     if api.lower() in pls_conc.coeff:
#         print(api, pls_conc.coeff[api.lower()])
#     else:
#         print(api.lower(), 0.0)


# def fix_img(name, img_dir):
#     img_path = os.path.join(img_dir, name)
#     img_pil = Image.open(img_path)
#     img_pil.save(img_path)     

In [8]:
# STAGE 2
import os, requests

# Create a PLS class instance
import pad_analysis

# name of the coefficients file: the last path component of the download URL
pls_file = os.path.basename(pls_url)
print(pls_file)

# create pls instance from that file
pls_conc = pls(pls_file)

2024-09-30 08:38:09.927632: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-30 08:38:09.977308: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-30 08:38:09.977348: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-30 08:38:09.977378: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-30 08:38:09.986762: I tensorflow/core/platform/cpu_feature_g

24fhiPLS1quantity.csv


In [9]:
# # loop over rows and get the first 10 to predict the concentration

# # define temporary file
# temp_file = './temp.png'

# # count the rows so we can break early
# row_count = 0

# # for pandas we iterate with iterrows
# # add tqdm for a progress bar
# from tqdm import tqdm
# for row in df_test.iterrows():
# # for row in df_test.iterrows():
    
#     # update counter
#     row_count += 1

#     # the data is in the second component of the row for pandas
#     row_data = row[1]
    
#     # print(row[1]['id'], row_data['sample_id'], row_data['sample_name'], 'https://pad.crc.nd.edu/' + row_data['processed_file_location'])
    
#     # get image location
#     image_url = pad_url + str(row_data['processed_file_location'])
    
#     # Download it to a temporary file
#     # if not pad_helper.pad_download(image_url, temp_file):
#     #     print("Error downloading image", image_url)
#     #     break
#     download_file(image_url, temp_file, './')
    
#     # fix truncated images
    
        
#     # analyze it
#     pls_concentration = pls_conc.quantity(temp_file, row_data['sample_name'])
    
#     # print results
#     print("PAD",  row[1]['id'], row_data['sample_id'], row_data['sample_name'], "PLS concentration", \
#         "{:.1f}".format(pls_concentration), "%, actual", row_data['quantity'])
    
#     results_pls.loc[row_count-1, 'pred_quantity']  = pls_concentration
    
#     # break once we have out 10 data points
#     # if row_count >= 10:
#     #     break

In [13]:
import pandas as pd
from tqdm import tqdm

# Assume df_test is already defined as a pandas dataframe
# Assume download_file and pls_conc.quantity are predefined functions

# Define temporary file and output file for intermediate values
temp_file = './temp.png'
intermediate_file = '../data/pls_intermediate_results.csv'

# csv.writer quotes fields where needed, so a sample name containing a comma
# or a quote cannot corrupt the intermediate file.
import csv

# Open the intermediate file to write intermediate results
# (newline='' is what the csv module asks for on files handed to csv.writer)
with open(intermediate_file, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')

    # Write header row for the intermediate CSV
    writer.writerow(['count', 'index', 'id', 'sample_id', 'sample_name', 'pred_quantity', 'actual_quantity'])

    # Loop over rows with progress bar; row_count is the 1-based number of the
    # row being processed
    for row_count, (index, row_data) in enumerate(tqdm(df_test.iterrows(), total=len(df_test)), start=1):
        # Get image location
        image_url = pad_url + str(row_data['processed_file_location'])

        # Download it to a temporary file
        download_file(image_url, temp_file, './')

        # Analyze it to get predicted concentration
        pls_concentration = pls_conc.quantity(temp_file, row_data['sample_name'])

        # Save intermediate values to the file ('count' is written 0-based)
        writer.writerow([
            row_count - 1,
            index,
            row_data['id'],
            row_data['sample_id'],
            row_data['sample_name'],
            pls_concentration,
            row_data['quantity'],
        ])

        # Flush to ensure the data is saved to the file
        f.flush()

        # # Break once we have enough data points (if required)
        # if row_count >= 3:
        #     break



 22%|██▏       | 441/2000 [04:05<14:25,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 442/2000 [04:06<14:33,  1.78it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 443/2000 [04:06<14:31,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 444/2000 [04:07<14:31,  1.78it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 445/2000 [04:07<14:25,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 446/2000 [04:08<14:27,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 447/2000 [04:09<14:25,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 448/2000 [04:09<14:28,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▏       | 449/2000 [04:10<14:27,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 22%|██▎       | 450/2000 [04:10<14:30,  1.78it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 451/2000 [04:11<14:26,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 452/2000 [04:11<14:19,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 453/2000 [04:12<14:24,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 454/2000 [04:13<14:27,  1.78it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 455/2000 [04:13<14:21,  1.79it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 456/2000 [04:14<14:20,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 457/2000 [04:14<14:18,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 23%|██▎       | 458/2000 [04:15<14:17,  1.80it/s]

lactose --- NOT IN COEFFICIENTS FILE ---


 34%|███▎      | 670/2000 [06:14<12:11,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


 39%|███▉      | 776/2000 [07:12<11:07,  1.83it/s]libpng error: Read Error


Converting img..  ./temp.png


 39%|███▉      | 783/2000 [07:16<12:04,  1.68it/s]libpng error: Read Error


Converting img..  ./temp.png


 39%|███▉      | 786/2000 [07:18<11:35,  1.75it/s]libpng error: Read Error


Converting img..  ./temp.png


 40%|███▉      | 790/2000 [07:20<11:10,  1.81it/s]libpng error: Read Error


Converting img..  ./temp.png


 59%|█████▉    | 1177/2000 [10:59<07:31,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


 59%|█████▉    | 1189/2000 [11:06<07:21,  1.84it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1278/2000 [11:55<06:47,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1280/2000 [11:56<06:45,  1.78it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1282/2000 [11:57<06:47,  1.76it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1285/2000 [11:58<06:35,  1.81it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1288/2000 [12:00<06:34,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1289/2000 [12:01<06:38,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 64%|██████▍   | 1290/2000 [12:01<06:40,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 65%|██████▍   | 1294/2000 [12:03<06:27,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


 65%|██████▍   | 1298/2000 [12:06<06:50,  1.71it/s]libpng error: Read Error


Converting img..  ./temp.png


 65%|██████▌   | 1303/2000 [12:09<06:26,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 65%|██████▌   | 1304/2000 [12:09<06:29,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 67%|██████▋   | 1336/2000 [12:27<06:11,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 67%|██████▋   | 1337/2000 [12:28<06:13,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 67%|██████▋   | 1345/2000 [12:32<06:03,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 68%|██████▊   | 1350/2000 [12:35<06:05,  1.78it/s]libpng error: Read Error


Converting img..  ./temp.png


 68%|██████▊   | 1360/2000 [12:41<05:58,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 68%|██████▊   | 1362/2000 [12:42<05:56,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 69%|██████▉   | 1381/2000 [12:52<05:42,  1.81it/s]libpng error: Read Error


Converting img..  ./temp.png


 72%|███████▏  | 1449/2000 [13:30<05:04,  1.81it/s]libpng error: Read Error


Converting img..  ./temp.png


 73%|███████▎  | 1454/2000 [13:33<05:07,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 73%|███████▎  | 1459/2000 [13:35<05:05,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 73%|███████▎  | 1462/2000 [13:37<05:03,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 73%|███████▎  | 1463/2000 [13:38<05:05,  1.76it/s]libpng error: Read Error


Converting img..  ./temp.png


 74%|███████▎  | 1474/2000 [13:44<04:51,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 74%|███████▍  | 1477/2000 [13:46<04:52,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 74%|███████▍  | 1479/2000 [13:47<04:53,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 75%|███████▍  | 1491/2000 [13:53<04:42,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 77%|███████▋  | 1536/2000 [14:18<04:15,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


 78%|███████▊  | 1557/2000 [14:30<04:07,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 80%|████████  | 1605/2000 [14:57<03:40,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 81%|████████  | 1611/2000 [15:00<03:36,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 81%|████████  | 1615/2000 [15:02<03:38,  1.76it/s]libpng error: Read Error


Converting img..  ./temp.png


 81%|████████  | 1616/2000 [15:03<03:44,  1.71it/s]libpng error: Read Error


Converting img..  ./temp.png


 82%|████████▏ | 1630/2000 [15:11<03:27,  1.78it/s]libpng error: Read Error


Converting img..  ./temp.png


 83%|████████▎ | 1655/2000 [15:25<03:14,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 88%|████████▊ | 1769/2000 [16:29<02:08,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 89%|████████▉ | 1782/2000 [16:36<02:01,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 92%|█████████▏| 1840/2000 [17:09<01:27,  1.83it/s]libpng error: Read Error


Converting img..  ./temp.png


 93%|█████████▎| 1852/2000 [17:15<01:21,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


 93%|█████████▎| 1867/2000 [17:24<01:14,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 94%|█████████▍| 1879/2000 [17:30<01:07,  1.80it/s]libpng error: Read Error


Converting img..  ./temp.png


 94%|█████████▍| 1884/2000 [17:33<01:05,  1.78it/s]libpng error: Read Error


Converting img..  ./temp.png


 97%|█████████▋| 1934/2000 [18:01<00:36,  1.79it/s]libpng error: Read Error


Converting img..  ./temp.png


 97%|█████████▋| 1936/2000 [18:02<00:36,  1.77it/s]libpng error: Read Error


Converting img..  ./temp.png


 98%|█████████▊| 1954/2000 [18:13<00:25,  1.81it/s]libpng error: Read Error


Converting img..  ./temp.png


 99%|█████████▊| 1972/2000 [18:23<00:15,  1.82it/s]libpng error: Read Error


Converting img..  ./temp.png


100%|█████████▉| 1992/2000 [18:34<00:04,  1.78it/s]libpng error: Read Error


Converting img..  ./temp.png


100%|██████████| 2000/2000 [18:38<00:00,  1.79it/s]


In [14]:

# Load the per-image predictions from the intermediate file, keyed by 'id'
intermediate_df = pd.read_csv(intermediate_file).set_index('id')

# Key results_pls by 'id' as well so that update() aligns the two frames on it
results_pls = results_pls.set_index('id')

# Copy over only the 'pred_quantity' column for the ids present in both
results_pls.update(intermediate_df[['pred_quantity']])

# Turn 'id' back into an ordinary column
results_pls = results_pls.reset_index()

# Save the final results to a CSV file
results_pls.to_csv('../data/final-results__gt-api__pls-quantity.csv', index=False)

In [12]:
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd

# Reload the saved final results from disk
results_pls = pd.read_csv('../data/final-results__gt-api__pls-quantity.csv')

# Exclude 'lactose' from the RMSE calculation: the prediction run reports it as
# missing from the coefficients file, so its pred_quantity is the -1 placeholder
# rather than a real prediction.
filtered_results_pls = results_pls[results_pls['actual_class'] != 'lactose']

# Calculate the RMSE between actual_quantity and pred_quantity for the filtered data
# NOTE(review): astype(int) truncates the predictions towards zero before the
# error is computed -- confirm this is intended rather than rounding or using
# the raw float predictions.
rmse_filtered = np.sqrt(mean_squared_error(filtered_results_pls['actual_quantity'].astype(int), filtered_results_pls['pred_quantity'].astype(int)))

# Print the RMSE for the filtered data
print(f"RMSE (excluding 'lactose'): {rmse_filtered}")

RMSE (excluding 'lactose'): 18.096877431059568


In [20]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Calculate the RMSE between actual_quantity and pred_quantity over all rows
# NOTE(review): this includes the 'lactose' rows, whose pred_quantity is the -1
# placeholder returned when a drug has no coefficients, and astype(int)
# truncates the predictions towards zero -- confirm both are intended.
rmse = np.sqrt(mean_squared_error(results_pls['actual_quantity'].astype(int), results_pls['pred_quantity'].astype(int)))

# Print the RMSE
print(f"RMSE: {rmse}")

RMSE: 20.404864616066433


In [11]:
# Function to calculate RMSE
def calculate_rmse(group, pred_col='pred_quantity', actual_col='actual_quantity'):
    """Return the root-mean-square error between two columns of ``group``.

    Both columns are cast to int before the difference is taken.

    Args:
        group: DataFrame holding the actual and predicted values.
        pred_col: Name of the column with the predicted values.
        actual_col: Name of the column with the actual values.
    """
    residuals = group[actual_col].astype(int) - group[pred_col].astype(int)
    squared_error = residuals ** 2
    return np.sqrt(squared_error.mean())


In [28]:
# Grouping by 'actual_class' and applying the RMSE calculation
# (calculate_rmse receives the rows of one class and returns a single value)
# NOTE(review): the saved output shows a warning pointing at this line --
# presumably pandas' deprecation of apply() operating on the grouping column;
# confirm, and consider include_groups=False on pandas versions that support it.
rmse_by_class = results_pls.groupby('actual_class').apply(calculate_rmse)

# Display the RMSE for each actual_class
display(rmse_by_class)

  rmse_by_class = results_pls.groupby('actual_class').apply(calculate_rmse)


actual_class
albendazole                    26.987383
amoxicillin                    21.468952
ampicillin                     21.515612
azithromycin                   16.158185
benzyl-penicillin              25.205311
ceftriaxone                    15.350738
chloroquine                    16.584670
ciprofloxacin                  23.281069
doxycycline                    18.537720
epinephrine                    17.309887
ethambutol                     17.032209
ferrous-sulfate                17.331646
hydroxychloroquine             14.861458
isoniazid                      15.019094
lactose                       101.000000
promethazine-hydrochloride     18.335132
pyrazinamide                   17.776891
rifampicin                     19.451706
ripe                           15.096387
sulfamethoxazole               14.467238
tetracycline                   16.621427
dtype: float64

21