In [1]:
import os

# Move the working directory two levels up so the package-style imports
# (``src.…``) and the relative ``./files/…`` paths used further down resolve.
# The path is relative, so re-running this cell moves up two more levels
# each time.
_PROJECT_ROOT = '../../'
os.chdir(_PROJECT_ROOT)

In [None]:
import pandas as pd
import torch
import src.metrics as metrics
from src.utils.fun_retrieval import pseudo_relevance_feedback
from src.utils.seed import set_seed
import csv
# Call the project's seeding helper with a fixed seed value.
set_seed(42)

# Place tensors on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ``best_runs`` maps a model name to the path prefix of its run directory;
# the evaluation loop below reads the .feather files and checkpoint from it.
from src.settings import best_runs

# Evaluate pseudo relevance feedback on top of every model's stored test
# predictions and append the metrics of each feedback iteration to a CSV file.

# Rows [start, stop) of the test targets / test predictions are evaluated.
start = 0
stop = 20000

# Settings forwarded unchanged to pseudo_relevance_feedback and recorded in
# every result row. The spelling of ``gramma`` matches the keyword name that
# helper is called with.
alpha = 1
beta = 0.1
gramma = 0.0
TopKSelections = [10, 15]
CosSim_Thresh = 0.00

# Number of feedback rounds applied after the baseline (iteration 0) scoring.
feedback_iterations = 10

# Results file. Its directory is created up front so that a missing folder
# fails (or is fixed) before any expensive computation instead of after it.
filename = './files/retrieval/pseudo_relevance_feedback.csv'
os.makedirs(os.path.dirname(filename), exist_ok=True)


def _metric_row(metric_collection, logits, targets, *, model, iteration,
                top_k, cos_thresh, psr):
    """Score one set of predictions and return the result as a CSV row.

    The collection is updated and computed but NOT reset; the caller decides
    when to reset it.

    Args:
        metric_collection: Object exposing ``update(batch)`` and
            ``compute(logits, targets)``; the latter must return a mapping of
            metric name to a value supporting ``.item()``.
        logits: Predictions to score.
        targets: Targets for the same rows as ``logits``.
        model: Model name stored in the ``model`` column.
        iteration: Feedback iteration stored in the ``iteration`` column.
        top_k: Value stored in the ``doc`` (as text) and ``TopKSelection``
            columns.
        cos_thresh: Value stored in the ``CosSim_Thresh`` column.
        psr: Text stored in the ``psr`` column.

    Returns:
        dict: The identifying columns followed by every metric multiplied by
        100 and rounded to one decimal place.
    """
    metric_collection.update({"logits": logits, "targets": targets})
    # Key order defines the CSV column order; keep it stable so rows appended
    # to an existing file line up with its header.
    row = {
        'model': model,
        'doc': f'{top_k}',
        'psr': psr,
        'iteration': iteration,
        'TopKSelection': top_k,
        'CosSim_Thresh': cos_thresh,
    }
    scores = metric_collection.compute(logits, targets)
    row.update({key: round(value.item() * 100, 1) for key, value in scores.items()})
    return row


for model, run_dir in best_runs.items():

    train_targets = pd.read_feather(run_dir + '/train_targets.feather')
    train_targets = torch.tensor(train_targets.values, dtype=torch.float32, device=device)
    print('load train_targets finish')

    test_targets = pd.read_feather(run_dir + '/test_targets.feather')
    test_targets = torch.tensor(test_targets.values[start:stop], dtype=torch.float32, device=device)
    print('load test_targets finish')

    # Retrieval pool handed to pseudo_relevance_feedback. Only the training
    # targets are used; further splits (validation, test) could be
    # concatenated here.
    retrieve = torch.cat([train_targets], dim=0)
    del train_targets
    print('merge finish')

    # Only the 'db' entry of the checkpoint is needed; it is used as the
    # threshold of the metric collection.
    loaded_tensor = torch.load(run_dir + '/best_model.pt', map_location='cpu', weights_only=True)
    db = loaded_tensor['db']
    del loaded_tensor
    print('load db finish')

    number_of_classes = retrieve.shape[1]

    metric_collection = metrics.MetricCollection(
        [
            metrics.AUC(number_of_classes=number_of_classes, average="micro"),
            metrics.AUC(number_of_classes=number_of_classes, average="macro"),
            metrics.F1Score(number_of_classes=number_of_classes, average="micro"),
            metrics.F1Score(number_of_classes=number_of_classes, average="macro"),
            metrics.ExactMatchRatio(number_of_classes=number_of_classes),
            metrics.Precision_K(k=8, number_of_classes=number_of_classes),
            metrics.Precision_K(k=15, number_of_classes=number_of_classes),
            metrics.PrecisionAtRecall(),
            metrics.MeanAveragePrecision(),
            metrics.Precision(number_of_classes=number_of_classes, average="micro"),
            metrics.Recall(number_of_classes=number_of_classes, average="micro"),
            metrics.FPR(number_of_classes=test_targets.shape[1]),
        ]
    )
    metric_collection.set_threshold(db)
    metric_collection.to(device=device)

    results = []
    row_info = {
        'model': model,
        'cos_thresh': CosSim_Thresh,
        'psr': f'{alpha}_{beta}_{gramma}',
    }

    for TopKSelection in TopKSelections:

        # Reloaded for every TopKSelection because the feedback loop below
        # overwrites predictions_test. The last two columns of the stored
        # frame are dropped (presumably non-logit columns -- TODO confirm).
        predictions_test = pd.read_feather(run_dir + '/predictions_test.feather').iloc[:, :-2]
        predictions_test = torch.tensor(predictions_test.values[start:stop], dtype=torch.float32, device=device)
        print('load logits finish')

        # Iteration 0: the model's own predictions, before any feedback.
        results.append(
            _metric_row(metric_collection, predictions_test, test_targets,
                        iteration=0, top_k=TopKSelection, **row_info)
        )
        metric_collection.reset()

        for i in range(1, feedback_iterations + 1):
            # Each round feeds the previous round's output back in.
            predictions_test = pseudo_relevance_feedback(
                retrieve,
                predictions_test,
                TopKSelection=TopKSelection,
                consine_threshold=CosSim_Thresh,
                alpha=alpha,
                beta=beta,
                gramma=gramma,
                chunk_size_b=20000,
            )
            results.append(
                _metric_row(metric_collection, predictions_test, test_targets,
                            iteration=i, top_k=TopKSelection, **row_info)
            )
            metric_collection.set_threshold(db)
            metric_collection.reset()

        # Append to the results file, writing the header only when the file
        # is being created.
        file_exists = os.path.exists(filename)
        mode = 'a' if file_exists else 'w'

        with open(filename, mode=mode, newline="") as file:
            writer = csv.DictWriter(file, fieldnames=list(results[0].keys()))
            if not file_exists:
                writer.writeheader()
            writer.writerows(results)

        print(f"Data {model}_{TopKSelection}_{beta} has been {'appended to' if file_exists else 'written to'} {filename}.")
        results = []

load train_targets finish
load test_targets finish
merge finish
load db finish


  self.threshold = torch.tensor(self.threshold).clone().to(device)


load logits finish


