In [14]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import scipy.stats


def PCC_score(y_pred,y_true):
    """Return the Pearson correlation coefficient (PCC) of two samples.

    Both inputs are centred on their own mean; the result is the sum of the
    products of the centred values divided by the square root of the product
    of their sums of squares. The formula is symmetric, so swapping the two
    arguments gives the same value.

    Parameters
    ----------
    y_pred, y_true : array-like supporting element-wise arithmetic
        (e.g. ``numpy.ndarray`` or ``pandas.Series``) of equal length.

    Returns
    -------
    float
        The correlation coefficient. If either input has zero variance the
        denominator is 0 and the result is ``nan``.
    """
    centered_pred = y_pred - np.mean(y_pred)
    centered_true = y_true - np.mean(y_true)

    cross_product_sum = np.sum(centered_pred * centered_true)
    pred_sum_sq = np.sum(centered_pred ** 2)
    true_sum_sq = np.sum(centered_true ** 2)

    return cross_product_sum / np.sqrt(pred_sum_sq * true_sum_sq)

# Root folder scanned by calculate_allele_score(): every entry inside it is
# listed as a sub-directory, and every file in those sub-directories is read
# as a CSV of predictions. The path is relative to the working directory.
prediction_dir = 'prediction'

# How each regression evaluation metric is computed

# MAE = mean_absolute_error(y_true, y_pred)
# MSE = mean_squared_error(y_true, y_pred)
# RMSE = np.sqrt(MSE)
# r2 = r2_score(y_true, y_pred)
# PCC = PCC_score(y_true, y_pred)
# The first element of tau is the Kendall correlation coefficient
# tau = scipy.stats.kendalltau(y_true, y_pred)[0]

In [15]:
def calculate_allele_score():
    """Score every per-allele prediction file and write a summary CSV.

    Walks ``prediction_dir``: the name of each sub-directory is recorded as
    ``PP_diversity`` and each file inside it is read with ``pandas.read_csv``.
    The part of the file name before the first ``.`` is recorded as ``allele``.

    For each file the ``Normalized_QM`` column is used as ``y_true`` and the
    ``VRAPERNet_BAV_Normalized`` column as ``y_pred``. Rows in which either
    value is NaN or infinite are left out of the metrics, while
    ``allele_size`` still counts every row of the file.

    Side effects
    ------------
    Writes ``results_score.csv`` (one row per prediction file, columns
    ``allele, allele_size, PP_diversity, PCC, r2, tau, RMSE, MAE, MSE``) to
    the current working directory.

    Returns
    -------
    None
    """
    columns = ['allele', 'allele_size', 'PP_diversity', 'PCC', 'r2', 'tau', 'RMSE', 'MAE', 'MSE']
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the output CSV reproducible between runs and machines.
    for PP_plan in sorted(os.listdir(prediction_dir)):
        plan_dir = os.path.join(prediction_dir, PP_plan)
        for file in sorted(os.listdir(plan_dir)):
            allele = file.split('.')[0]
            df_allele_prediction = pd.read_csv(os.path.join(plan_dir, file))
            y_true = df_allele_prediction['Normalized_QM']
            y_pred = df_allele_prediction['VRAPERNet_BAV_Normalized']

            # Build the validity mask once from the *unfiltered* columns so the
            # same rows are dropped from both series. np.isfinite is False for
            # NaN as well as +/-inf, so no separate dropna() is needed.
            finite_mask = np.isfinite(y_true) & np.isfinite(y_pred)
            y_true = y_true[finite_mask]
            y_pred = y_pred[finite_mask]

            MAE = mean_absolute_error(y_true, y_pred)
            MSE = mean_squared_error(y_true, y_pred)
            RMSE = np.sqrt(MSE)
            r2 = r2_score(y_true, y_pred)
            PCC = PCC_score(y_true, y_pred)
            # kendalltau returns (statistic, p-value); keep the statistic only.
            tau = scipy.stats.kendalltau(y_true, y_pred)[0]

            records.append({
                'allele': allele,
                'allele_size': len(df_allele_prediction),
                'PP_diversity': PP_plan,
                'PCC': PCC,
                'r2': r2,
                'tau': tau,
                'MAE': MAE,
                'MSE': MSE,
                'RMSE': RMSE,
            })

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call. Passing `columns` keeps the original column order and still
    # yields a header-only CSV when no prediction files are found.
    df_results_score = pd.DataFrame(records, columns=columns)
    df_results_score.to_csv('results_score.csv', index=False)


In [16]:
# Run the evaluation over everything under `prediction_dir`; the function
# returns nothing and writes its table to results_score.csv.
calculate_allele_score()