In [25]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import scipy.stats


def PCC_score(y_pred,y_true):
    """Return the Pearson correlation coefficient of two equally sized samples.

    The formula is symmetric, so the order of the two arguments does not
    change the result.

    Parameters
    ----------
    y_pred, y_true : array-like of numbers (numpy arrays or pandas Series)

    Returns
    -------
    float
        Sum of the products of the centred values divided by the square root
        of the product of their sums of squares.
    """
    centred_pred = y_pred - np.mean(y_pred)
    centred_true = y_true - np.mean(y_true)
    cross_sum = np.sum(centred_pred * centred_true)
    scale = np.sqrt(np.sum(centred_pred ** 2) * np.sum(centred_true ** 2))
    return cross_sum / scale

# Folder whose files are read as prediction CSVs (one file per allele).
prediction_dir = "prediction"
# Folder the score tables are written to.
results_score_dir = "results_score"

# Calculation methods for each regression evaluation index

# MAE = mean_absolute_error(y_true, y_pred)
# MSE = mean_squared_error(y_true, y_pred)
# RMSE = np.sqrt(MSE)
# r2 = r2_score(y_true, y_pred)
# PCC = PCC_score(y_true, y_pred)
# scipy.stats.kendalltau returns (correlation, p-value); element [0] is the Kendall correlation coefficient
# tau = scipy.stats.kendalltau(y_true, y_pred)[0]

In [26]:
def calculate_allele_score():
    """Score every tool on every allele file and write ``results_score.csv``.

    Every regular file in ``prediction_dir`` is read as a CSV; the allele name
    is the file name without its extension. Files that have no column whose
    name contains ``'VRAPERNet'`` are skipped. For each column whose name
    contains ``'_BAV_Normalized'`` the metrics PCC, r2, Kendall tau, MAE, MSE
    and RMSE are computed against the ``'Normalized_QM'`` column, using only
    the rows where both values are finite. The tool name is the part of the
    column name before the first underscore.

    One row per (allele, tool) is written to
    ``<results_score_dir>/results_score.csv``. A tool without a single finite
    (truth, prediction) pair for an allele gets no row.
    """
    result_columns = ['allele','allele_size', 'tool', 'PCC', 'r2', 'tau', 'MAE', 'MSE', 'RMSE']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []

    for file in sorted(os.listdir(prediction_dir)):
        file_path = os.path.join(prediction_dir, file)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be parsed as CSV.
        if not os.path.isfile(file_path):
            continue

        allele = Path(file).stem
        df_allele_prediction = pd.read_csv(file_path)

        if not df_allele_prediction.columns.str.contains('VRAPERNet').any():
            continue

        # Kept untouched for the whole file: every tool must be compared
        # against the full ground truth, not against what survived the
        # filtering done for a previous tool.
        true_values = df_allele_prediction['Normalized_QM'].to_numpy(dtype=float)

        tool_columns = [column for column in df_allele_prediction.columns if '_BAV_Normalized' in column]
        for tool_score in tool_columns:
            pred_values = df_allele_prediction[tool_score].to_numpy(dtype=float)

            # One positional mask, built from the unfiltered data, removes
            # NaN and infinite values from both vectors consistently.
            valid = np.isfinite(true_values) & np.isfinite(pred_values)
            if not valid.any():
                # No usable pair: the sklearn metrics would raise on empty input.
                continue
            y_true = true_values[valid]
            y_pred = pred_values[valid]

            MAE = mean_absolute_error(y_true, y_pred)
            MSE = mean_squared_error(y_true, y_pred)
            RMSE = np.sqrt(MSE)
            r2 = r2_score(y_true, y_pred)
            PCC = PCC_score(y_true, y_pred)
            # kendalltau returns (correlation, p-value); keep the correlation.
            tau = scipy.stats.kendalltau(y_true, y_pred)[0]

            tool_name = tool_score.split('_')[0]
            records.append({'allele': allele, 'allele_size': len(df_allele_prediction), 'tool': tool_name, 'PCC': PCC, 'r2': r2, 'tau': tau, 'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE})

    df_results_score = pd.DataFrame(records, columns=result_columns)
    # Make sure the output folder exists before writing into it.
    os.makedirs(results_score_dir, exist_ok=True)
    df_results_score.to_csv(os.path.join(results_score_dir, 'results_score.csv'), index=False)


In [27]:
def calculate_total_score():
    """Score every tool on the pooled predictions of all alleles.

    Reads ``<results_score_dir>/results_score.csv``, pools every regular file
    of ``prediction_dir`` that has a column whose name contains
    ``'VRAPERNet'``, writes the pooled table to
    ``<results_score_dir>/total_prediction.csv`` and, for each column whose
    name contains ``'_BAV_Normalized'``, adds one ``'Total allele'`` row with
    PCC, r2, Kendall tau, MAE, MSE and RMSE against ``'Normalized_QM'`` to
    ``results_score.csv``.

    ``allele_size`` of a total row is the number of pooled rows where the
    tool's prediction is not NaN. ``'Total allele'`` rows already present in
    ``results_score.csv`` are replaced, so the function can be re-run without
    duplicating them. A tool without a single finite (truth, prediction) pair
    gets no row.
    """
    results_path = os.path.join(results_score_dir, 'results_score.csv')
    df_results_score = pd.read_csv(results_path)
    # Drop totals left by an earlier run so a re-run does not duplicate them.
    df_results_score = df_results_score[df_results_score['allele'] != 'Total allele']

    allele_frames = []
    for file in sorted(os.listdir(prediction_dir)):
        file_path = os.path.join(prediction_dir, file)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be parsed as CSV.
        if not os.path.isfile(file_path):
            continue

        df_allele_prediction = pd.read_csv(file_path)

        if not df_allele_prediction.columns.str.contains('VRAPERNet').any():
            continue

        allele_frames.append(df_allele_prediction)

    # Concatenate once instead of growing a frame inside the loop.
    # ignore_index gives the pooled rows unique labels.
    if allele_frames:
        df_total_prediction = pd.concat(allele_frames, ignore_index=True)
    else:
        df_total_prediction = pd.DataFrame()

    df_total_prediction.to_csv(os.path.join(results_score_dir, 'total_prediction.csv'), index=False)

    tool_columns = [column for column in df_total_prediction.columns if '_BAV_Normalized' in column]

    total_records = []
    for tool_score in tool_columns:
        df_tool_prediction = df_total_prediction[~pd.isna(df_total_prediction[tool_score])]

        true_values = df_tool_prediction['Normalized_QM'].to_numpy(dtype=float)
        pred_values = df_tool_prediction[tool_score].to_numpy(dtype=float)

        # One positional mask, built before anything is filtered, removes
        # NaN and infinite values from both vectors consistently.
        valid = np.isfinite(true_values) & np.isfinite(pred_values)
        if not valid.any():
            # No usable pair: the sklearn metrics would raise on empty input.
            continue
        y_true = true_values[valid]
        y_pred = pred_values[valid]

        MAE = mean_absolute_error(y_true, y_pred)
        MSE = mean_squared_error(y_true, y_pred)
        RMSE = np.sqrt(MSE)
        r2 = r2_score(y_true, y_pred)
        PCC = PCC_score(y_true, y_pred)
        # kendalltau returns (correlation, p-value); keep the correlation.
        tau = scipy.stats.kendalltau(y_true, y_pred)[0]

        tool_name = tool_score.split('_')[0]

        total_records.append({'allele': 'Total allele', 'allele_size': len(df_tool_prediction), 'tool': tool_name, 'PCC': PCC, 'r2': r2, 'tau': tau, 'MAE': MAE, 'MSE': MSE, 'RMSE': RMSE})

    # DataFrame.append no longer exists in pandas >= 2.0; add all rows at once.
    if total_records:
        df_results_score = pd.concat([df_results_score, pd.DataFrame(total_records)], ignore_index=True)
    df_results_score.to_csv(results_path, index=False)


In [28]:
# Order matters: calculate_total_score() reads the results_score.csv that
# calculate_allele_score() writes and adds the 'Total allele' rows to it.
calculate_allele_score()
calculate_total_score()