In [1]:
import os
import pandas as pd
import numpy as np
from importlib import reload
import src.utils as utils
import glob

In [2]:
chroms_names = ['MT', 'X'] + [str(x) for x in range(1,23)]

GENOMIC_DESCRIPTION_COL = 'Genomic Description (GRCh37)'

In [3]:
chromosome_dir = 'split_vcf_chromosomes_csvs'

dfs = []

for c in chroms_names[1:]:
    print(c)
    df = pd.read_csv(os.path.join(chromosome_dir, f'chromosome_{c}_records.csv'), index_col=0, low_memory=False)
    df = df.map(utils.safe_decode).set_index('Genomic Description GRCh37')
    dfs.append(df.loc[:, ['FINAL_PATHOGENICITY', 'FINAL_PRED', 'CURATED_PATHOGENICITY', 'TOTAL_NUM_PREDS', 'NUM_PATH_PREDS']])


X
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [4]:
df = pd.concat(dfs)
df.head()

Unnamed: 0_level_0,FINAL_PATHOGENICITY,FINAL_PRED,CURATED_PATHOGENICITY,TOTAL_NUM_PREDS,NUM_PATH_PREDS
Genomic Description GRCh37,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X:9431334:T>C,Unknown significance,,,,
X:9431341:T>C,Unknown significance,,,,
X:9431342:G>GT,Unknown significance,,,,
X:9431375:G>A,Unknown significance,,,,
X:9431417:A>G,Unknown significance,,,,


In [5]:
df['PREDICTION_RATIO'] = df['NUM_PATH_PREDS'] / df['TOTAL_NUM_PREDS']
df['INTERNAL_STATE'] = 'VUS'
df.loc[df['TOTAL_NUM_PREDS'] >= 5, 'INTERNAL_STATE'] = 'D'
df.loc[(df['TOTAL_NUM_PREDS'] >= 5) & (df['PREDICTION_RATIO'] < 0.6), 'INTERNAL_STATE'] = 'B'
df.head()

Unnamed: 0_level_0,FINAL_PATHOGENICITY,FINAL_PRED,CURATED_PATHOGENICITY,TOTAL_NUM_PREDS,NUM_PATH_PREDS,PREDICTION_RATIO,INTERNAL_STATE
Genomic Description GRCh37,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
X:9431334:T>C,Unknown significance,,,,,,VUS
X:9431341:T>C,Unknown significance,,,,,,VUS
X:9431342:G>GT,Unknown significance,,,,,,VUS
X:9431375:G>A,Unknown significance,,,,,,VUS
X:9431417:A>G,Unknown significance,,,,,,VUS


In [6]:
df.FINAL_PATHOGENICITY.value_counts()

FINAL_PATHOGENICITY
Unknown significance    2404475
Benign                   227561
Likely benign             38817
Pathogenic                16689
Likely pathogenic          4727
Benign*                      92
Name: count, dtype: int64

In [7]:
df.loc[df.FINAL_PATHOGENICITY == 'Unknown significance', 'FINAL_PATHOGENICITY'] = 'VUS'
df.loc[df.FINAL_PATHOGENICITY == 'Pathogenic', 'FINAL_PATHOGENICITY'] = 'P'
df.loc[df.FINAL_PATHOGENICITY == 'Likely pathogenic', 'FINAL_PATHOGENICITY'] = 'LP'
df.loc[df.FINAL_PATHOGENICITY == 'Benign', 'FINAL_PATHOGENICITY'] = 'B'
df.loc[df.FINAL_PATHOGENICITY == 'Benign*', 'FINAL_PATHOGENICITY'] = 'B'
df.loc[df.FINAL_PATHOGENICITY == 'Likely benign', 'FINAL_PATHOGENICITY'] = 'LB'
df.head()

Unnamed: 0_level_0,FINAL_PATHOGENICITY,FINAL_PRED,CURATED_PATHOGENICITY,TOTAL_NUM_PREDS,NUM_PATH_PREDS,PREDICTION_RATIO,INTERNAL_STATE
Genomic Description GRCh37,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
X:9431334:T>C,VUS,,,,,,VUS
X:9431341:T>C,VUS,,,,,,VUS
X:9431342:G>GT,VUS,,,,,,VUS
X:9431375:G>A,VUS,,,,,,VUS
X:9431417:A>G,VUS,,,,,,VUS


In [8]:
df.FINAL_PATHOGENICITY.value_counts()

FINAL_PATHOGENICITY
VUS    2404475
B       227653
LB       38817
P        16689
LP        4727
Name: count, dtype: int64

In [9]:
df.INTERNAL_STATE.value_counts()

INTERNAL_STATE
VUS    2552823
D        91470
B        48068
Name: count, dtype: int64

In [10]:
df['BVP_INTERNAL_STATE'] = df.INTERNAL_STATE.replace('D', 'P')
df.BVP_INTERNAL_STATE.value_counts()

BVP_INTERNAL_STATE
VUS    2552823
P        91470
B        48068
Name: count, dtype: int64

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score

In [12]:
df = df[df['FINAL_PATHOGENICITY'].isin(['B', 'P'])]
df.FINAL_PATHOGENICITY.value_counts()

FINAL_PATHOGENICITY
B    227653
P     16689
Name: count, dtype: int64

In [16]:
df.BVP_INTERNAL_STATE.value_counts()

BVP_INTERNAL_STATE
VUS    235848
P        6969
B        1525
Name: count, dtype: int64

In [19]:
df[['FINAL_PATHOGENICITY', 'BVP_INTERNAL_STATE']].value_counts()

FINAL_PATHOGENICITY  BVP_INTERNAL_STATE
B                    VUS                   225795
P                    VUS                    10053
                     P                       6186
B                    B                       1075
                     P                        783
P                    B                        450
Name: count, dtype: int64

In [22]:
print(f"Azaiez accuracy: {100* accuracy_score(y_true=df['FINAL_PATHOGENICITY'], y_pred=df['BVP_INTERNAL_STATE']):.5f}%")
print(f"Azaiez balanced accuracy: {100*balanced_accuracy_score(y_true=df['FINAL_PATHOGENICITY'], y_pred=df['BVP_INTERNAL_STATE']):.5f}%")
print(f"Azaiez precision_recall_fscore_support: {precision_recall_fscore_support(y_true=df['FINAL_PATHOGENICITY'], y_pred=df['BVP_INTERNAL_STATE'])}")

Azaiez accuracy: 2.97165%




Azaiez balanced accuracy: 18.76927%
Azaiez precision_recall_fscore_support: (array([0.70491803, 0.88764529, 0.        ]), array([0.0047221 , 0.37066331, 0.        ]), array([0.00938135, 0.52295207, 0.        ]), array([227653,  16689,      0]))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
df.BVP_INTERNAL_STATE

Genomic Description GRCh37
X:9431764:C>G          VUS
X:9431784:A>T          VUS
X:9431786:T>C          VUS
X:9431932:G>GAAAAAC    VUS
X:9431950:CAA>C        VUS
                      ... 
22:38379790:A>C          P
22:38379818:C>T        VUS
22:38379877:T>A        VUS
22:38379999:T>A        VUS
22:38380306:A>C        VUS
Name: BVP_INTERNAL_STATE, Length: 244342, dtype: object