In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
# Load the tab-separated allele-level clinical annotation table.
main = pd.read_csv(
    filepath_or_buffer=r'C:/Users/GenepoweRx_Madhu/Downloads/clinicalAnnotations/clinical_ann_alleles.tsv',
    sep='\t',
)
main

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,
...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function


In [18]:
# Load the tab-separated evidence table, discard evidence rows that carry no
# PMID, and store the remaining PMIDs as 64-bit integers.
map_1 = (
    pd.read_csv(
        r'C:/Users/GenepoweRx_Madhu/Downloads/clinicalAnnotations/clinical_ann_evidence.tsv',
        sep='\t',
    )
    .dropna(subset=['PMID'])
    .astype({'PMID': 'int64'})
)
map_1

Unnamed: 0,Clinical Annotation ID,Evidence ID,Evidence Type,Evidence URL,PMID,Summary,Score
2,981755803,981755665,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/981...,21083385,Genotypes AA + AG are associated with response...,0.25
3,981755803,981755678,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/981...,22047557,Genotypes AA + AG are associated with response...,2
4,981755803,982009991,Variant Drug Annotation,https://www.pharmgkb.org/variantAnnotation/982...,23590265,Allele A is associated with response to ivacaf...,2.25
5,981755803,1043737620,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/104...,23757361,Allele A is associated with increased activity...,0
6,981755803,1043737636,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/104...,23891399,Allele A is associated with activity of CFTR w...,0
...,...,...,...,...,...,...,...
15025,1449000354,1446899430,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/144...,24647041,CYP2D6 *53 is associated with increased cleara...,0
15026,1449000354,1447959334,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/144...,24647041,CYP2D6 *10 + *14 + *17 + *18 + *29 + *30 + *35...,0
15027,1449000354,1448616567,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/144...,24647041,CYP2D6 *22 + *23 + *24 + *27 + *33 + *49 are n...,0
15028,1449000354,1448616608,Variant Functional Assay Annotation,https://www.pharmgkb.org/variantAnnotation/144...,24647041,CYP2D6 *39 + *48 are associated with decreased...,0


In [19]:
def _join_unique_pmids(pmids):
    """Return the distinct non-null values of `pmids` as one ', '-separated string.

    Values are converted to strings and kept in order of first appearance.
    """
    distinct = pmids.dropna().astype(str).unique()
    return ', '.join(distinct)


# Collapse the evidence rows to a single row per clinical annotation whose
# 'PMID' value is the comma-separated list of that annotation's PMIDs.
map_1 = (
    map_1
    .groupby('Clinical Annotation ID')['PMID']
    .agg(_join_unique_pmids)
    .reset_index()
)
map_1

Unnamed: 0,Clinical Annotation ID,PMID
0,613976757,"23252947, 19620853, 22188362, 15692831, 254954..."
1,613976848,16538176
2,613977037,16538175
3,613977064,15790597
4,613978931,"26555147, 26314341, 22292851, 22591328, 221883..."
...,...,...
5068,1452050380,"16710319, 22612784, 16874005"
5069,1452050405,"27445478, 33548906, 22480177, 19937159"
5070,1452050600,"24192302, 19937159, 19590397"
5071,1452050620,"31066578, 24192302, 19937159"


In [28]:
# Load the tab-separated annotation history table.
map_2 = pd.read_csv(
    filepath_or_buffer=r'C:/Users/GenepoweRx_Madhu/Downloads/clinicalAnnotations/clinical_ann_history.tsv',
    sep='\t',
)
map_2

Unnamed: 0,Clinical Annotation ID,Date (YYYY-MM-DD),Type,Comment
0,981755803,2018-03-28,Update,Added PMID 25145599 to evidence
1,981755803,2018-11-28,Update,Added PMID 23628510 to evidence
2,981755803,2020-11-04,Update,Updated text to match new format. Added guidel...
3,981755803,2021-02-02,Update,Attached CPIC guideline and FDA label for ivac...
4,981755803,2021-02-02,Update,Small edit to text to match template.
...,...,...,...,...
13311,1449000354,2017-10-16,Create,
13312,1449000354,2017-10-17,Update,
13313,1449000354,2018-11-14,Update,re-assigned *14A to *114 and *14B to *14 accor...
13314,1449000354,2021-03-24,Update,CA score added as part of scoring system relea...


In [29]:
# Parse the history dates into datetime values, then keep only the most recent
# history date for each 'Clinical Annotation ID'.
map_2 = (
    map_2
    .assign(**{'Date (YYYY-MM-DD)': lambda history: pd.to_datetime(history['Date (YYYY-MM-DD)'])})
    .groupby('Clinical Annotation ID', as_index=False)['Date (YYYY-MM-DD)']
    .max()
)
map_2

Unnamed: 0,Clinical Annotation ID,Date (YYYY-MM-DD)
0,613976757,2021-03-24
1,613976848,2021-03-24
2,613977037,2021-03-24
3,613977064,2021-03-24
4,613978931,2021-03-24
...,...,...
5068,1452050380,2023-03-27
5069,1452050405,2023-03-27
5070,1452050600,2023-03-27
5071,1452050620,2023-03-27


In [32]:
# Spot check: show the row(s) kept for clinical annotation 981755803.
map_2.loc[map_2['Clinical Annotation ID'].eq(981755803)]

Unnamed: 0,Clinical Annotation ID,Date (YYYY-MM-DD)
540,981755803,2021-03-24


In [33]:
# Load the tab-separated annotation-level summary table.
map_3 = pd.read_csv(
    filepath_or_buffer=r'C:/Users/GenepoweRx_Madhu/Downloads/clinicalAnnotations/clinical_annotations.tsv',
    sep='\t',
)
map_3

Unnamed: 0,Clinical Annotation ID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,rs75527207,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,1449311190,rs4149056,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
2,981204774,rs1799971,OPRM1,4,,,-2.000,Efficacy,2,3,Drugs used in nicotine dependence;nicotine,Tobacco Use Disorder,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
3,1449191690,rs141033578,CFTR,1A,,Rare Variant; Tier 1 VIP,200.000,Efficacy,1,3,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
4,1449191746,rs78769542,CFTR,1A,,Rare Variant; Tier 1 VIP,200.000,Efficacy,1,3,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5068,1452018886,rs2874116,,3,,,3.000,Efficacy,1,1,cyclosporine,,2023-02-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
5069,1451566760,rs1800497,ANKK1;DRD2,3,,Tier 1 VIP,1.500,Efficacy,1,1,bupropion;naltrexone,Obesity,2021-10-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
5070,1451567040,"CYP3A4*1, CYP3A4*2, CYP3A4*3, CYP3A4*4, CYP3A4...",CYP3A4,3,,Tier 1 VIP,2.500,Metabolism/PK,2,3,oxycodone,,2022-01-13,https://www.pharmgkb.org/clinicalAnnotation/14...,
5071,1184757183,"CYP2D6*1, CYP2D6*7, CYP2D6*10, CYP2D6*12, CYP2...",CYP2D6,3,,Tier 1 VIP,0.000,Other,14,43,bufuralol;dextromethorphan,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/11...,


In [20]:
# Attach the PMID column of `map_1` to every genotype/allele row of `main`.
# The left join keeps all rows of `main`; IDs absent from `map_1` get NaN.
# validate='many_to_one' makes pandas raise a MergeError when `map_1` holds more
# than one row per 'Clinical Annotation ID' (for example when the per-annotation
# PMID aggregation has not been run on it), instead of silently duplicating
# rows of `main`.
merge_1 = pd.merge(
    main,
    map_1,
    on='Clinical Annotation ID',
    how='left',
    sort=False,
    validate='many_to_one',
)
merge_1

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,PMID
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,"21083385, 22047557, 23590265, 23757361, 238913..."
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,"21083385, 22047557, 23590265, 23757361, 238913..."
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,"21083385, 22047557, 23590265, 23757361, 238913..."
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,29683944
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,29683944
...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,24647041
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,24647041
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,24647041
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,24647041


In [34]:
# Add the columns of `map_3` to each row of `merge_1`, matched on
# 'Clinical Annotation ID'. The left join keeps every row of `merge_1`.
# validate='many_to_one' makes pandas raise a MergeError if `map_3` contains a
# duplicated 'Clinical Annotation ID', which would otherwise silently multiply
# rows in the merged result.
merge_2 = pd.merge(
    merge_1,
    map_3,
    on='Clinical Annotation ID',
    how='left',
    sort=False,
    validate='many_to_one',
)
merge_2

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,PMID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,"21083385, 22047557, 23590265, 23757361, 238913...",rs75527207,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,"21083385, 22047557, 23590265, 23757361, 238913...",rs75527207,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,"21083385, 22047557, 23590265, 23757361, 238913...",rs75527207,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,29683944,rs4149056,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,29683944,rs4149056,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,24647041,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,24647041,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,24647041,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,24647041,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [36]:
# Write the merged table to an Excel workbook, omitting the DataFrame index.
merge_2.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/clinicalAnnotations/mapped_pmid.xlsx',
    index=False,
)