In [1]:
from io import StringIO
import numpy as np
import pandas as pd
from subprocess import call
import re

# Loading AlphaMissense data
##### Updated 06/03/2024
##### Selin Kubali

#### Goal:
Extract AlphaMissense scores for all exons (plus a small flanking region) of each given gene.

#### Required inputs
- MANE guide, found in *selected_genes/hcm/csv_files/MANE.GRCh38.v1.0.select_ensembl_genomic.csv.gz*.
- AlphaMissense TSV, found in *selected_genes/hcm/csv_files/AlphaMissense_hg38.tsv*.

#### Output
CSV files containing AlphaMissense scores for the exons of each gene, one file per gene. Found in *selected_genes/hcm/alpha_missensense_annotated*.

In [2]:
# Number of nucleotides added on each side of every exon: subtracted from the
# exon start and added to the exon end when the query intervals are built.
exon_flank_nt = 5

### List of gene symbols as input

In [3]:
# Gene symbols to process; matched against the `gene_name` column of the MANE
# table, and used as the filename prefix of each per-gene output CSV.
genes = ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR", "TNNT2", "TPM1", "MYL2", "MYL3", "ACTC1", "JPH2", "FHOD3", "CSRP3", "TRIM63"]

### Load MANE transcript coordinates

In [6]:
# Load the MANE annotation and keep only the exon rows of the requested genes.
mane = pd.read_csv('/mnt/project/selected_genes/hcm/csv_files/MANE.GRCh38.v1.0.select_ensembl_genomic.csv.gz')
is_target_exon = (mane['feature'] == 'exon') & (mane['gene_name'].isin(genes))

# Build the result in a single chain with `.assign` so new columns are never
# written onto a slice of another frame (which raises SettingWithCopyWarning),
# and so the cell can be re-run without depending on a previously mutated `df`.
df = (
    mane.loc[is_target_exon, ['seqname', 'start', 'end', 'gene_name']]
    .assign(
        # Widen each exon by `exon_flank_nt` nucleotides on both sides.
        exon_flank_start=lambda d: d['start'] - exon_flank_nt,
        exon_flank_end=lambda d: d['end'] + exon_flank_nt,
    )
    .assign(
        # Region string in "<seqname>:<start>-<end>" form, e.g. chr1:26067331-26067635.
        region=lambda d: d['seqname'] + ':' + d['exon_flank_start'].astype(str)
        + '-' + d['exon_flank_end'].astype(str),
    )
)
df

Unnamed: 0,seqname,start,end,gene_name,exon_flank_start,exon_flank_end,region
21285,chr1,26067336,26067630,TRIM63,26067331,26067635,chr1:26067331-26067635
21288,chr1,26066268,26066440,TRIM63,26066263,26066445,chr1:26066263-26066445
21290,chr1,26061166,26061334,TRIM63,26061161,26061339,chr1:26061161-26061339
21292,chr1,26060266,26060361,TRIM63,26060261,26060366,chr1:26060261-26060366
21294,chr1,26058390,26058623,TRIM63,26058385,26058628,chr1:26058385-26058628
...,...,...,...,...,...,...,...
473557,chr20,44159618,44160407,JPH2,44159613,44160412,chr20:44159613-44160412
473559,chr20,44118505,44118623,JPH2,44118500,44118628,chr20:44118500-44118628
473561,chr20,44115665,44116386,JPH2,44115660,44116391,chr20:44115660-44116391
473563,chr20,44114782,44114876,JPH2,44114777,44114881,chr20:44114777-44114881


In [14]:
# Column dtypes for the AlphaMissense table, keyed by the names assigned below.
dtype_spec = {
"Chrom": str,
"Pos": int,
"Ref": str,
"Alt": str,
"genome": str,
"uniprot_id": str,
"transcript_id": str,
"protein_variant": str,
"am_pathogenicity": float,
"am_class": str
}
# All columns of the file, in file order, and the subset that is actually loaded.
am_columns = ["Chrom", "Pos", "Ref", "Alt", "genome", "uniprot_id", "transcript_id", "protein_variant", "am_pathogenicity", "am_class"]
am_usecols = ["Chrom", "Pos", "Ref", "Alt", "am_pathogenicity"]

# The first 4 lines are skipped and column names are supplied explicitly
# (presumably the skipped lines are the file's comment/header lines — confirm
# against the TSV if the AlphaMissense release changes).
# `dtype_spec` was previously defined but never passed to read_csv; apply it
# here, restricted to the loaded columns, so types are fixed rather than inferred.
ai_scores = pd.read_csv(
    '/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.tsv',
    sep="\t",
    skiprows=4,
    names=am_columns,
    usecols=am_usecols,
    dtype={col: dtype_spec[col] for col in am_usecols},
)

In [16]:
def in_ranges(pos, ranges):
    """Tell whether ``pos`` falls inside at least one interval.

    Parameters
    ----------
    pos : value compared against the interval bounds.
    ranges : iterable of ``(start, end)`` pairs; both bounds are inclusive.

    Returns
    -------
    bool
        ``True`` if ``start <= pos <= end`` holds for any pair, else ``False``
        (including when ``ranges`` is empty).
    """
    return any(lower <= pos <= upper for lower, upper in ranges)

In [17]:
# Write one CSV per gene containing the AlphaMissense rows that fall inside
# that gene's flank-extended exons.
for gene in genes:
    df_gene = df.loc[df['gene_name'] == gene]
    if df_gene.empty:
        # Gene not present in the MANE exon table: nothing to export.
        continue

    gene_hits = []
    # A genomic position is only meaningful together with its chromosome, so
    # restrict to the exon's chromosome BEFORE testing positions. Filtering on
    # `Pos` alone also pulls in variants from every other chromosome that
    # happen to share the same coordinates.
    # NOTE(review): assumes `Chrom` in the AlphaMissense table uses the same
    # naming as MANE `seqname` (e.g. "chr1") — confirm.
    for chrom, exons in df_gene.groupby('seqname'):
        chrom_scores = ai_scores[ai_scores['Chrom'] == chrom]
        pos = chrom_scores['Pos'].to_numpy()

        # Vectorised inclusive interval test, OR-ed across this gene's exons
        # (replaces a per-row Python callback over the whole table).
        in_exon = np.zeros(len(pos), dtype=bool)
        for start, end in zip(exons['exon_flank_start'], exons['exon_flank_end']):
            in_exon |= (pos >= start) & (pos <= end)

        gene_hits.append(chrom_scores[in_exon])

    # Restore the original row order of the AlphaMissense table.
    ai_scores_gene = pd.concat(gene_hits).sort_index()
    ai_scores_gene.to_csv(gene+"_alphamissense.csv")

In [18]:
# Create the destination folder on the platform (-p: create parents as needed,
# no error if it already exists).
!dx mkdir -p "Cassa Lab Shared Project:/selected_genes/hcm/alpha_missensense_annotated"
# Make that folder the current dx working directory so the upload lands there.
# NOTE(review): mkdir names the project explicitly, while cd/upload use the
# currently selected dx project — confirm they are the same project.
!dx cd /selected_genes/hcm/alpha_missensense_annotated
# Upload every per-gene CSV written to the local working directory by the loop above.
!dx upload *_alphamissense.csv

ID                          file-GkGJQPjJqBj55bYxFQxZKJQJ
Class                       file
Project                     project-GGy3Bb0JqBj7zfxY8v4by61X
Folder                      /selected_genes/hcm/alpha_missensense_annotated
Name                        ACTC1_alphamissense.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Thu May 30 19:49:19 2024
Created by                  skubali
 via the job                job-GkGFjv0JqBj8JGKXjqbJy8q4
Last modified               Thu May 30 19:49:20 2024
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-GkGJQQ0JqBj9GVy87J61FPz3
Class                       file
Project                     project-GGy3Bb0JqBj7zfxY8v4by61X
Folder                      /selected_genes/hcm/alph