In [1]:
import pandas as pd

# Loading AlphaMissense data
##### Updated 06/03/2024
##### Selin Kubali

#### Goal:
Extract AlphaMissense scores for all exons in each given gene.

#### Required inputs
- MANE guide, found in *selected_genes/hcm/csv_files/MANE.GRCh38.v1.0.select_ensembl_genomic.csv.gz*.
- AlphaMissense TSV, found in *selected_genes/hcm/csv_files/AlphaMissense_hg38.tsv*.

#### Output
CSV files containing AlphaMissense scores for the exons of each gene. Found in *selected_genes/hcm/alpha_missensense_annotated*.

##### Paths

In [None]:
# Input file locations: the MANE annotation table and the AlphaMissense score table.
# Plain string literals — nothing is interpolated into these paths.
mane_path = '/mnt/project/selected_genes/hcm/csv_files/MANE.GRCh38.v1.0.select_ensembl_genomic.csv.gz'
alpha_missense_path = '/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.tsv'

In [2]:
# Number of positions by which every exon window is widened on each side:
# subtracted from the exon start and added to the exon end when the
# flanked coordinates are computed.
exon_flank_nt = 5

### List of gene symbols as input

In [3]:
# Gene symbols to process; each one is matched against the `gene_name`
# column of the MANE table and gets its own output CSV.
genes = [
    "ACTC1",
    "ACTN2",
    "ALPK3",
    "DES",
    "FLNC",
    "MYBPC3",
    "MYH6",
    "MYH7",
    "MYL2",
    "MYL3",
    "PLN",
    "PTPN11",
    "TNNI3",
    "TTR",
    "TNNT2",
    "TPM1",
]

### Load MANE transcript coordinates

In [None]:
# Load the MANE table and keep only the exon records of the selected genes.
df = pd.read_csv(mane_path)
exon_rows = (df['feature'] == 'exon') & (df['gene_name'].isin(genes))
# Select rows and columns in a single .loc call and take an explicit copy, so
# the column assignments below write to an independent frame instead of a
# slice of the full table (which is what raises SettingWithCopyWarning).
df = df.loc[exon_rows, ['seqname', 'start', 'end', 'gene_name']].copy()
# Widen every exon by exon_flank_nt positions on both sides.
df['exon_flank_start'] = df['start'] - exon_flank_nt
df['exon_flank_end'] = df['end'] + exon_flank_nt
# Region string of the form "<seqname>:<exon_flank_start>-<exon_flank_end>".
df['region'] = (
    df['seqname'] + ':'
    + df['exon_flank_start'].astype(str) + '-'
    + df['exon_flank_end'].astype(str)
)
df

In [14]:
# Columns of the AlphaMissense table, in file order, mapped to the dtype each
# one should be parsed as. The key order doubles as the column-name list below.
dtype_spec = {
"Chrom": str,
"Pos": int,
"Ref": str,
"Alt": str,
"genome": str,
"uniprot_id": str,
"transcript_id": str,
"protein_variant": str,
"am_pathogenicity": float,
"am_class": str
}
# Only these columns are loaded; the rest are named but skipped.
ai_score_columns = ["Chrom", "Pos", "Ref", "Alt", "am_pathogenicity"]
ai_scores = pd.read_csv(
    alpha_missense_path,
    sep="\t",
    # Skips the first four lines of the file (presumably comment/header
    # lines — TODO confirm against the file), since names are supplied here.
    skiprows=4,
    names=list(dtype_spec),
    usecols=ai_score_columns,
    # dtype_spec was previously defined but never handed to read_csv; pass the
    # entries for the loaded columns so the declared types are actually applied.
    dtype={column: dtype_spec[column] for column in ai_score_columns},
)

In [16]:
def in_ranges(pos, ranges):
    """Tell whether ``pos`` lies inside at least one closed interval.

    Args:
        pos: Position to test.
        ranges: Iterable of ``(start, end)`` pairs; both ends are inclusive.

    Returns:
        True if ``start <= pos <= end`` holds for any pair, otherwise False
        (including when ``ranges`` is empty).
    """
    return any(lower <= pos <= upper for lower, upper in ranges)

In [17]:
# Write one CSV per gene holding the AlphaMissense rows that fall inside any of
# that gene's flanked exon windows. Genes with no exon rows in `df` are skipped.
for gene in genes:
    df_gene = df.loc[df['gene_name'] == gene]
    if df_gene.empty:
        continue

    gene_parts = []
    # Match on chromosome as well as position: filtering on Pos alone would
    # also pull in variants from other chromosomes that happen to share the
    # same coordinates as this gene's exons.
    for chrom, chrom_exons in df_gene.groupby('seqname'):
        chrom_scores = ai_scores.loc[ai_scores['Chrom'] == chrom]
        in_exon = pd.Series(False, index=chrom_scores.index)
        for start, end in zip(chrom_exons['exon_flank_start'], chrom_exons['exon_flank_end']):
            # Series.between is inclusive on both ends, i.e. start <= Pos <= end.
            in_exon |= chrom_scores['Pos'].between(start, end)
        gene_parts.append(chrom_scores.loc[in_exon])

    # sort_index restores the original row order of ai_scores.
    ai_scores_gene = pd.concat(gene_parts).sort_index()
    ai_scores_gene.to_csv(gene+"_alphamissense.csv")

In [None]:
# Publish the per-gene CSVs written by the extraction loop to the project.
# Create the destination folder (-p: create parents as needed).
!dx mkdir -p "Cassa Lab Shared Project:/selected_genes/hcm/alpha_missensense_annotated"
# Make the destination folder the current dx folder.
# NOTE(review): the mkdir above names the project explicitly, while this path
# has no project prefix and so resolves in the currently selected project —
# confirm both refer to the same project.
!dx cd /selected_genes/hcm/alpha_missensense_annotated
# Upload every file in the local working directory matching *_alphamissense.csv.
!dx upload *_alphamissense.csv