In [1]:
import sys
# Insert ../scripts into the module search path at position 1 (just after the
# first entry). Presumably this is what makes the local `crutil` helper
# importable in the next cell — TODO confirm.
sys.path.insert(1, '../scripts')

In [2]:
import numpy as np
import pandas as pd
from Bio import AlignIO
import crutil

## Loading list of dubious, uncharacterized, and verified ORFs

In [3]:
# The dubious-ORF table is tab-separated and has no header row, so the column
# names are supplied explicitly.
dubious_orfs_df = pd.read_csv(
    "../../data/sc_orfs/dubious_orfs.tsv",
    sep="\t",
    header=None,
    names=["DBID", "systematic_name", "organism", "standard_name", "gene_name"],
)

In [4]:
# Plain Python list of the systematic names, used below to filter regions by ORF.
dubious_orfs = dubious_orfs_df.loc[:, "systematic_name"].to_list()

In [5]:
# Number of entries loaded from the dubious-ORF table.
len(dubious_orfs)

688

## Removing dubious ORFs

In [6]:
# Load the raw charged-region table; `comment='#'` makes pandas ignore
# everything from a '#' to the end of the line (e.g. header comments).
df = pd.read_csv('../../data/charged_regions/cr_trimmed_raw.csv', comment='#')

In [7]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label,kappa
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified,0.539053
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.55914,P31376,verified,0.098538
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,P31376,verified,0.089231
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000714,0.428571,P31376,verified,0.265395
4,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified,0.219047


In [9]:
# Drop every region whose ORF appears in the dubious list. The result has to be
# assigned back: boolean indexing returns a new frame, so without the
# assignment `df` (and the CSV written from it afterwards) would still contain
# the dubious ORFs. The index is reset so that row labels stay equal to row
# positions for the filtering cells that follow.
df = df[~df['orf'].isin(dubious_orfs)].reset_index(drop=True)
df

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label,kappa
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified,0.539053
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.559140,P31376,verified,0.098538
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.500000,P31376,verified,0.089231
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000714,0.428571,P31376,verified,0.265395
4,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified,0.219047
...,...,...,...,...,...,...,...,...,...,...,...,...
1046,YPR163C,TIF3,437,382,430,DVLRTEDDDEDEEAEKQNGDAKENKVDAAVEKLQDKTAQLTVEDGDNWE,49,0.137959,0.510204,P34167,verified,0.328774
1047,YPR169W,JIP5,493,439,490,KEELLDELDKDLKEDHQEEKESNSKSVKKRKIMKENNKKKDLYEHG...,52,0.000583,0.596154,Q06214,verified,0.199050
1048,YPR179C,HDA3,656,513,573,ETLSKLKDAFVKTDNVQDEIEKEERVSVSRDTEKKYMEQEIKRAVD...,61,0.008197,0.508197,Q06623,verified,0.096199
1049,YPR186C,PZF1,430,373,409,RTFKTKEKYEKHIDKHKVHELKLKILQEKEENKTLVD,37,0.044226,0.513514,P39933,verified,0.050618


In [13]:
# Count distinct gene names; dropna=False keeps a missing value (if any)
# counted as its own entry, matching len(Series.unique()).
df['gene'].nunique(dropna=False)

804

In [None]:
# Persist the current state of `df`; index=False keeps the row index out of the CSV.
df.to_csv('../../data/charged_regions/cr_trimmed_filtered.csv', index=False)

## Extracting regions with a valid AYbRAH MSA (a readable alignment file with more than two sequences)

In [20]:
# One ORF name per line, no header row; read into a single 'orf' column.
all_orfs = pd.read_csv(
    '../../data/sc_orfs/yeast-all-orfs.txt',
    sep='\t',
    header=None,
    names=['orf'],
)

In [21]:
# Collect the row labels of regions whose ORF has no usable AYbRAH alignment:
# either the FASTA file cannot be opened/parsed, or it holds two or fewer
# sequences.
no_valid_msa = []
# NOTE(review): machine-specific absolute path; consider making it configurable.
fdir = '/mnt/d/research/drummond-lab/data/aybrah-all/'
for index, row in df.iterrows():
    orf = row['orf']
    try:
        # Use a context manager so the file handle is closed after parsing.
        with open(fdir + str(orf) + '-aybrah.fa') as handle:
            msa = AlignIO.read(handle, "fasta")
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: AlignIO.read found
        # no alignment, more than one, or malformed records. Anything else
        # (e.g. KeyboardInterrupt, a typo) is deliberately allowed to surface.
        no_valid_msa.append(index)
        continue
    if len(msa) <= 2:
        no_valid_msa.append(index)

In [22]:
# Number of regions flagged as lacking a usable MSA.
len(no_valid_msa)

64

In [23]:
# `no_valid_msa` holds row labels gathered from iterrows(); drop those rows by
# label and renumber the remaining rows from 0.
df = df.drop(index=no_valid_msa).reset_index(drop=True)

In [24]:
# Show the table after removing regions without a usable MSA.
df

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,kappa1,kappa2,uni_id,orf_label
0,YAL011W,SWC3,626,0,54,MPAVLRTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDD...,55,0.015674,0.527273,0.297167,0.616928,P31376,verified
1,YAL011W,SWC3,626,169,265,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,97,0.050753,0.536082,0.051716,0.110243,P31376,verified
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.500000,0.044964,0.096912,P31376,verified
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000000,0.428571,0.114622,0.300249,P31376,verified
4,YAL013W,DEP1,406,81,163,TLTESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAI...,83,0.141633,0.542169,0.093375,0.239955,P31385,verified
...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,YPR163C,TIF3,437,379,433,SVYDVLRTEDDDEDEEAEKQNGDAKENKVDAAVEKLQDKTAQLTVE...,55,0.122909,0.454545,0.115449,0.366767,P34167,verified
948,YPR169W,JIP5,493,434,491,LVGLSKEELLDELDKDLKEDHQEEKESNSKSVKKRKIMKENNKKKD...,58,0.000556,0.534483,0.111450,0.219531,Q06214,verified
949,YPR179C,HDA3,656,512,577,GETLSKLKDAFVKTDNVQDEIEKEERVSVSRDTEKKYMEQEIKRAV...,66,0.012219,0.469697,0.045506,0.106750,Q06623,verified
950,YPR186C,PZF1,430,370,411,NCSRTFKTKEKYEKHIDKHKVHELKLKILQEKEENKTLVDQN,42,0.011278,0.452381,0.031155,0.081428,P39933,verified


## Removing regions with the wrong SC sequence

In [25]:
# Flag regions whose sequence does not occur in any (gap-stripped) record of
# the ORF's AYbRAH alignment.
# NOTE(review): the check accepts a match in *any* record of the MSA, not
# specifically the S. cerevisiae one — confirm this is the intended test.
# NOTE(review): machine-specific absolute path; consider making it configurable.
aybrah_path = '/mnt/d/research/drummond-lab/data/aybrah-all/'
wrong_seq = []
for index, row in df.iterrows():
    orf = row['orf']
    # The context manager closes the file handle once the alignment is parsed.
    with open(aybrah_path + str(orf) + '-aybrah.fa') as handle:
        msa = AlignIO.read(handle, "fasta")
    region = row['region.seq']
    # any() stops at the first record containing the region instead of
    # scanning the remaining records needlessly.
    if not any(region in crutil.remove_gaps(record.seq) for record in msa):
        wrong_seq.append(index)

In [26]:
# `wrong_seq` holds row labels gathered from iterrows(); drop those rows by
# label and renumber the remaining rows from 0.
df = df.drop(index=wrong_seq).reset_index(drop=True)

In [27]:
# Show the table after removing regions not found in their alignment.
df

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,kappa1,kappa2,uni_id,orf_label
0,YAL011W,SWC3,626,0,54,MPAVLRTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDD...,55,0.015674,0.527273,0.297167,0.616928,P31376,verified
1,YAL011W,SWC3,626,169,265,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,97,0.050753,0.536082,0.051716,0.110243,P31376,verified
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.500000,0.044964,0.096912,P31376,verified
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000000,0.428571,0.114622,0.300249,P31376,verified
4,YAL013W,DEP1,406,81,163,TLTESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAI...,83,0.141633,0.542169,0.093375,0.239955,P31385,verified
...,...,...,...,...,...,...,...,...,...,...,...,...,...
934,YPR163C,TIF3,437,379,433,SVYDVLRTEDDDEDEEAEKQNGDAKENKVDAAVEKLQDKTAQLTVE...,55,0.122909,0.454545,0.115449,0.366767,P34167,verified
935,YPR169W,JIP5,493,434,491,LVGLSKEELLDELDKDLKEDHQEEKESNSKSVKKRKIMKENNKKKD...,58,0.000556,0.534483,0.111450,0.219531,Q06214,verified
936,YPR179C,HDA3,656,512,577,GETLSKLKDAFVKTDNVQDEIEKEERVSVSRDTEKKYMEQEIKRAV...,66,0.012219,0.469697,0.045506,0.106750,Q06623,verified
937,YPR186C,PZF1,430,370,411,NCSRTFKTKEKYEKHIDKHKVHELKLKILQEKEENKTLVDQN,42,0.011278,0.452381,0.031155,0.081428,P39933,verified


In [28]:
# Spot-check: list whatever rows remain for ORF YCL011C.
df.loc[df['orf'] == 'YCL011C']

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,kappa1,kappa2,uni_id,orf_label


In [29]:
# Write the filtered regions to disk; index=False keeps the row index out of the CSV.
df.to_csv('../../data/charged_regions/cr_filtered.csv', index=False)

## Optional clean-up: Remove MSAs with fewer than 10 long sequences (length ≥ 25 after gap removal)

In [30]:
# Thresholds for the optional clean-up: a record counts as "long" when its
# sequence, after crutil.remove_gaps, has at least MIN_LONG_LEN characters; an
# MSA is kept only if it has at least MIN_LONG_SEQS distinct long record ids.
MIN_LONG_LEN = 25
MIN_LONG_SEQS = 10

small_msas = []
for index, row in df.iterrows():
    orf = row['orf']
    # The context manager closes the file handle once the alignment is parsed.
    with open(aybrah_path + str(orf) + '-aybrah.fa') as handle:
        msa = AlignIO.read(handle, "fasta")
    # A set de-duplicates record ids directly, replacing the manual counter
    # plus list-membership test.
    long_ids = {
        record.id
        for record in msa
        if len(crutil.remove_gaps(record.seq)) >= MIN_LONG_LEN
    }
    if len(long_ids) < MIN_LONG_SEQS:
        small_msas.append(index)

In [31]:
# `small_msas` holds row labels gathered from iterrows(); drop those rows by
# label and renumber the remaining rows from 0.
df = df.drop(index=small_msas).reset_index(drop=True)

In [32]:
# Show the table after removing regions whose MSA has too few long sequences.
df

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,kappa1,kappa2,uni_id,orf_label
0,YAL011W,SWC3,626,0,54,MPAVLRTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDD...,55,0.015674,0.527273,0.297167,0.616928,P31376,verified
1,YAL011W,SWC3,626,169,265,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,97,0.050753,0.536082,0.051716,0.110243,P31376,verified
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.500000,0.044964,0.096912,P31376,verified
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000000,0.428571,0.114622,0.300249,P31376,verified
4,YAL013W,DEP1,406,81,163,TLTESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAI...,83,0.141633,0.542169,0.093375,0.239955,P31385,verified
...,...,...,...,...,...,...,...,...,...,...,...,...,...
802,YPR163C,TIF3,437,379,433,SVYDVLRTEDDDEDEEAEKQNGDAKENKVDAAVEKLQDKTAQLTVE...,55,0.122909,0.454545,0.115449,0.366767,P34167,verified
803,YPR169W,JIP5,493,434,491,LVGLSKEELLDELDKDLKEDHQEEKESNSKSVKKRKIMKENNKKKD...,58,0.000556,0.534483,0.111450,0.219531,Q06214,verified
804,YPR179C,HDA3,656,512,577,GETLSKLKDAFVKTDNVQDEIEKEERVSVSRDTEKKYMEQEIKRAV...,66,0.012219,0.469697,0.045506,0.106750,Q06623,verified
805,YPR186C,PZF1,430,370,411,NCSRTFKTKEKYEKHIDKHKVHELKLKILQEKEENKTLVDQN,42,0.011278,0.452381,0.031155,0.081428,P39933,verified
