In [1]:
import sys
sys.path.insert(1, '../scripts')

In [6]:
import numpy as np
import pandas as pd
from Bio import AlignIO

# import custom scripts
import localcider
from localcider.sequenceParameters import SequenceParameters
import fractional_charge as fc
import charge_distribution as cd
import parseaf as pa
import crutil

## Trimming

In [3]:
# Load the raw charged-region table; lines in the CSV starting with '#' are
# treated as comments and skipped by the parser.
df = pd.read_csv('../../data/charged_regions/cr_raw.csv', comment='#')
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(5)

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,kappa1,kappa2,uni_id,orf_label
0,YAL011W,SWC3,626,0,54,MPAVLRTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDD...,55,0.015674,0.527273,0.297167,0.616928,P31376,verified
1,YAL011W,SWC3,626,169,265,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,97,0.050753,0.536082,0.051716,0.110243,P31376,verified
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,0.044964,0.096912,P31376,verified
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.0,0.428571,0.114622,0.300249,P31376,verified
4,YAL013W,DEP1,406,81,163,TLTESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAI...,83,0.141633,0.542169,0.093375,0.239955,P31385,verified


In [4]:
colnames = list(df.columns)

In [5]:
def trim_region(seq):
    """Strip uncharged residues from both ends of a region sequence.

    Parameters
    ----------
    seq : str
        Amino-acid sequence in one-letter code.

    Returns
    -------
    tuple
        ``(trimmed_seq, n_lefttrim, n_righttrim)``. ``trimmed_seq`` begins and
        ends with a charged residue (K, R, D or E); the two counts are the
        numbers of residues removed from the left and right ends. They always
        satisfy ``len(trimmed_seq) == len(seq) - n_lefttrim - n_righttrim``.
        A sequence containing no charged residue gives ``('', len(seq), 0)``.
    """
    charged_AAs = frozenset('KRDE')

    n_lefttrim = 0
    for AA in seq:
        if AA in charged_AAs:
            break
        n_lefttrim += 1

    # Scan only what survives the left trim. Scanning the full sequence again
    # would count an all-uncharged sequence twice, which makes callers that
    # subtract both counts compute negative lengths and crossed bounds.
    n_righttrim = 0
    for AA in reversed(seq[n_lefttrim:]):
        if AA in charged_AAs:
            break
        n_righttrim += 1

    trimmed_seq = seq[n_lefttrim:(len(seq) - n_righttrim)]
    return trimmed_seq, n_lefttrim, n_righttrim


In [6]:
testseq = df.iloc[0]['region.seq']
print(testseq)
print(trim_region(testseq))

MPAVLRTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGNDY
('RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND', 5, 1)


In [7]:
testseq2 = df.iloc[1]['region.seq']
print(testseq2)
print(trim_region(testseq2))

RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQQMQKKIAKEQKLQLQKENKAKQKLEQEALKLKRKEEMKKLKEQNKNKQGSP
('RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQQMQKKIAKEQKLQLQKENKAKQKLEQEALKLKRKEEMKKLKEQNKNK', 0, 4)


In [8]:
testseq3 = df.iloc[2]['region.seq']
print(testseq3)
print(trim_region(testseq3))

KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEKGVKSEDTQKKEDNQVVPK
('KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEKGVKSEDTQKKEDNQVVPK', 0, 0)


In [9]:
# Rebuild every region record with its uncharged ends trimmed off and the
# bounds, length and charge statistics recomputed for the trimmed sequence.
dict_trimmed = []
for index, row in df.iterrows():
    newseq, n_lefttrim, n_righttrim = trim_region(row['region.seq'])
    # Start from a copy of all original columns, then overwrite the ones
    # that trimming changes.
    rv = {name: row[name] for name in colnames}
    rv.update({
        'left.bound': row['left.bound'] + n_lefttrim,
        'right.bound': row['right.bound'] - n_righttrim,
        'region.seq': newseq,
        'region.len': row['region.len'] - (n_lefttrim + n_righttrim),
        'charge.asymmetry': cd.get_sigma(newseq),
        'frac.charge': fc.get_fractional_charge(newseq),
    })
    dict_trimmed.append(rv)
df_trimmed = pd.DataFrame.from_records(dict_trimmed)

In [10]:
# The kappa values from the raw table describe the untrimmed sequences, so
# they are removed here. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the columns are gone.
df_trimmed = df_trimmed.drop(columns=['kappa1', 'kappa2'], errors='ignore')

In [11]:
df_trimmed.head(5)

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.55914,P31376,verified
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,P31376,verified
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000714,0.428571,P31376,verified
4,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified


In [12]:
SeqOb = SequenceParameters("DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV")
SeqOb.get_kappa() 

0.2112978461212795

In [13]:
def append_kappa(row):
    """Return the kappa value of the sequence stored in ``row['region.seq']``.

    The sequence is wrapped in a localcider ``SequenceParameters`` object and
    the result of its ``get_kappa()`` method is returned unchanged.
    """
    region_params = SequenceParameters(row['region.seq'])
    kappa = region_params.get_kappa()
    return kappa

In [14]:
# Row-wise kappa of each trimmed region sequence.
df_trimmed['kappa'] = df_trimmed.apply(append_kappa, axis=1)

In [15]:
df_trimmed.head(5)

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label,kappa
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified,0.539053
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.55914,P31376,verified,0.098538
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,P31376,verified,0.089231
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000714,0.428571,P31376,verified,0.265395
4,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified,0.219047


In [16]:
len(df_trimmed['orf'].unique())

804

In [17]:
df_trimmed.to_csv('../../data/charged_regions/cr_trimmed_raw.csv', index=False)

## Removing dubious ORFs

In [18]:
# Load the list of dubious ORFs. The TSV has no header row, so the five
# column names are supplied explicitly.
dubious_orfs_df = pd.read_table("../../data/sc_orfs/dubious_orfs.tsv", header=None,
                                names=["DBID", "systematic_name", "organism", "standard_name", "gene_name"])

In [19]:
dubious_orfs = dubious_orfs_df["systematic_name"].tolist()

In [20]:
df = pd.read_csv('../../data/charged_regions/cr_trimmed_raw.csv', comment='#')

In [21]:
df = df[~df['orf'].isin(dubious_orfs)]

In [22]:
len(df['gene'].unique())

800

In [23]:
df.to_csv('../../data/charged_regions/cr_trimmed_filtered.csv', index=False)

In [3]:
df = pd.read_csv('../../data/charged_regions/cr_trimmed_filtered.csv', comment='#')

In [4]:
df.head()

Unnamed: 0,orf,gene,seq.len,left.bound,right.bound,region.seq,region.len,charge.asymmetry,frac.charge,uni_id,orf_label,kappa
0,YAL011W,SWC3,626,5,53,RTRSKESSIEQKPASRTRTRSRRGKRGRDDDDDDDDEESDDAYDEVGND,49,0.017593,0.591837,P31376,verified,0.539053
1,YAL011W,SWC3,626,169,261,RLFILKNDKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQ...,93,0.052936,0.55914,P31376,verified,0.098538
2,YAL011W,SWC3,626,361,424,KTAATEPEPKKADDENAEKQQSKEAKTTAESTQVDVKKEEEDVKEK...,64,0.007812,0.5,P31376,verified,0.089231
3,YAL011W,SWC3,626,470,525,KSVVEFLEDTDEIIISWIVIHNSKEIEKFKTKKIKAKLKADQKLNK...,56,0.000714,0.428571,P31376,verified,0.265395
4,YAL013W,DEP1,406,84,159,ESLKRPHEDEKEAIDEAKKMKVPGENEDESKEEEKSQELEEAIDSK...,76,0.138444,0.592105,P31385,verified,0.219047


In [7]:
np.median(df['region.len'])

50.0

## Append AlphaFold prediction

In [31]:
def append_structure_label(row, fdir='/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'):
    """Look up the structure label for one charged region.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'uni_id', 'left.bound' and 'right.bound'.
    fdir : str, optional
        Directory handed to ``pa.get_structure_label``. Defaults to the
        location that was previously hard-coded, so existing calls such as
        ``df.apply(append_structure_label, axis=1)`` behave as before.

    Returns
    -------
    The value returned by ``pa.get_structure_label``, or ``None`` if that
    call raises; in that case the UniProt ID is printed.
    """
    uni_id = row['uni_id']
    left_bound = row['left.bound']
    right_bound = row['right.bound']
    try:
        label = pa.get_structure_label(fdir, uni_id, left_bound, right_bound)
    except Exception:
        # Best-effort lookup: report the failing ID and carry on. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit stop a long-running apply().
        print(uni_id)
        label = None
    return label

In [32]:
def append_percent_structure(row, fdir='/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'):
    """Compute helix, disorder and sheet fractions for one charged region.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'uni_id', 'left.bound' and 'right.bound'; the bounds are
        used as inclusive indices into the per-residue arrays.
    fdir : str, optional
        Directory containing the ``AF-<uniprot_id>-F1-model_v2.pdb`` files.
        Defaults to the location that was previously hard-coded.

    Returns
    -------
    pandas.Series
        Entries ``p_helix``, ``p_disorder`` and ``p_sheets``. All three are
        ``None`` (and the UniProt ID is printed) when the computation fails.

    Raises
    ------
    NameError
        If a module used here is not imported. This is a notebook set-up
        error, so it is reported instead of being turned into ``None`` rows.
    """
    uniprot_id = row['uni_id']
    left_bound = row['left.bound']
    right_bound = row['right.bound']
    fpath = fdir + 'AF-' + str(uniprot_id) + '-F1-model_v2.pdb'
    try:
        af_pdb = pa.read_af_output(fdir, uniprot_id)
        # NOTE(review): `md` is not imported in any visible cell; presumably
        # this is mdtraj (`import mdtraj as md`). Confirm and add it to the
        # import cell before using this function.
        ss = md.compute_dssp(af_pdb, simplified=True)[0]
        region = slice(left_bound, right_bound + 1)
        region_ss = ss[region]
        bfactor = pa.read_bfactor_from_pdb(fpath)[region]
        len_region = right_bound - left_bound + 1

        p_helix = pa.get_percent_helix(region_ss, bfactor, len_region)
        p_disorder = pa.get_percent_disorder(region_ss, bfactor, len_region)

        # A residue counts as sheet when its simplified DSSP code is 'E' and
        # its B-factor column value is at least 70.
        cnt_sheets = sum(
            1 for i, label in enumerate(region_ss)
            if label == 'E' and bfactor[i] >= 70
        )
        p_sheets = cnt_sheets / len_region
    except NameError:
        # A missing import must not be silently converted into None rows.
        raise
    except Exception:
        # Best-effort: report the failing ID and return empty values.
        print(uniprot_id)
        p_helix = None
        p_disorder = None
        p_sheets = None
    return pd.Series(dict(p_helix=p_helix, p_disorder=p_disorder, p_sheets=p_sheets))

In [33]:
# Reload the filtered regions and attach a structure label to every row.
df_hc = pd.read_csv('../../data/charged_regions/cr_trimmed_filtered.csv', comment='#')
df_hc['label'] = df_hc.apply(append_structure_label, axis=1)



P38811
P38811
P36022
Q12019
Q12019
Q12019
Q12019


In [34]:
# Discard regions whose structure label is 'unclassified'.
df_hc = df_hc[df_hc.label != 'unclassified']
# Drop rows with a missing value in ANY column; this removes the rows whose
# label lookup failed (label is None).
# NOTE(review): how='any' also drops rows that are missing a value in some
# other column (e.g. gene or uni_id) - confirm that is intended, otherwise
# restrict the check to the label column.
df_hc = df_hc.dropna(how='any')
# Save the labelled regions without the DataFrame index.
df_hc.to_csv('../../data/charged_regions/cr_trimmed_filtered_aflabel.csv', index=False)

## Extracting regions with a valid AYbRAH MSA

In [3]:
df = pd.read_csv('../../data/charged_regions/cr_trimmed_filtered.csv')

In [4]:
# Collect the ORF of every region whose AYbRAH alignment cannot be read or
# contains two or fewer sequences. An ORF is appended once per region row,
# so the list may contain duplicates.
no_valid_msa = []
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/aybrah-all/'
for index, row in df.iterrows():
    orf = row['orf']
    try:
        # The context manager closes the FASTA file; the previous bare
        # open() left one handle open per row.
        with open(fdir + str(orf) + '-aybrah.fa') as handle:
            msa = AlignIO.read(handle, "fasta")
    except Exception:
        # Missing or unparsable alignment file: treat as "no valid MSA".
        # Exception (not a bare except) keeps Ctrl-C working.
        no_valid_msa.append(orf)
        continue
    if len(msa) <= 2:
        no_valid_msa.append(orf)

In [5]:
len(no_valid_msa)

68

In [6]:
df = df[~df['orf'].isin(no_valid_msa)]

In [7]:
len(df['gene'].unique())

744

## Removing regions with the wrong SC sequence

In [8]:
# Flag regions whose sequence does not occur in any ungapped sequence of the
# ORF's AYbRAH alignment.
aybrah_path = '/Users/rosalindpan/drummondlab/hcrpaper_data/aybrah-all/'
wrong_seq = []
for index, row in df.iterrows():
    orf = row['orf']
    # Close the FASTA file as soon as it has been parsed; the previous bare
    # open() left one handle open per row.
    with open(aybrah_path + str(orf) + '-aybrah.fa') as handle:
        msa = AlignIO.read(handle, "fasta")
    # any() stops at the first record that contains the region.
    contains_seq = any(
        row['region.seq'] in crutil.remove_gaps(record.seq) for record in msa
    )
    if not contains_seq:
        wrong_seq.append(orf)

In [9]:
df = df[~df['orf'].isin(wrong_seq)]

In [10]:
len(df['gene'].unique())

738

## Remove MSAs with fewer than 10 long sequences

In [12]:
# Flag regions whose alignment has fewer than MIN_LONG_SEQS distinct record
# IDs with an ungapped length of at least MIN_UNGAPPED_LEN.
# Relies on `aybrah_path` being defined by an earlier cell.
MIN_UNGAPPED_LEN = 25
MIN_LONG_SEQS = 10

small_msas = []
for index, row in df.iterrows():
    orf = row['orf']
    # Close the FASTA file as soon as it has been parsed; the previous bare
    # open() left one handle open per row.
    with open(aybrah_path + str(orf) + '-aybrah.fa') as handle:
        msa = AlignIO.read(handle, "fasta")
    # A set counts each record ID once and replaces the list membership
    # test that was run for every record.
    long_ids = {
        record.id for record in msa
        if len(crutil.remove_gaps(record.seq)) >= MIN_UNGAPPED_LEN
    }
    n_long = len(long_ids)
    if n_long < MIN_LONG_SEQS:
        small_msas.append(orf)

In [13]:
df = df[~df['orf'].isin(small_msas)]

In [15]:
len(df['gene'].unique())

630

In [16]:
df.to_csv('../../data/charged_regions/cr_trimmed_filtered_goodmsa.csv', index=False)