In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os

In [3]:
# Load the tab-separated cluster table that mmseqs wrote.
# The two columns are labelled 'cluster' and 'id' through `names`.

clusterDataFile = '../cluster/clusterA05Res_cluster.tsv'

############################################################################
cdata = pd.read_csv(
    clusterDataFile,
    sep='\t',
    names=['cluster', 'id'],
)
print(cdata)
print(cdata['cluster'].size)                    # number of rows in the table
print(cdata['cluster'].nunique(dropna=False))   # number of distinct cluster names

             cluster             id
0      S.Aci16581III  S.Aci16581III
1               AclI           AclI
2        S.Aco12261I    S.Aco12261I
3         Aco12261II     Aco12261II
4          S.AcoA42I      S.AcoA42I
...              ...            ...
10809      M.PfrJS2V      M.PfrJS2V
10810     S.PfrJS7II     S.PfrJS7II
10811    S.PfrJS7III    S.PfrJS7III
10812    S.PfrJS7III    S.PfrJS2III
10813     M.PfrJS7IV     M.PfrJS7IV

[10814 rows x 2 columns]
10814
4271


In [5]:

# Distinct values of the 'cluster' column, held as a set for membership tests.
clusterSet = {clusterName for clusterName in cdata['cluster']}
print(len(clusterSet))

4271


In [7]:
# Load the cleaned protein-sequence table (CSV) from the data directory.

dataDir = '../data'
csvFile = 'protein_seqs_cleaned_6types.csv'

dataDf = pd.read_csv(os.path.join(dataDir, csvFile))

# Quick look at the table: contents, summary statistics, row count and
# the number of distinct entries in the 'RE' column.
print(dataDf)
print(dataDf.describe())
print(dataDf.shape[0])
print(dataDf['RE'].nunique(dropna=False))

# Distinct 'RE' names as a set.
RESet = set(dataDf['RE'])

                 RE                        type            site  \
0        M.Aac9709I   Type II methyltransferase            GATC   
1          M.AacDam   Type II methyltransferase            GATC   
2        M.AalSMS7I  Type III methyltransferase          TGAATC   
3       M.AalSMS7II    Type I methyltransferase     AGCNNNNRTCA   
4       S.AalSMS7II  Type I specificity subunit     AGCNNNNRTCA   
...             ...                         ...             ...   
10809       M.ZmoII    Type I methyltransferase  GAAGNNNNNNNTCC   
10810       S.ZmoII  Type I specificity subunit  GAAGNNNNNNNTCC   
10811   M.Zmo29192I    Type I methyltransferase   ACANNNNNNRTGG   
10812   S.Zmo29192I  Type I specificity subunit   ACANNNNNNRTGG   
10813  M.Zmo29192II   Type II methyltransferase          GATATC   

                                                sequence  
0      MPEPAKPATPAKSRPFLKWAGGKYRLMDEINRLLPKRKQCLVEPFV...  
1      MPEPAKPATPAKSRPFLKWAGGKYRLMDEINRLLPKRKQCLVEPFV...  
2      MSKEKETTENP

In [15]:
# Boolean mask over the rows of dataDf: True where the row's 'RE' name is one
# of the cluster names in clusterSet. Series.isin performs the membership test
# in one vectorised call instead of running a Python lambda per row.
# NOTE(review): this is an exact string match. In the recorded run, 'RE'
# entries written as 'M.AgsI (M.AgsIA)' do not match cluster names such as
# 'M.AgsIA', so those clusters get no row here -- confirm whether that is intended.
nameInSet = dataDf['RE'].isin(clusterSet)
print(len(RESet))
print(len(nameInSet))
nameInSet

10814
10814


0        False
1        False
2         True
3        False
4         True
         ...  
10809    False
10810    False
10811    False
10812     True
10813    False
Name: RE, Length: 10814, dtype: bool

In [17]:
# note: clustering was done on the psc6 data, which has 10814 sequences,
# so every cluster name should also be present in the psc6 data.
# open question: why do fewer names match than there are clusters?
# (first number printed: rows of dataDf flagged in nameInSet;
#  second number printed: distinct cluster names)
print(nameInSet.to_numpy().sum())
print(len(clusterSet))

# Names from dataDf['RE'] that were matched to a cluster name.
namesPresent = set(dataDf.loc[nameInSet, 'RE'])


4214
4271


In [19]:
# Cluster names that were not matched by any 'RE' entry of dataDf.
print(clusterSet - namesPresent)


{'M.Apa1591IIA', 'Avi249IA', 'BbvCIB', 'M.AgsIA', 'BfaIA', 'BbvCIA', 'Hsi33800VB', 'BspD6IB', 'M.Adh6U21IB', 'BslIB', 'Avi249IB', 'BspD6IA', 'Spn475VA', 'Bpu10IA', 'ScoDS2IIA', 'Hsi33800VC', 'Eco47IA', 'Spn475VB', 'Mbo26IIA', 'BseYIA', 'Eco47IB', 'Csp2014IB', 'M.AquIA', 'Ljo11533IIA', 'Pba2294IB', 'Hsi33800VA', 'S.Maf25IB', 'ScoDS2IIC', 'BsrDIB', 'M.Adh6U21IA', 'BtsIB', 'ScoDS2IIB', 'MtuBlIIB', 'Csp2014IC', 'BtsIA', 'BfaIB', 'BslIA', 'Sma10498IIIA', 'Bpu10IB', 'Mtu2279IIA', 'M.EcoHK31IA', 'Maf25IIA', 'M.AgsIB', 'Maf25IIB', 'Csp2014IA', 'Mtu2279IIB', 'M.AquIB', 'M.Apa1591IIB', 'BsrDIA', 'Ljo11533IIB', 'Sma325IB', 'HpyAXVIIIB', 'BseYIB', 'Maf25IIC', 'Hso63368IIB', 'HpyAXVIIIA', 'M.EcoHK31IB'}


In [24]:
# Rows whose 'RE' entry is made of two or more whitespace-separated tokens
# (e.g. 'M.AgsI (M.AgsIA)'); the selection is computed once and reused.
multiTokenRows = dataDf[dataDf['RE'].map(lambda reName: len(reName.split()) >= 2)]
print(multiTokenRows['RE'].tolist())

multiTokenRows

['M.Adh6U21I (M.Adh6U21IA)', 'M.Adh6U21I (M.Adh6U21IB)', 'M.AgsI (M.AgsIA)', 'M.AgsI (M.AgsIB)', 'M.Apa1591II (M.Apa1591IIA)', 'M.Apa1591II (M.Apa1591IIB)', 'M.AquI (M.AquIB)', 'M.AquI (M.AquIA)', 'Avi249I (Avi249IA)', 'Avi249I (Avi249IB)', 'BbvCI (BbvCIA)', 'BbvCI (BbvCIB)', 'BfaI (BfaIA)', 'BfaI (BfaIB)', 'Bpu10I (Bpu10IA)', 'Bpu10I (Bpu10IB)', 'BseYI (BseYIA)', 'BseYI (BseYIB)', 'BslI (BslIA)', 'BslI (BslIB)', 'BspD6I (BspD6IA)', 'BspD6I (BspD6IB)', 'BsrDI (BsrDIA)', 'BsrDI (BsrDIB)', 'BtsI (BtsIA)', 'BtsI (BtsIB)', 'Csp2014I (Csp2014IA)', 'Csp2014I (Csp2014IB)', 'Csp2014I (Csp2014IC)', 'M.EaeI (M.EaeIB)', 'Eco47I (Eco47IA)', 'Eco47I (Eco47IB)', 'M.EcoHK31I (M.EcoHK31IA)', 'M.EcoHK31I (M.EcoHK31IB)', 'HpyAXVIII (HpyAXVIIIA)', 'HpyAXVIII (HpyAXVIIIB)', 'Hsi33800V (Hsi33800VA)', 'Hsi33800V (Hsi33800VB)', 'Hsi33800V (Hsi33800VC)', 'Hso63250IV (Hso63250IVA)', 'Hso63250IV (Hso63250IVB)', 'Hso63368II (Hso63368IIA)', 'Hso63368II (Hso63368IIB)', 'Ljo11533II (Ljo11533IIA)', 'Ljo11533II (Ljo1

Unnamed: 0,RE,type,site,sequence
225,M.Adh6U21I (M.Adh6U21IA),Type II methyltransferase,GAANCAG,MNTNNLKKVAPRARTAFIKAITARAGELGITTKGVSAPTISGDVLQ...
226,M.Adh6U21I (M.Adh6U21IB),Type II methyltransferase,GAANCAG,MATSDNGRFLRFWHEVSAEKLCFDCTSTVESESRAEKWYAYNKGGE...
268,M.AgsI (M.AgsIA),Type II methyltransferase,TTSAA,MKFDVILTNPPFQDSVNRKKTPHKLWIDFTLTVFDRLLVDGGSLVQ...
269,M.AgsI (M.AgsIB),Type II methyltransferase,TTSAA,MSEQLWSRVARELNAHAYMAGVERTVDRVRATGEIFTPTQLVVEML...
390,M.Apa1591II (M.Apa1591IIA),Type I methyltransferase,TGACNNNNNTAC,MPRGRPRKNPDAAPAPKKTTKVSKAKAATATLGFEQQMFLAADKLR...
...,...,...,...,...
9684,Spn475V (Spn475VA),Type IIG restriction enzyme/methyltransferase,TCGAG,MLMTIDISEESLAKESADLLKILLKDRTTKKSIVWATHSYELLGKG...
9685,Spn475V (Spn475VB),Type IIG restriction enzyme/methyltransferase,TCGAG,MPVEIKTTKEIHPKIYAYTTPTVTSNEGWIKIGYTERDVTQRIKEQ...
9699,Spn7466IV (Spn7466IVA),Type IIG restriction enzyme/methyltransferase,TCGAG,MTIDISEESLAKESADLLKILLKDRTTKKSIVWATHSYELLGKGFA...
9716,Spn11891IV (Spn11891IVA),Type IIG restriction enzyme/methyltransferase,TCGAG,MTIDISEESLAKESADLLKILLKDRTTKKSIVWATHSYELLGKGFA...


In [22]:
# Show every row of the cluster table whose 'cluster' value is 'Spn475V'.
cdata.loc[cdata['cluster'] == 'Spn475V']

Unnamed: 0,cluster,id
3162,Spn475V,7400
3163,Spn475V,7401
3164,Spn475V,7736
4494,Spn475V,7062
4495,Spn475V,1317
4496,Spn475V,3170
4497,Spn475V,7063
4498,Spn475V,7398


In [11]:
# Representative enzymes: the rows of dataDf flagged True in nameInSet,
# i.e. those whose 'RE' name is also a cluster name.
repDf = dataDf.loc[nameInSet]

In [13]:
# Display the table of representative enzymes built from dataDf and nameInSet.
repDf

Unnamed: 0,RE,type,site,sequence
2,M.AalSMS7I,Type III methyltransferase,TGAATC,MSKEKETTENPMEKVQSHDWNKERLEQLKQLMPDLFTNDGALNINE...
4,S.AalSMS7II,Type I specificity subunit,AGCNNNNRTCA,MQSNYRPIGDYIQLVDERNVDLQVTTLLGLSISKQFIHSVANTVGT...
5,M.AalSMS7III,Type I methyltransferase,GAGNNNNNNNGTG,MANVGAVISSIRNIMRQDRGISGDAQRLEQLGWMLFLKIMDDKDQE...
6,S.AalSMS7III,Type I specificity subunit,GAGNNNNNNNGTG,MAKEYLIGDFLKRIKRPIQLIDDQEYKLVTIKMNHNGVVLREHKKG...
9,S.AalSMS7IV,Type I specificity subunit,TCCNNNNNTTCC,MEEIMEKAVSELKLDKSRWSLTKFGNVAIQQKQSVDRDISELTRYV...
...,...,...,...,...
10800,M.Yre11966III,Type II methyltransferase,TGGCCA,MFYSPLRYPGGKSKLTAYVLEIIKLNTLEGGTYVEPFAGGCAIAWY...
10805,S.ZalSM2I,Type I specificity subunit,GAGNNNNNNNTTCC,MSKTDGTINATPQSSHSREGGNLELPTGWRITSLGEICDINPKSKL...
10807,M.ZmoI,Type I methyltransferase,CAGNNNNCTG,MSAEFSEFVLFSEEEIRALDARVTLRESRGKQVPYVTCLVRDKEVQ...
10808,S.ZmoI,Type I specificity subunit,CAGNNNNCTG,MKGLDANVGTAREIETVQRFDAEYFRRAYVTCEEAVTQSGRSIRLR...


In [16]:
# Number of distinct 'RE' names among the representative rows.
repDf['RE'].nunique(dropna=False)

4204

In [18]:
# Write the representative subset (repDf) to disk as CSV and as FASTA.
# Set either output file name to None to skip writing that file.

combinedFileOutput =  'protein_seqs_cleaned_6types_reps.csv'
fastaFileName = 'protein_seqs_cleaned_6types_reps.fasta'

if combinedFileOutput:
    exportColumns = ['RE', 'type', 'site', 'sequence']
    repDf[exportColumns].to_csv(combinedFileOutput, index=False)

if fastaFileName:
    # ---- Convert to SeqRecord objects ----
    # One record per row: the id is the RE name and the description is
    # "<RE> <site> <type>".
    records = [
        SeqRecord(
            Seq(str(sequence).strip()),
            id=str(reName),
            description=reName + ' ' + site + ' ' + reType,
        )
        for reName, reType, site, sequence in zip(
            repDf['RE'], repDf['type'], repDf['site'], repDf['sequence']
        )
    ]

    # ---- Write to FASTA ----
    SeqIO.write(records, fastaFileName, "fasta")

In [None]:
# now create some selections of sub data bases
# set the min-seq-id
min_seq_id = 0.5

# Probability with which each cluster is put into the "pick" group.
clusterPickFraction = 0.15

# Seed for the random selection, so that re-running the cell picks the same clusters.
pickSeed = 0

#######################################################################

# NOTE(review): numberSequences, numberClusters, groupNames and groupCounts were
# not defined anywhere in this notebook, so this cell raised NameError on a
# fresh kernel. They are reconstructed here from the cluster table `cdata`
# (one group per distinct value of 'cluster', group size = number of rows with
# that value) -- confirm that this is the intended meaning.
clusterSizes = cdata.groupby('cluster').size()
groupNames = clusterSizes.index.to_numpy()
groupCounts = clusterSizes.to_numpy()
numberClusters = len(groupCounts)
numberSequences = len(cdata)

# Each cluster is picked independently with probability clusterPickFraction;
# `remaining` is the complement of `picks`.
rng = np.random.default_rng(pickSeed)
picks = rng.random(numberClusters) < clusterPickFraction
remaining = ~picks

pickGroupsSizes = groupCounts[ picks ]
remainingGroupsSizes = groupCounts[ remaining ]

print('\npick groups:\n',pd.Series(pickGroupsSizes).describe())
print('\nremaining groups:\n',pd.Series(remainingGroupsSizes).describe())

In [None]:
# Build a data set from the group names: split them into the picked ones
# and the remaining ones using the boolean masks `picks` and `remaining`.

representativePicks = groupNames[picks]
representativeRemaining = groupNames[remaining]

pickedCount = len(representativePicks)
remainingCount = len(representativeRemaining)

# Report the sizes, and each group's size relative to numberSequences.
print('\nnumber of sequences:', numberSequences)
print('number of clusters:', numberClusters)
print('number of picked sequences:', pickedCount, pickedCount / numberSequences)
print('number of remaining sequences:', remainingCount, remainingCount / numberSequences)

        

In [None]:
# Scratch arithmetic: a / (b + a) with a = 0.05724... and b = 0.33438...
# NOTE(review): the origin of these two constants is not shown in this
# notebook -- presumably copied from an earlier output; confirm, or replace
# them with the variables they came from.
0.05724061401886443/(0.3343813574995376+0.05724061401886443)

In [None]:
# Open question: how do I make sure the site space is covered by the selection?

In [None]:
# Save a selection to disk as CSV and/or FASTA.
# Output file names --- set to None if not saving the reformatted data.
# SeqIO, Seq and SeqRecord come from the import cell at the top of the notebook.

combinedFileOutput = None # 'protein_seqs_cleaned_6types.csv'
fastaFileName = None #'protein_seqs_cleaned_6types.fasta'

# NOTE(review): `temp` is not defined anywhere in this notebook. It has to hold
# the DataFrame to export (with columns RE, type, site, sequence) before either
# output name above is enabled.
if combinedFileOutput:
    temp[['RE', 'type', 'site','sequence' ]].to_csv(combinedFileOutput,index=False)

if fastaFileName:     # ---- Convert to SeqRecord objects ----
    records = []
    for _, row in temp.iterrows():
        seq = Seq(str(row["sequence"]).strip())
        # The description must be passed by keyword: a positional argument
        # after id=... is a SyntaxError, which kept the whole cell from running.
        records.append(
                    SeqRecord(seq, id=str(row["RE"]), description=row.site+' '+row.type)
                    )
        
    # ---- Write to FASTA ----
    SeqIO.write(records, fastaFileName, "fasta")