In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os

In [2]:
# Load the cluster table written by mmseqs.
# The TSV has no header row, so the two columns are named here:
# 'cluster' (the cluster name) and 'id' (the member entry).

clusterDataFile = '../cluster/clusterA05Res_cluster.tsv'

############################################################################
cdata = pd.read_csv(
    clusterDataFile,
    sep='\t',
    names=['cluster', 'id'],
)
print(cdata)
# number of rows, then number of distinct cluster names
print(len(cdata['cluster']))
print(len(cdata['cluster'].unique()))

             cluster             id
0      S.Aci16581III  S.Aci16581III
1               AclI           AclI
2        S.Aco12261I    S.Aco12261I
3         Aco12261II     Aco12261II
4          S.AcoA42I      S.AcoA42I
...              ...            ...
10809      M.PfrJS2V      M.PfrJS2V
10810     S.PfrJS7II     S.PfrJS7II
10811    S.PfrJS7III    S.PfrJS7III
10812    S.PfrJS7III    S.PfrJS2III
10813     M.PfrJS7IV     M.PfrJS7IV

[10814 rows x 2 columns]
10814
4271


In [3]:

# Distinct cluster names, kept as a set for fast membership tests.
clusterSet = set(cdata['cluster'])
print(len(clusterSet))

4271


In [4]:
# Read the sequence table (CSV) and print a few size summaries.

dataDir = '.'
csvFile = 'protein_seqs_cleaned_6types.csv'

dataDf = pd.read_csv(os.path.join(dataDir, csvFile))

print(dataDf)
print(dataDf.describe())
# total row count, then the number of distinct RE names
print(len(dataDf))
print(len(dataDf.RE.unique()))
# set of all RE names found in the table
RESet = set(dataDf.RE)

                 RE                        type            site  \
0        M.Aac9709I   Type II methyltransferase            GATC   
1          M.AacDam   Type II methyltransferase            GATC   
2        M.AalSMS7I  Type III methyltransferase          TGAATC   
3       M.AalSMS7II    Type I methyltransferase     AGCNNNNRTCA   
4       S.AalSMS7II  Type I specificity subunit     AGCNNNNRTCA   
...             ...                         ...             ...   
10809       M.ZmoII    Type I methyltransferase  GAAGNNNNNNNTCC   
10810       S.ZmoII  Type I specificity subunit  GAAGNNNNNNNTCC   
10811   M.Zmo29192I    Type I methyltransferase   ACANNNNNNRTGG   
10812   S.Zmo29192I  Type I specificity subunit   ACANNNNNNRTGG   
10813  M.Zmo29192II   Type II methyltransferase          GATATC   

                                                sequence  
0      MPEPAKPATPAKSRPFLKWAGGKYRLMDEINRLLPKRKQCLVEPFV...  
1      MPEPAKPATPAKSRPFLKWAGGKYRLMDEINRLLPKRKQCLVEPFV...  
2      MSKEKETTENP

In [7]:
# Boolean mask over dataDf rows: True where the row's RE name is one of the
# cluster names in clusterSet.
# note- cluster names are representative sequences from cluster
# `isin` performs the membership test in one vectorised call instead of
# invoking a Python lambda once per row.
nameInSet = dataDf['RE'].isin(clusterSet)
print(len(RESet))
print(len(nameInSet))
nameInSet

10814
10814


0        False
1        False
2         True
3        False
4         True
         ...  
10809    False
10810    False
10811    False
10812     True
10813    False
Name: RE, Length: 10814, dtype: bool

In [8]:
# note: clustering was done on psc6 data, with 10814 sequences.
# Every cluster name should therefore occur in the psc6 table, so the two
# numbers printed below should agree (4271 for this data set).

print(nameInSet.to_numpy().sum())  # number of True entries; should be 4271
print(len(clusterSet))
# RE names of the rows selected by the mask
namesPresent = set(dataDf.loc[nameInSet, 'RE'])


4271
4271


In [9]:
# Cluster names that were not found among the RE names selected from dataDf.
# This must be empty: a missing name would mean a cluster has no row in the
# representative table built from the mask.
missingReps = clusterSet.difference(namesPresent)
print(missingReps)
# Fail loudly instead of relying on the reader to inspect the printed set.
assert not missingReps, (
    f"{len(missingReps)} cluster names are missing from dataDf['RE']"
)


set()


In [10]:
# Keep only the rows whose RE name is a cluster name
# (i.e. the representative enzymes).
repDf = dataDf.loc[nameInSet]

In [11]:
# Display the table of representative entries (rows of dataDf selected by nameInSet).
repDf

Unnamed: 0,RE,type,site,sequence
2,M.AalSMS7I,Type III methyltransferase,TGAATC,MSKEKETTENPMEKVQSHDWNKERLEQLKQLMPDLFTNDGALNINE...
4,S.AalSMS7II,Type I specificity subunit,AGCNNNNRTCA,MQSNYRPIGDYIQLVDERNVDLQVTTLLGLSISKQFIHSVANTVGT...
5,M.AalSMS7III,Type I methyltransferase,GAGNNNNNNNGTG,MANVGAVISSIRNIMRQDRGISGDAQRLEQLGWMLFLKIMDDKDQE...
6,S.AalSMS7III,Type I specificity subunit,GAGNNNNNNNGTG,MAKEYLIGDFLKRIKRPIQLIDDQEYKLVTIKMNHNGVVLREHKKG...
9,S.AalSMS7IV,Type I specificity subunit,TCCNNNNNTTCC,MEEIMEKAVSELKLDKSRWSLTKFGNVAIQQKQSVDRDISELTRYV...
...,...,...,...,...
10800,M.Yre11966III,Type II methyltransferase,TGGCCA,MFYSPLRYPGGKSKLTAYVLEIIKLNTLEGGTYVEPFAGGCAIAWY...
10805,S.ZalSM2I,Type I specificity subunit,GAGNNNNNNNTTCC,MSKTDGTINATPQSSHSREGGNLELPTGWRITSLGEICDINPKSKL...
10807,M.ZmoI,Type I methyltransferase,CAGNNNNCTG,MSAEFSEFVLFSEEEIRALDARVTLRESRGKQVPYVTCLVRDKEVQ...
10808,S.ZmoI,Type I specificity subunit,CAGNNNNCTG,MKGLDANVGTAREIETVQRFDAEYFRRAYVTCEEAVTQSGRSIRLR...


In [14]:
# Row count of the representative table.
repDf.shape[0]

4271

In [15]:
# Write the representative table to disk.
# Output file names --- set either one to None to skip writing that file.

combinedFileOutput =  'protein_seqs_cleaned_6types_reps.csv'
fastaFileName = 'protein_seqs_cleaned_6types_reps.fasta'

if combinedFileOutput:
    # CSV with the four data columns, without the index
    repDf[['RE', 'type', 'site','sequence' ]].to_csv(combinedFileOutput,index=False)

if fastaFileName:
    # ---- Convert to SeqRecord objects ----
    # One record per row: id is the RE name, description is "<site> <type>".
    records = [
        SeqRecord(
            Seq(str(sequence).strip()),
            id=str(name),
            description=site+' '+kind,
        )
        for name, kind, site, sequence in zip(
            repDf["RE"], repDf["type"], repDf["site"], repDf["sequence"]
        )
    ]

    # ---- Write to FASTA ----
    SeqIO.write(records, fastaFileName, "fasta")

In [None]:
# now create some selections of sub data bases
# set the min-seq-id (not used in this cell)
min_seq_id = 0.5

# probability with which each cluster is picked, independently of the others
clusterPickFraction = 0.15

# fixed seed so that Restart & Run All reproduces the same picks
pickSeed = 0

#######################################################################

# Per-cluster bookkeeping derived from the cluster table `cdata`.
# NOTE(review): these four names were used here without being defined in any
# visible cell; this derivation assumes they describe the clustering held in
# `cdata` -- confirm that is the intended clustering.
groupCounts = cdata.groupby('cluster').size()   # rows per cluster, indexed by cluster name
groupNames = groupCounts.index.to_numpy()       # cluster names, aligned with groupCounts
numberClusters = len(groupCounts)
numberSequences = len(cdata)

# Boolean mask over clusters: True = picked, False = remaining.
rng = np.random.default_rng(pickSeed)
picks = rng.random(numberClusters) < clusterPickFraction
remaining = ~picks

pickGroupsSizes = groupCounts[ picks ]
remainingGroupsSizes = groupCounts[ remaining ]

print('\npick groups:\n',pd.Series(pickGroupsSizes).describe())
print('\nremaining groups:\n',pd.Series(remainingGroupsSizes).describe())

In [None]:
# Take one name per cluster and split the names into the picked clusters
# and the remaining clusters, using the boolean masks `picks` / `remaining`.

representativePicks = groupNames[picks]
representativeRemaining = groupNames[remaining]

print('\nnumber of sequences:', numberSequences)
print('number of clusters:', numberClusters)
# for each split: label, count, and count as a fraction of all sequences
for label, names in (
    ('number of picked sequences:', representativePicks),
    ('number of remaining sequences:', representativeRemaining),
):
    print(label, len(names), len(names)/numberSequences)

        

In [None]:
# Ratio a / (b + a) of two hard-coded fractions.
# NOTE(review): the literals look like the picked and remaining fractions printed
# by an earlier run of the cell above -- confirm, and compute them from the data instead.
0.05724061401886443/(0.3343813574995376+0.05724061401886443)

In [None]:
# TODO: how do I make sure the site space is still covered by the selection?

In [None]:
# Optionally write a selected table to CSV and FASTA.
# Output file names --- leave as None to skip writing that file.
# (SeqIO, Seq and SeqRecord are already imported in the first cell of the
# notebook, so they are not re-imported here.)

combinedFileOutput = None # 'protein_seqs_cleaned_6types.csv'
fastaFileName = None #'protein_seqs_cleaned_6types.fasta'

# NOTE(review): `temp` is not defined in any visible cell; it must hold the
# selected rows (columns RE, type, site, sequence) before either output
# name above is set to a real path.
if combinedFileOutput:
    temp[['RE', 'type', 'site','sequence' ]].to_csv(combinedFileOutput,index=False)

if fastaFileName:     # ---- Convert to SeqRecord objects ----
    records = []
    for _, row in temp.iterrows():
        seq = Seq(str(row["sequence"]).strip())
        records.append(
            # description must be passed by keyword: a positional argument
            # after `id=` is a SyntaxError and stops the cell from compiling
            SeqRecord(seq, id=str(row["RE"]), description=row["site"]+' '+row["type"])
        )

    # ---- Write to FASTA ----
    SeqIO.write(records, fastaFileName, "fasta")