In [None]:
!pip install scikit-plot
!pip install jax-unirep

Try to deal with GPU memory allocation,
according to https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html

In [1]:

import os

# GPU memory settings for the XLA client used by JAX. They are exported
# *before* `import jax` so they are guaranteed to be in the environment no
# matter how early the backend gets initialised.
# Do not reserve a large block of GPU memory up front.
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'False'
# Fraction of GPU memory the client may claim when it does preallocate.
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '1'
# Allocate exactly what is needed on demand and release it when unused
# (documented by JAX as slower, but it keeps the memory footprint minimal).
os.environ['XLA_PYTHON_CLIENT_ALLOCATOR'] = 'platform'

import jax
# Global flag to set a specific platform, must be used at startup.
#jax.config.update('jax_platform_name', 'cpu')


In [2]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt

from Bio import SeqIO
from jax_unirep import get_reps
from jax_unirep import evotune, fit
from jax_unirep.utils import dump_params


In [3]:
import gc
# import jax 
def clear_jax_caches():
  """Clear JAX's function/compilation caches (best effort).

  When the installed JAX exposes the public ``jax.clear_caches()`` it is used.
  Otherwise the function falls back to clearing the individual private caches
  by name; a cache that does not exist in the installed version is skipped
  instead of raising, because these internals differ between releases.

  Returns:
    None.
  """
  public_clear = getattr(jax, "clear_caches", None)
  if callable(public_clear):
    public_clear()
    return

  def _lookup(dotted_path):
    """Resolve ``jax.<dotted_path>``; raises AttributeError when absent."""
    target = jax
    for part in dotted_path.split("."):
      target = getattr(target, part)
    return target

  # main jit/pmap lu wrapped function caches - have to grab from closures
  for dotted_path in ("xla._xla_callable", "pxla.parallel_callable"):
    try:
      _lookup(dotted_path).__closure__[1].cell_contents.clear()
    except (AttributeError, IndexError, TypeError, ValueError):
      # Function missing, or its closure is laid out differently here.
      continue

  lru_cached_paths = (
      # primitive callable caches
      "xla.xla_primitive_callable",
      "xla.primitive_computation",
      # jaxpr caches for control flow and reductions
      "lax.lax_control_flow._initial_style_jaxpr",
      "lax.lax_control_flow._fori_body_fun",
      "lax.lax._reduction_jaxpr",
      # these are trivial and only included for completeness sake
      "lax.lax.broadcast_shapes",
      "xla.xb.get_backend",
      "xla.xb.dtype_to_etype",
      "xla.xb.supported_numpy_dtypes",
  )
  for dotted_path in lru_cached_paths:
    try:
      _lookup(dotted_path).cache_clear()
    except AttributeError:
      # Not present (or not cached) in this JAX version.
      continue
    
def reset_device_memory(delete_objs=True):
    """Free the device buffers of all live JAX arrays (best effort).

    Uses the public ``jax.live_arrays()`` when the installed JAX provides it.
    Otherwise it falls back to scanning the garbage collector for
    ``jax.xla.DeviceArray`` instances (skipping ``DeviceConstant``), as older
    releases require. Arrays that cannot be freed are skipped.

    Args:
      delete_objs: bool: when true, this function's own references to the
        arrays are dropped before the final garbage collection. References
        held by the caller cannot be removed from here.

    Returns:
      number of arrays whose device buffer was freed by this call.
    """
    live_arrays = getattr(jax, "live_arrays", None)
    use_public_api = callable(live_arrays)

    if use_public_api:
        candidates = list(live_arrays())
    else:
        legacy_xla = getattr(jax, "xla", None)
        device_array_t = getattr(legacy_xla, "DeviceArray", None)
        if device_array_t is None:
            # Neither API is available: nothing this function can free.
            gc.collect()
            return 0
        # An empty tuple makes the isinstance() check below always False.
        constant_t = getattr(legacy_xla, "DeviceConstant", ())
        candidates = [
            obj for obj in gc.get_objects()
            if isinstance(obj, device_array_t)
            and not isinstance(obj, constant_t)
        ]

    n_deleted = 0
    for arr in candidates:
        try:
            if use_public_api:
                if arr.is_deleted():
                    continue
                arr.delete()
            else:
                # Presumably raises when the buffer is already gone, so such
                # arrays are not counted -- confirm against the JAX version.
                arr._check_if_deleted()  # pylint: disable=protected-access
                arr.device_buffer.delete()
            n_deleted += 1
        except Exception:  # best effort; KeyboardInterrupt still propagates
            continue

    if delete_objs:
        candidates.clear()
    gc.collect()
    return n_deleted

In [6]:
def createREPs(df, filename):
    """Compute representations for ``df['Sequence']`` and pickle the frame.

    ``df`` is modified in place: the 'Sequence' column is removed and a 'reps'
    column is added that holds, per row, the first array returned by
    ``get_reps`` converted to a plain Python list.

    Args:
      df: DataFrame containing a 'Sequence' column.
      filename: destination handed to ``DataFrame.to_pickle``.
    """
    sequences = df['Sequence'].to_list()
    # get_reps returns three arrays; only the first one is kept.
    avg_hidden, _final_hidden, _final_cell = get_reps(sequences)
    df.drop(columns=['Sequence'], inplace=True)
    # A plain list is assigned by position, so the rows line up with
    # `sequences` regardless of what the DataFrame index looks like.
    df['reps'] = avg_hidden.tolist()
    df.to_pickle(filename)

In [2]:
# Path of the FASTA file opened by the parsing cell that follows.
fastas = "/home/kongkitimanonk/SCRATCH_NOBAK/cd-hit/bk_fasta/SRR1552488.assembly.len15.fasta"
# Path of the pickle file the resulting DataFrame is written to (df.to_pickle).
plk = "/home/kongkitimanonk/SCRATCH_NOBAK/phase3/DECockroach.len15.pkl"

In [3]:
# Parallel lists, one entry per FASTA record, in file order.
identifiers = []
lengths = []
seqs = []
with open(fastas) as fasta_file:  # Will close handle cleanly
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        # Remove leading and trailing '*' once and reuse the result, so the
        # recorded length always matches the sequence that is stored.
        sequence = str(seq_record.seq).strip('*')
        identifiers.append(seq_record.id)
        seqs.append(sequence)
        lengths.append(len(sequence))

In [4]:
# One list per column. The mapping is deliberately not called `dict`, which
# would shadow the builtin for the rest of the kernel session.
frame_data = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}
df = pd.DataFrame(frame_data)
df

Unnamed: 0,ID,Sequence,length
0,0,SSRHLQWDSTLSPHFCF,17
1,1,RGRCIRSSCPSSHLGWR,17
2,2,FCCYIYCMPVQNSIYAS,17
3,3,YSKFSLCQYYFKFCPYST,18
4,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30
...,...,...,...
894303,894303,WPCTHNGDPCTHFGFSA,17
894304,894304,ETCGNQCNCDCADAGIN,17
894305,894305,LQPLNVLWHCMWKSLCEISECLHNLFPC,28
894306,894306,SCHRSCCKWHPSAVRHM,17


In [5]:
# Compute representations for every sequence in one call; get_reps returns
# three arrays and the first one is stored row by row as plain lists.
all_sequences = df['Sequence'].to_list()
_h_avg, h_final, c_final = get_reps(all_sequences)
df['reps'] = _h_avg.tolist()
df

Unnamed: 0,ID,Sequence,length,reps
0,0,SSRHLQWDSTLSPHFCF,17,"[0.03517688065767288, -0.05165969580411911, -0..."
1,1,RGRCIRSSCPSSHLGWR,17,"[0.03560592234134674, -0.08346492797136307, -0..."
2,2,FCCYIYCMPVQNSIYAS,17,"[0.037034764885902405, 0.0007549189031124115, ..."
3,3,YSKFSLCQYYFKFCPYST,18,"[0.03289207071065903, -0.011171547695994377, -..."
4,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30,"[0.02026762254536152, -0.06562212109565735, -0..."
...,...,...,...,...
894303,894303,WPCTHNGDPCTHFGFSA,17,"[0.037583060562610626, -0.11564307659864426, -..."
894304,894304,ETCGNQCNCDCADAGIN,17,"[0.03682224452495575, -0.07952786237001419, -0..."
894305,894305,LQPLNVLWHCMWKSLCEISECLHNLFPC,28,"[0.022418277338147163, -0.021197671070694923, ..."
894306,894306,SCHRSCCKWHPSAVRHM,17,"[0.03473144769668579, -0.043851062655448914, -..."


Remove duplicate sequences

In [8]:
# Keep the first row of every distinct sequence; df is modified in place.
df.drop_duplicates(subset='Sequence', keep='first', inplace=True)
df

Unnamed: 0,ID,Sequence,length,reps
0,0,SSRHLQWDSTLSPHFCF,17,"[0.03517688065767288, -0.05165969580411911, -0..."
1,1,RGRCIRSSCPSSHLGWR,17,"[0.03560592234134674, -0.08346492797136307, -0..."
2,2,FCCYIYCMPVQNSIYAS,17,"[0.037034764885902405, 0.0007549189031124115, ..."
3,3,YSKFSLCQYYFKFCPYST,18,"[0.03289207071065903, -0.011171547695994377, -..."
4,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30,"[0.02026762254536152, -0.06562212109565735, -0..."
...,...,...,...,...
894303,894303,WPCTHNGDPCTHFGFSA,17,"[0.037583060562610626, -0.11564307659864426, -..."
894304,894304,ETCGNQCNCDCADAGIN,17,"[0.03682224452495575, -0.07952786237001419, -0..."
894305,894305,LQPLNVLWHCMWKSLCEISECLHNLFPC,28,"[0.022418277338147163, -0.021197671070694923, ..."
894306,894306,SCHRSCCKWHPSAVRHM,17,"[0.03473144769668579, -0.043851062655448914, -..."


In [7]:
# Write the frame to the pickle path, then load it straight back and display
# the loaded copy as a round-trip check of the saved file.
df.to_pickle(plk)
tmp_df = pd.read_pickle(plk)
tmp_df

Unnamed: 0,ID,Sequence,length,reps
0,0,SSRHLQWDSTLSPHFCF,17,"[0.03517688065767288, -0.05165969580411911, -0..."
1,1,RGRCIRSSCPSSHLGWR,17,"[0.03560592234134674, -0.08346492797136307, -0..."
2,2,FCCYIYCMPVQNSIYAS,17,"[0.037034764885902405, 0.0007549189031124115, ..."
3,3,YSKFSLCQYYFKFCPYST,18,"[0.03289207071065903, -0.011171547695994377, -..."
4,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30,"[0.02026762254536152, -0.06562212109565735, -0..."
...,...,...,...,...
894303,894303,WPCTHNGDPCTHFGFSA,17,"[0.037583060562610626, -0.11564307659864426, -..."
894304,894304,ETCGNQCNCDCADAGIN,17,"[0.03682224452495575, -0.07952786237001419, -0..."
894305,894305,LQPLNVLWHCMWKSLCEISECLHNLFPC,28,"[0.022418277338147163, -0.021197671070694923, ..."
894306,894306,SCHRSCCKWHPSAVRHM,17,"[0.03473144769668579, -0.043851062655448914, -..."


## CD-100

In [4]:
fastas = "/home/ubuntu/data/bk_fasta/SRR1552488.assembly.len15.cd100.fasta"

# Parallel lists, one entry per FASTA record, in file order.
identifiers = []
lengths = []
seqs = []
with open(fastas) as fasta_file:  # Will close handle cleanly
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        # Remove leading and trailing '*' once and reuse the result, so the
        # recorded length always matches the sequence that is stored.
        sequence = str(seq_record.seq).strip('*')
        identifiers.append(seq_record.id)
        seqs.append(sequence)
        lengths.append(len(sequence))

# One list per column. The mapping is deliberately not called `dict`, which
# would shadow the builtin for the rest of the kernel session.
frame_data = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}
df = pd.DataFrame(frame_data)
df

Unnamed: 0,ID,Sequence,length
0,0,SSRHLQWDSTLSPHFCF,17
1,1,RGRCIRSSCPSSHLGWR,17
2,3,YSKFSLCQYYFKFCPYST,18
3,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30
4,5,AIPSLPHSRRLWSPHHL,17
...,...,...,...
626564,894298,IHCEEQQCSVRYVWCRR,17
626565,894299,WFDYFVSFHPFNLRNWLA,19
626566,894303,WPCTHNGDPCTHFGFSA,17
626567,894304,ETCGNQCNCDCADAGIN,17


In [5]:
# Number of consecutive IDs that share one chunk.
chunk_size = 2000
chunk_dir = '/mnt/vdb/DECockroach/cd100/chunks'
# Create the output directory up front so the first to_csv cannot fail on it.
os.makedirs(chunk_dir, exist_ok=True)

# Chunk number of each row: integer division of the numeric ID.
df['chunk'] = df['ID'].astype(int) // chunk_size
# The helper 'chunk' column is not written to the CSV files.
cols = [col for col in df.columns if col != 'chunk']

# One CSV per group. Files are numbered consecutively in group order, which
# is not necessarily the chunk number when some ID ranges are empty.
for i, (_, chunk) in enumerate(df.groupby('chunk')):
    chunk[cols].to_csv(os.path.join(chunk_dir, f'chunk{i}.csv'), index=False)
print("complete")

complete


In [7]:
import glob

# Paths of the representation pickles that already exist on disk; the
# processing loop uses this list to skip chunks that are already done.
appended_reps = glob.glob("/mnt/vdb/DECockroach/cd100/reps/*.pkl")
print(len(appended_reps))

0


In [8]:
# Compute and pickle the representations of every chunk CSV that has no
# entry in `appended_reps` yet.
reps_dir = "/mnt/vdb/DECockroach/cd100/reps/"
for infile in glob.glob("/mnt/vdb/DECockroach/cd100/chunks/*.csv"):
    # Swap only the extension; str.replace("csv", "pkl") would also rewrite
    # any other "csv" occurring inside the file name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = reps_dir + stem + ".pkl"
    if result in appended_reps:
        continue
    # A dedicated name keeps the notebook-level `df` from being overwritten
    # by the last chunk that was processed.
    chunk_df = pd.read_csv(infile)
    try:
        createREPs(chunk_df, result)
        print("Save:"+result)
    finally:
        # Release device memory and caches even when a chunk fails, so the
        # error does not leave the accelerator memory occupied.
        reset_device_memory()
        clear_jax_caches()
print("complete")

Save:/mnt/vdb/DECockroach/cd100/reps/chunk124.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk50.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk155.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk214.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk10.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk396.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk251.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk352.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk171.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk415.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk349.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk395.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk130.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk300.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk147.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk135.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk245.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk73.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk123.pkl
Save:/mnt/vdb/DECockroach/cd100/reps/chunk133.pkl
Sav

## Transpi

In [5]:
fastas = "/mnt/vdb/DECockroach/transpi/SRR1552488.combined.okay.fa.transdecoder.pep"

# Parallel lists, one entry per FASTA record, in file order.
identifiers = []
lengths = []
seqs = []
with open(fastas) as fasta_file:  # Will close handle cleanly
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        # Remove leading and trailing '*' once and reuse the result, so the
        # recorded length always matches the sequence that is stored.
        sequence = str(seq_record.seq).strip('*')
        identifiers.append(seq_record.id)
        seqs.append(sequence)
        lengths.append(len(sequence))

# One list per column. The mapping is deliberately not called `dict`, which
# would shadow the builtin for the rest of the kernel session.
frame_data = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}
df = pd.DataFrame(frame_data)
df

Unnamed: 0,ID,Sequence,length
0,SOAP.k25.C372231.p1,VYYRRDGKGDKEYWTCQKKPECKATAITIRTGDTVTILKESDHWHA...,102
1,SOAP.k25.C373809.p1,KTRLTVVGTKVINEKNNVKLKGVSKVVSLHVYRLAPDTTIEELTEY...,104
2,SOAP.k25.C379695.p1,MLRDYREIGNLVLCFDTPFTVDFKVIQDAALQKELIEFRCDRRLRE...,95
3,SOAP.k25.C382451.p1,PSPCGANAVCREQNGAGSCTCLPDYVGNPYEGCRPECVLNTDCPSN...,116
4,SOAP.k25.C383053.p1,FKMLTMPRRDICQIETLNLADPLMFLVRNRVCTSTMFHLLPFSYTS...,104
...,...,...,...
17554,Velvet.k37.NODE_9552_length_1035_cov_12.333333.p1,EPKLVNEVNLTFHEKDGEEFMALDKNLKVTTTVKRVYMHLTNLFNG...,102
17555,Velvet.k37.NODE_9713_length_2207_cov_17.057997.p1,MIRRWWKLFMFIMAMLLDVREAFYVPGVAPVEFRKGARIDVKAVKM...,627
17556,Velvet.k37.NODE_9748_length_894_cov_40.512302.p1,MLNFSHHVSKTIRKKKSSKITGRFSRYKKMRTCSSLNEIYIVYIYI...,124
17557,Velvet.k37.NODE_9800_length_726_cov_45.530304.p1,MASFEQAIQQNVMQVAKKVEEHLDAELEKLEKLDSDDLDKLREKRL...,221


In [6]:
# Regex alternation of the amino-acid letters that disqualify a sequence.
NON_CODE = "B|Z|J|U|O"
# Flag every sequence containing at least one of those letters and keep
# only the rows that are not flagged.
has_non_code = df["Sequence"].str.contains(NON_CODE, regex=True)
df = df.loc[~has_non_code]
df

Unnamed: 0,ID,Sequence,length
0,SOAP.k25.C372231.p1,VYYRRDGKGDKEYWTCQKKPECKATAITIRTGDTVTILKESDHWHA...,102
1,SOAP.k25.C373809.p1,KTRLTVVGTKVINEKNNVKLKGVSKVVSLHVYRLAPDTTIEELTEY...,104
2,SOAP.k25.C379695.p1,MLRDYREIGNLVLCFDTPFTVDFKVIQDAALQKELIEFRCDRRLRE...,95
3,SOAP.k25.C382451.p1,PSPCGANAVCREQNGAGSCTCLPDYVGNPYEGCRPECVLNTDCPSN...,116
4,SOAP.k25.C383053.p1,FKMLTMPRRDICQIETLNLADPLMFLVRNRVCTSTMFHLLPFSYTS...,104
...,...,...,...
17554,Velvet.k37.NODE_9552_length_1035_cov_12.333333.p1,EPKLVNEVNLTFHEKDGEEFMALDKNLKVTTTVKRVYMHLTNLFNG...,102
17555,Velvet.k37.NODE_9713_length_2207_cov_17.057997.p1,MIRRWWKLFMFIMAMLLDVREAFYVPGVAPVEFRKGARIDVKAVKM...,627
17556,Velvet.k37.NODE_9748_length_894_cov_40.512302.p1,MLNFSHHVSKTIRKKKSSKITGRFSRYKKMRTCSSLNEIYIVYIYI...,124
17557,Velvet.k37.NODE_9800_length_726_cov_45.530304.p1,MASFEQAIQQNVMQVAKKVEEHLDAELEKLEKLDSDDLDKLREKRL...,221


In [7]:
# Count the sequences that still contain a literal '*' (plain substring
# match, not a regex). Disabled alternative that would strip them instead:
#df["Sequence"] = df["Sequence"].str.replace("*","") # match
star_mask = df["Sequence"].str.contains("*", regex=False)
len(df[star_mask])

0

In [8]:
# Number of consecutive index values that share one chunk.
chunk_size = 100
chunk_dir = '/mnt/vdb/DECockroach/transpi/chunks'
# Create the output directory up front so the first to_csv cannot fail on it.
os.makedirs(chunk_dir, exist_ok=True)

# Chunk number of each row: integer division of the row index.
# assign() returns a new frame, so nothing is written into a frame that was
# produced by boolean filtering (which triggers SettingWithCopyWarning).
df = df.assign(chunk=df.index // chunk_size)
# The helper 'chunk' column is not written to the CSV files.
cols = [col for col in df.columns if col != 'chunk']

# One CSV per group. Files are numbered consecutively in group order, which
# is not necessarily the chunk number when whole index ranges were filtered out.
for i, (_, chunk) in enumerate(df.groupby('chunk')):
    chunk[cols].to_csv(os.path.join(chunk_dir, f'chunk{i}.csv'), index=False)
print("complete")

complete


In [4]:
import glob

# Paths of the representation pickles that already exist on disk; the
# processing loop uses this list to skip chunks that are already done.
appended_reps = glob.glob("/mnt/vdb/DECockroach/transpi/reps/*.pkl")
print(len(appended_reps))

0


In [None]:
# Compute and pickle the representations of every chunk CSV that has no
# entry in `appended_reps` yet.
reps_dir = "/mnt/vdb/DECockroach/transpi/reps/"
for infile in glob.glob("/mnt/vdb/DECockroach/transpi/chunks/*.csv"):
    # Swap only the extension; str.replace("csv", "pkl") would also rewrite
    # any other "csv" occurring inside the file name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = reps_dir + stem + ".pkl"
    if result in appended_reps:
        continue
    # A dedicated name keeps the notebook-level `df` from being overwritten
    # by the last chunk that was processed.
    chunk_df = pd.read_csv(infile)
    try:
        createREPs(chunk_df, result)
        print("Save:"+result)
    finally:
        # Release device memory and caches even when a chunk fails, so the
        # error does not leave the accelerator memory occupied.
        reset_device_memory()
        clear_jax_caches()
print("complete")

Save:/mnt/vdb/DECockroach/transpi/reps/chunk124.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk50.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk155.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk10.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk171.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk130.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk147.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk135.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk73.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk123.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk133.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk49.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk168.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk175.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk17.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk150.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk151.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk102.pkl
Save:/mnt/vdb/DECockroach/transpi/reps/chunk36.pkl
Save:/mnt/vdb/DECo