In [None]:
!pip install scikit-plot
!pip install jax-unirep

Try to deal with GPU memory 
accroading to https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html

In [1]:
import tensorflow as tf
print("Num of GPUs available: ", len(tf.test.gpu_device_name()))

Num of GPUs available:  13


In [1]:
import jax
# Global flag to set a specific platform, must be used at startup.
jax.config.update('jax_platform_name', 'cpu')

import os
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE']='False'
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION']='.95'
os.environ['XLA_PYTHON_CLIENT_ALLOCATOR']='platform'



In [2]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt

from Bio import SeqIO
from jax_unirep import get_reps
from jax_unirep import evotune, fit
from jax_unirep.utils import dump_params

In [3]:
import gc
def clear_jax_caches():
  """Utility to clear all the function caches in jax."""
  # main jit/pmap lu wrapped function caches - have to grab from closures
  jax.xla._xla_callable.__closure__[1].cell_contents.clear()
  jax.pxla.parallel_callable.__closure__[1].cell_contents.clear()
  # primitive callable caches
  jax.xla.xla_primitive_callable.cache_clear()
  jax.xla.primitive_computation.cache_clear()
  # jaxpr caches for control flow and reductions
  jax.lax.lax_control_flow._initial_style_jaxpr.cache_clear()
  jax.lax.lax_control_flow._fori_body_fun.cache_clear()
  jax.lax.lax._reduction_jaxpr.cache_clear()
  # these are trivial and only included for completeness sake
  jax.lax.lax.broadcast_shapes.cache_clear()
  jax.xla.xb.get_backend.cache_clear()
  jax.xla.xb.dtype_to_etype.cache_clear()
  jax.xla.xb.supported_numpy_dtypes.cache_clear()
    
def reset_device_memory(delete_objs=True):
    """Free all tracked DeviceArray memory and delete objects.

  Args:
    delete_objs: bool: whether to delete all live DeviceValues or just free.

  Returns:
    number of DeviceArrays that were manually freed.
  """
    dvals = (x for x in gc.get_objects() if isinstance(x, jax.xla.DeviceArray))
    n_deleted = 0
    for dv in dvals:
    
        if not isinstance(dv, jax.xla.DeviceConstant):
            try: 
                dv._check_if_deleted()  # pylint: disable=protected-access
                dv.device_buffer.delete()
                n_deleted += 1
            except:
                pass
        if delete_objs:

            del dv
    del dvals
    gc.collect()
    return n_deleted

In [4]:
fastas = "/home/ubuntu/data/bk_fasta/SRR5868581_2.len20.assembly.fasta"
#plk = "/mnt/vdb/phase3/TaiwaneseOolong.len20.pkl"

In [5]:
with open(fastas) as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    seqs = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        identifiers.append(seq_record.id)
        # Remove leading and trailing characters from a string
        seqs.append(str(seq_record.seq.strip('*')))
        lengths.append(len(seq_record.seq))

In [6]:
# dictionary of lists  
dict = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}  
df = pd.DataFrame(dict) 
#df["Sequence"] =  seqs
#df.to_pickle(plk)    
df  

Unnamed: 0,ID,Sequence,length
0,0,GRWNNAGKSEEHTSELQSRSDLVCRLLLEKKRGGGG,36
1,1,MHIGKIEEHTSELQSRSDLVCRHQLEKKNGGG,32
2,2,DVCSSDYPGFRFIPHRQVCLPKMAHLELSIPWRGSTEQPRRP,42
3,3,TTLGQTEEHTSELQSRSDLVCRLQLEKKKGG,31
4,4,YDGYPSSNTFKLTLTDLMSHSQFHSLN,28
...,...,...,...
1058124,1058124,AGHTEQIGRAHWARSEEHTGQDRKRTQ,28
1058125,1058125,SEEHTSELQSRSYLVSRLLLEKKKEG,26
1058126,1058126,SSLIASSAYQKWHTWSSRFRGAAQRSSRAVLPI,34
1058127,1058127,ASPVLLTKNGTLGALDSVARLNGAAAPSYLFKV,34


Remove Duplicate

In [7]:
df.drop_duplicates(subset=['Sequence'],inplace=True)
df

Unnamed: 0,ID,Sequence,length
0,0,GRWNNAGKSEEHTSELQSRSDLVCRLLLEKKRGGGG,36
1,1,MHIGKIEEHTSELQSRSDLVCRHQLEKKNGGG,32
2,2,DVCSSDYPGFRFIPHRQVCLPKMAHLELSIPWRGSTEQPRRP,42
3,3,TTLGQTEEHTSELQSRSDLVCRLQLEKKKGG,31
4,4,YDGYPSSNTFKLTLTDLMSHSQFHSLN,28
...,...,...,...
1058124,1058124,AGHTEQIGRAHWARSEEHTGQDRKRTQ,28
1058125,1058125,SEEHTSELQSRSYLVSRLLLEKKKEG,26
1058126,1058126,SSLIASSAYQKWHTWSSRFRGAAQRSSRAVLPI,34
1058127,1058127,ASPVLLTKNGTLGALDSVARLNGAAAPSYLFKV,34


In [8]:
#0-200000
#200000-579590
#579590-1159181
#df1 = df.iloc[0:100000, :]
#df1 = df.iloc[100000:200000, :]
# df1 = df.iloc[200000:300000, :]
# df1 = df.iloc[300000:400000, :]
# df1 = df.iloc[400000:500000, :]
# df1 = df.iloc[500000:600000, :]
# df1 = df.iloc[600000:700000, :]
# df1 = df.iloc[700000:800000, :]
# df1 = df.iloc[800000:900000, :]
# df1 = df.iloc[900000:1000000, :]
df1 = df.iloc[1000000:1159181, :]

#df2 = df.iloc[:, :]
df1

Unnamed: 0,ID,Sequence,length
1007174,1007174,PVEGGGPVATEGGDERDSPQSRVAWECSPKR,31
1007175,1007175,PLRAALPSNPTLRRVPLVAALRRYGPPTL,29
1007176,1007176,RRGLFRSWGCLLLKSNGGARRLANPGRTSGG,31
1007177,1007177,HRSFQYQMMPLNILWGTNLPFERLSP,27
1007178,1007178,NRDSWGHPYLIVRGEILGSIKDELMR,26
...,...,...,...
1058124,1058124,AGHTEQIGRAHWARSEEHTGQDRKRTQ,28
1058125,1058125,SEEHTSELQSRSYLVSRLLLEKKKEG,26
1058126,1058126,SSLIASSAYQKWHTWSSRFRGAAQRSSRAVLPI,34
1058127,1058127,ASPVLLTKNGTLGALDSVARLNGAAAPSYLFKV,34


In [9]:
_h_avg, h_final, c_final= get_reps(df1['Sequence'].to_list())

In [10]:
df1.drop(columns=['Sequence', 'length'], inplace=True)
df1['reps']=_h_avg.tolist()
df1.to_pickle("/mnt/vdb/phase3/TaiwaneseOolong.len20.1000000_1159181.pkl")
df1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,reps
1007174,1007174,"[0.023784521967172623, -0.08420483767986298, -..."
1007175,1007175,"[0.023177096620202065, -0.027476267889142036, ..."
1007176,1007176,"[0.0200453232973814, -0.039769649505615234, -0..."
1007177,1007177,"[0.02572963386774063, -0.040419720113277435, -..."
1007178,1007178,"[0.024163803085684776, -0.09511740505695343, -..."
...,...,...
1058124,1058124,"[0.025300081819295883, -0.06294917315244675, -..."
1058125,1058125,"[0.03510252758860588, -0.06641663610935211, -0..."
1058126,1058126,"[0.019733581691980362, -0.03408155217766762, -..."
1058127,1058127,"[0.01968446560204029, -0.017883658409118652, 0..."


In [None]:
----- old method ----

In [13]:
_tmp_df.to_pickle("/mnt/vdb/phase3/TaiwaneseOolong.len20.100000_200000.pkl")

In [12]:
_tmp_df = df1
_tmp_df.drop(columns=['Sequence', 'length'], inplace=True)
_tmp_df['reps']=_h_avg.tolist()
_tmp_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ID,reps
100127,100127,"[0.023528747260570526, -0.03029148280620575, -..."
100128,100128,"[0.02823188528418541, -0.03292207047343254, -0..."
100129,100129,"[0.021869858726859093, -0.04351050406694412, -..."
100130,100130,"[0.020454389974474907, -0.041200652718544006, ..."
100131,100131,"[0.029424959793686867, -0.0632018893957138, -0..."
...,...,...
200357,200357,"[0.024071287363767624, -0.00011292310227872804..."
200358,200358,"[0.022646920755505562, -0.03028733655810356, -..."
200359,200359,"[0.02165237069129944, -0.0707235261797905, 0.0..."
200360,200360,"[0.019866550341248512, -0.02926749922335148, -..."


## CD-100

In [5]:
fastas = "/home/ubuntu/data/bk_fasta/SRR5868581_2.len20.assembly.cd100.fasta"
with open(fastas) as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    seqs = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        identifiers.append(seq_record.id)
        # Remove leading and trailing characters from a string
        seqs.append(str(seq_record.seq.strip('*')))
        lengths.append(len(seq_record.seq))
# dictionary of lists  
dict = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}  
df = pd.DataFrame(dict) 
#df["Sequence"] =  seqs
#df.to_pickle(plk)    
df  

Unnamed: 0,ID,Sequence,length
0,0,GRWNNAGKSEEHTSELQSRSDLVCRLLLEKKRGGGG,36
1,1,MHIGKIEEHTSELQSRSDLVCRHQLEKKNGGG,32
2,2,DVCSSDYPGFRFIPHRQVCLPKMAHLELSIPWRGSTEQPRRP,42
3,3,TTLGQTEEHTSELQSRSDLVCRLQLEKKKGG,31
4,4,YDGYPSSNTFKLTLTDLMSHSQFHSLN,28
...,...,...,...
954982,1058123,FFFFLLFSFFFLKKKKKKKKKKKKKKKNKR,30
954983,1058124,AGHTEQIGRAHWARSEEHTGQDRKRTQ,28
954984,1058125,SEEHTSELQSRSYLVSRLLLEKKKEG,26
954985,1058126,SSLIASSAYQKWHTWSSRFRGAAQRSSRAVLPI,34


In [6]:
# Setting my chunk size
chunk_size = 2000
# Assigning chunk numbers to rows
df['chunk'] = df['ID'].apply(lambda x: int(int(x)/ chunk_size))
# We don't want the 'chunk' and 'index' columns in the output
cols = [col for col in df.columns if col not in ['chunk']]
# groupby chunk and export each chunk to a different csv.
i = 0
for _, chunk in df.groupby('chunk'):
    chunk[cols].to_csv(f'/mnt/vdb/Taiwan/cd100/chunks/chunk{i}.csv', index=False)
    i += 1
print("complete")

complete


In [7]:
def createREPs(df, filename):
    _h_avg, h_final, c_final= get_reps(df['Sequence'].to_list())
    df.drop(columns=['Sequence'], inplace=True)
    df['reps']=_h_avg.tolist() # if there is a problem , might be here , possible solution is reindexing
    df.to_pickle(filename) # dont forget to change the file name to 0_613834

In [8]:
import glob
appended_reps = []
for infile in glob.glob("/mnt/vdb/Taiwan/cd100/reps/*.pkl"):
    # print(infile)
    appended_reps.append(infile)
print(len(appended_reps))

0


In [9]:
for infile in glob.glob("/mnt/vdb/Taiwan/cd100/chunks/*.csv"):
    #print("Read:"+infile)
    file_name = os.path.basename(infile)
    result="/mnt/vdb/Taiwan/cd100/reps/"+file_name.replace("csv", "pkl")
    if result in appended_reps :
        # print("found then skip : " , result)
        continue
    else:
        df = pd.read_csv(infile) 
        createREPs(df,result)
        print("Save:"+result)
        reset_device_memory()
        clear_jax_caches()
print("complete")

Save:/mnt/vdb/Taiwan/cd100/reps/chunk470.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk124.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk50.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk155.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk214.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk487.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk10.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk396.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk251.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk352.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk518.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk171.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk415.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk479.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk349.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk395.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk130.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk511.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk300.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk484.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk147.pkl
Save:/mnt/vdb/Taiwan/cd100/reps/chunk135.pkl
Save:/mnt/vd