In [6]:
import jax
# Global flag to set a specific platform, must be used at startup.
# (Currently disabled; uncommenting the next line would select the 'cpu' platform.)
#jax.config.update('jax_platform_name', 'cpu')

import os

# Configure the XLA client through environment variables.
# NOTE(review): these are assigned after `import jax`. Presumably they are
# still honoured because no JAX computation has run yet, but this cell has to
# be executed before any other JAX work -- TODO confirm, or move the
# assignments above the import.
# PREALLOCATE='False': presumably turns off up-front GPU memory preallocation.
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE']='False'
# MEM_FRACTION='1': presumably the fraction of device memory the client may use.
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION']='1'
# ALLOCATOR='platform': presumably allocate/free on demand -- verify against the JAX docs.
os.environ['XLA_PYTHON_CLIENT_ALLOCATOR']='platform'


In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from jax_unirep import get_reps
  
from jax_unirep import evotune, fit
from jax_unirep.utils import dump_params

In [2]:
import gc
import jax 
def clear_jax_caches():
  """Clear a fixed list of JAX-internal function caches.

  Calls ``.clear()`` / ``.cache_clear()`` on private attributes under
  ``jax.xla``, ``jax.pxla`` and ``jax.lax``. Takes no arguments and returns
  None.

  NOTE(review): every attribute used here is a private implementation detail
  of JAX; presumably these lookups only resolve on the JAX release this
  notebook was written against -- confirm before running on another version.
  """
  # main jit/pmap lu wrapped function caches - have to grab from closures
  # (assumes closure cell 1 holds the cache container -- TODO confirm)
  jax.xla._xla_callable.__closure__[1].cell_contents.clear()
  jax.pxla.parallel_callable.__closure__[1].cell_contents.clear()
  # primitive callable caches
  jax.xla.xla_primitive_callable.cache_clear()
  jax.xla.primitive_computation.cache_clear()
  # jaxpr caches for control flow and reductions
  jax.lax.lax_control_flow._initial_style_jaxpr.cache_clear()
  jax.lax.lax_control_flow._fori_body_fun.cache_clear()
  jax.lax.lax._reduction_jaxpr.cache_clear()
  # these are trivial and only included for completeness sake
  jax.lax.lax.broadcast_shapes.cache_clear()
  jax.xla.xb.get_backend.cache_clear()
  jax.xla.xb.dtype_to_etype.cache_clear()
  jax.xla.xb.supported_numpy_dtypes.cache_clear()
    
def reset_device_memory(delete_objs=True):
    """Free the device buffers of all live, GC-tracked DeviceArrays.

    Scans ``gc.get_objects()`` for ``jax.xla.DeviceArray`` instances and, for
    every one that is not a ``jax.xla.DeviceConstant``, deletes its
    ``device_buffer``. The cleanup is best effort: an array for which
    ``_check_if_deleted()`` or ``device_buffer.delete()`` raises an ordinary
    exception is skipped and not counted.

    Args:
      delete_objs: bool: when true, drop this function's own loop reference to
        each array after handling it. References held elsewhere (for example
        notebook variables) are not affected.

    Returns:
      number of DeviceArrays whose buffers were manually freed.
    """
    dvals = (x for x in gc.get_objects() if isinstance(x, jax.xla.DeviceArray))
    n_deleted = 0
    for dv in dvals:
        if not isinstance(dv, jax.xla.DeviceConstant):
            try:
                dv._check_if_deleted()  # pylint: disable=protected-access
                dv.device_buffer.delete()
            except Exception:
                # Best effort: skip arrays that cannot be freed. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so the cell can still be interrupted.
                pass
            else:
                n_deleted += 1
        if delete_objs:
            del dv
    del dvals
    gc.collect()
    return n_deleted

In [3]:
def mergeDF(path,to_dir,file_name):
    """Concatenate pickled DataFrames matching a glob pattern and save the result.

    Args:
      path: glob pattern selecting the pickled DataFrames to merge.
      to_dir: directory the merged pickle is written to.
      file_name: name of the merged pickle inside ``to_dir``.

    Returns:
      The concatenated DataFrame, sorted by its 'ID' column.

    Raises:
      ValueError: if no file matches ``path``.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having imported these modules first.
    import glob
    import os

    # glob returns matches in arbitrary order; sort for a reproducible run.
    infiles = sorted(glob.glob(path))
    if not infiles:
        raise ValueError(f"No files match {path!r}; nothing to merge")
    # NOTE: unpickling can execute arbitrary code -- only point this at files
    # produced by this notebook.
    frames = [pd.read_pickle(infile) for infile in infiles]
    result_path = os.path.join(to_dir, file_name)
    print("Save:",result_path)
    merged = pd.concat(frames).sort_values(by=['ID'])
    merged.to_pickle(result_path)
    return merged

In [4]:
def createREPs(df, filename):
    """Add a 'reps' column computed by ``get_reps`` to ``df`` and pickle it.

    The 'Sequence' column is handed to ``get_reps`` as a list. Of the three
    values it returns, the first is converted to nested lists and stored in
    the new 'reps' column. ``df`` is modified in place, written to
    ``filename`` with ``to_pickle``, and nothing is returned.
    """
    sequences = df['Sequence'].to_list()
    avg_hidden, _final_hidden, _final_cell = get_reps(sequences)
    # The 'Sequence' column is kept in the saved frame.
    # Assigning a plain list is positional, so the frame's index plays no role here.
    df['reps'] = avg_hidden.tolist()
    df.to_pickle(filename)

In [3]:
# Path of the input CSV read by the next cell.
# The commented-out assignment is an earlier alternative kept for reference.
#db_path="AMPS_NonAMPs.ready.csv"
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
db_path="/mnt/vdb/thesis/AMP_NonAMPs.V5_C08_sim60.csv"

In [4]:
# Load the sequence table from db_path; the first row is the header.
df = pd.read_csv(
    db_path,
    sep=',',
    header=0,
    quoting=csv.QUOTE_ALL,
)
df  # class 0 = AMPs, class 1 = NonAMPs

Unnamed: 0,ID,Sequence,length,class
0,UPI0006248D48,MTQNVKMGYIRFVVVGMIPWDGQTRTGARAARGQGQGGILKLGAYS...,138,0
1,UPI000195AE9F,MAEEAFDLWNECAKACVLDLKDGVRSSRMSVDPAIADTNGQGVLHY...,460,0
2,UPI000195AE9E,MAEEAFDLWNECAKACVLDLKDGVRSSRMSVDPAIADTNGQGVLHY...,539,0
3,UPI00085D47CF,PLIYLRLLRGQFAGGLRCMCIKWWSGKHPK,30,0
4,UPI00085E5E6F,ARKKAAKAARKKAAKAGGLRCMCIKWWSGKHPK,33,0
...,...,...,...,...
137384,UniRef50_U1L934,MADKQQVDFIRLPSGHRRYLATSIDSFTNAPAEVTR,36,1
137385,UniRef50_A0A6C2WUR6,MPITKHEIPLLECDSNPSAVIIPTHEGLQLSVSMLS,36,1
137386,UniRef50_A0A0H6DSM0,MGYPSMAAALHAAALNIALNIQLNISMRAMLLAFLE,36,1
137387,UniRef50_D3EJ07,MYRKQEKKLAILCFIVAAIMLYGAVKGYLRYMHHFG,36,1


In [5]:
# Rows per chunk file; every chunk CSV is embedded on its own later on.
chunk_size = 500
# Directory the chunk CSVs are written to.  # <<-- change this line
out_dir = '/mnt/vdb/thesis/jax'
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)
# Number rows by position instead of by index label: this keeps every chunk
# at most chunk_size rows even if the index is not 0..n-1, and it avoids
# adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if df came from an earlier run).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print("complete")

complete


In [7]:
import glob
# Snapshot of the representation pickles already present in the output
# directory; the embedding loop uses it to skip finished chunks.
appended_reps = list(glob.glob("/mnt/vdb/thesis/jax/*.pkl")) # <<-- change this line
print(len(appended_reps))

0


In [None]:
# Embed every chunk CSV that has no representation pickle yet.
out_dir = "/mnt/vdb/thesis/jax" # <<-- change this line
# appended_reps is the list of existing pickles collected in the previous cell.
already_done = set(appended_reps)
# Sorted so the processing order is reproducible (glob order is arbitrary).
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension; str.replace("csv", "pkl") would also rewrite
    # any other "csv" inside the file name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Use a separate name so the notebook-level df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("complete")

Save:/mnt/vdb/thesis/jax/chunk124.pkl
Save:/mnt/vdb/thesis/jax/chunk50.pkl
Save:/mnt/vdb/thesis/jax/chunk155.pkl
Save:/mnt/vdb/thesis/jax/chunk214.pkl
Save:/mnt/vdb/thesis/jax/chunk10.pkl
Save:/mnt/vdb/thesis/jax/chunk251.pkl
Save:/mnt/vdb/thesis/jax/chunk171.pkl
Save:/mnt/vdb/thesis/jax/chunk130.pkl
Save:/mnt/vdb/thesis/jax/chunk147.pkl
Save:/mnt/vdb/thesis/jax/chunk135.pkl
Save:/mnt/vdb/thesis/jax/chunk245.pkl
Save:/mnt/vdb/thesis/jax/chunk73.pkl
Save:/mnt/vdb/thesis/jax/chunk123.pkl
Save:/mnt/vdb/thesis/jax/chunk133.pkl
Save:/mnt/vdb/thesis/jax/chunk49.pkl
Save:/mnt/vdb/thesis/jax/chunk168.pkl
Save:/mnt/vdb/thesis/jax/chunk175.pkl
Save:/mnt/vdb/thesis/jax/chunk17.pkl
Save:/mnt/vdb/thesis/jax/chunk150.pkl
Save:/mnt/vdb/thesis/jax/chunk264.pkl
Save:/mnt/vdb/thesis/jax/chunk151.pkl
Save:/mnt/vdb/thesis/jax/chunk102.pkl
Save:/mnt/vdb/thesis/jax/chunk36.pkl
Save:/mnt/vdb/thesis/jax/chunk131.pkl
Save:/mnt/vdb/thesis/jax/chunk200.pkl
Save:/mnt/vdb/thesis/jax/chunk97.pkl
Save:/mnt/vdb/thesi

In [11]:
# Same cleanup the embedding loop runs after each chunk: delete live device
# buffers first, then clear the JAX caches.
reset_device_memory()
clear_jax_caches()

In [13]:
# Merge every per-chunk pickle into one frame and write it next to the chunks.
final_df = mergeDF(
    "/mnt/vdb/thesis/jax/*.pkl",  # <<-- change this line
    "/mnt/vdb/thesis/jax",  # <<-- change this line
    "AMPNonAMP.V5_C08_sim60.reps",
)
final_df

Save: /mnt/vdb/thesis/jax/AMPNonAMP.V5_C08_sim60.reps


Unnamed: 0,ID,length,class,reps
210,0_antitbpred|antitbpred,33,0,"[0.020596183836460114, 0.05145370587706566, 0...."
371,0_peptideDB.anti|peptideDB.anti,148,0,"[0.006680206395685673, -0.09558607637882233, 0..."
420,1000_pos_train_ds3|pos_train_ds3,86,0,"[0.010705526918172836, 0.00504455529153347, 0...."
168,10023_dbaasp|dbaasp_peptides,36,0,"[0.002072900300845504, -0.07690200954675674, -..."
344,"1003,1011,1019,1027,1035|CancerPPD_l_natural",20,0,"[0.02989775314927101, -0.004465700127184391, -..."
...,...,...,...,...
471,tagenome__1003787_1003787.scaffolds.fasta_scaf...,57,0,"[0.013467248529195786, -0.135259211063385, 0.0..."
447,tagenome__1003787_1003787.scaffolds.fasta_scaf...,47,0,"[0.01930670626461506, -0.030069854110479355, -..."
5,tagenome__1003787_1003787.scaffolds.fasta_scaf...,60,0,"[0.008607540279626846, -0.05736019089818001, 0..."
477,tagenome__1003787_1003787.scaffolds.fasta_scaf...,58,0,"[0.014597401954233646, -0.13994131982326508, 0..."


In [14]:
# Show only the rows labelled class 1 (NonAMPs, per the label note where the CSV is loaded).
final_df[final_df["class"] == 1]

Unnamed: 0,ID,length,class,reps
84,UniRef50_A0A009F8Z1,130,1,"[0.006538190878927708, -0.03369353711605072, 0..."
257,UniRef50_A0A009FV03,37,1,"[0.015707001090049744, -0.05971147492527962, 0..."
285,UniRef50_A0A009GB73,239,1,"[0.009423937648534775, 0.0019337327685207129, ..."
289,UniRef50_A0A009I9E9,99,1,"[0.007063737139105797, -0.0181045513600111, 0...."
402,UniRef50_A0A009IQJ3,210,1,"[9.893722017295659e-05, -0.02460651658475399, ..."
...,...,...,...,...
300,UniRef50_Z9JK72,160,1,"[0.010461339727044106, 0.01646239496767521, 0...."
227,UniRef50_Z9JTC3,1147,1,"[0.011528722010552883, -0.09852015227079391, 0..."
22,UniRef50_Z9JUS5,131,1,"[0.007241042796522379, -0.028920277953147888, ..."
25,UniRef50_Z9JXB0,78,1,"[0.012649418786168098, -0.024078544229269028, ..."


# Training set 2

In [7]:
# set 9
import glob

db_path = "/mnt/vdb/thesis/trainingset2/set9/AMP_NonAMPs.csv"
# Single output directory for chunk CSVs and pickles.  # <<-- change this line
out_dir = "/mnt/vdb/thesis/trainingset2/set9/jax"
# Rows per chunk file; every chunk CSV is embedded on its own.
chunk_size = 500

df=pd.read_csv(db_path,sep=',',header=0,quoting=csv.QUOTE_ALL)
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

# Number rows by position instead of by index label: keeps every chunk at most
# chunk_size rows and avoids adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if df came from an earlier run).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print("complete")

# Snapshot of the pickles that already exist, so finished chunks are skipped.
appended_reps = glob.glob(os.path.join(out_dir, "*.pkl"))
print(len(appended_reps))

already_done = set(appended_reps)
# NOTE: the glob also picks up chunk CSVs left in out_dir by earlier runs.
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension instead of replacing every "csv" in the name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Separate name so the full dataset in df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("complete")


complete
0
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk124.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk50.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk10.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk130.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk135.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk73.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk123.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk133.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk49.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk17.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk102.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk36.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk131.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk97.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk122.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk30.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chunk55.pkl
Save:/mnt/vdb/thesis/trainingset2/set9/jax/chu

In [None]:
# Merge the per-chunk pickles of set 9 into one frame and save it.
final_df = mergeDF(
    "/mnt/vdb/thesis/trainingset2/set9/jax/*.pkl",  # <<-- change this line
    "/mnt/vdb/thesis/trainingset2/set9/jax",  # <<-- change this line
    "AMPNonAMP.reps",
)
final_df

Save: /mnt/vdb/thesis/trainingset2/set9/jax/AMPNonAMP.reps


In [6]:
# Set 1
import glob

src_path = "../../datasets/thesis/trainingset2/set1/final_balance_set_1.pkl"
# Single output directory for chunk CSVs, pickles and the merged file.  # <<-- change this line
out_dir = "/mnt/vdb/thesis/trainingset2/set1/jax"
# Rows per chunk file; every chunk CSV is embedded on its own.
chunk_size = 500

df=pd.read_pickle(src_path)
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

# Number rows by position instead of by index label: a pickled frame may carry
# a non-default index, and positional numbering keeps every chunk at most
# chunk_size rows without adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if the frame was saved with one).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print(" chunks complete")

# Snapshot of the pickles that already exist, so finished chunks are skipped.
appended_reps = glob.glob(os.path.join(out_dir, "*.pkl"))
print(len(appended_reps))

already_done = set(appended_reps)
# NOTE: the glob also picks up chunk CSVs left in out_dir by earlier runs.
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension instead of replacing every "csv" in the name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Separate name so the full dataset in df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("JAXUnirep complete")

# write result
final_df = mergeDF(os.path.join(out_dir, "*.pkl"), out_dir, "AMPNonAMP.reps")
final_df

 chunks complete
0
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk124.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk50.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk155.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk10.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk171.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk130.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk147.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk135.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk73.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk123.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk133.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk49.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk168.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk175.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk17.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk150.pkl
Save:/mnt/vdb/thesis/trainingset2/set1/jax/chunk151.pkl
Save:/mnt/vdb/thesis/trainingset2/

Unnamed: 0,ID,length,class,reps
270,"+4E_Peptide,V681_V13K,S4E,S11E,T15E,",26,0,"[0.013803755864501, -0.04868223890662193, 0.00..."
271,"+4S_Peptide,V681_V13K,K1S,K10S,K14S,",26,0,"[0.025307096540927887, -0.01878233812749386, -..."
272,",linocinCFP29_homolog,",253,0,"[0.007707999553531408, 0.06123709678649902, 0...."
279,"-10,A11,",10,0,"[0.06724510341882706, 0.021797023713588715, -0..."
280,"-11,A12,",9,0,"[0.07372649013996124, 0.02437029778957367, -0...."
...,...,...,...,...
295,"αs1-casein_f90_95,",6,0,"[0.08635857701301575, 0.0455104261636734, -0.0..."
298,"β-Casomorphins_5_f60_64,",5,0,"[0.10100759565830231, 0.035976558923721313, -0..."
297,"β126-145,Bovine_hemoglobin_beta_chain_126-145,",20,0,"[0.03266279771924019, -0.05012349784374237, -0..."
300,"∆1,2Mac,Maculatin_1.1.1,",19,0,"[0.03144867345690727, 0.002185762394219637, -0..."


In [10]:
# set 3
import glob

src_path = "../../datasets/thesis/trainingset2/set3/final_balance_set_3.pkl"
# Single output directory for chunk CSVs, pickles and the merged file.  # <<-- change this line
out_dir = "/mnt/vdb/thesis/trainingset2/set3/jax"
# Rows per chunk file; every chunk CSV is embedded on its own.
chunk_size = 500

df=pd.read_pickle(src_path)
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

# Number rows by position instead of by index label: a pickled frame may carry
# a non-default index, and positional numbering keeps every chunk at most
# chunk_size rows without adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if the frame was saved with one).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print(" chunks complete")

# Snapshot of the pickles that already exist, so finished chunks are skipped.
appended_reps = glob.glob(os.path.join(out_dir, "*.pkl"))
print(len(appended_reps))

already_done = set(appended_reps)
# NOTE: the glob also picks up chunk CSVs left in out_dir by earlier runs.
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension instead of replacing every "csv" in the name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Separate name so the full dataset in df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("JAXUnirep complete")

# write result
final_df = mergeDF(os.path.join(out_dir, "*.pkl"), out_dir, "AMPNonAMP.reps")
final_df

 chunks complete
0
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk10.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk8.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk5.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk6.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk2.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk3.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk7.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk9.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk1.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk0.pkl
Save:/mnt/vdb/thesis/trainingset2/set3/jax/chunk4.pkl
JAXUnirep complete
Save: /mnt/vdb/thesis/trainingset2/set3/jax/AMPNonAMP.reps


Unnamed: 0,ID,length,class,reps
57,0_antitbpred_pos|antitbpred_pos,33,0,"[0.021885784342885017, 0.06677407771348953, 0...."
377,1006_amp_otherthan_antifungal,86,0,"[0.004164760932326317, -0.014360113069415092, ..."
411,1009_amp_otherthan_antifungal,86,0,"[0.009300129488110542, -0.006003316026180983, ..."
25,1012_amp_otherthan_antifungal,86,0,"[0.008035180158913136, -0.008067233487963676, ..."
30,1013_amp_otherthan_antifungal,86,0,"[0.010771277360618114, -0.013503030873835087, ..."
...,...,...,...,...
141,"r8-BadBH3,",34,0,"[0.019744208082556725, -0.04166042432188988, -..."
142,"rChemokine_CK11,",86,0,"[0.010031205601990223, -0.07044699788093567, 0..."
143,"rHispidalin,",61,0,"[0.012876027263700962, -0.08553948998451233, -..."
144,"rcrustinPm4-1,",251,0,"[0.003176897531375289, -0.03680146858096123, -..."


In [13]:
# set 4
import glob

src_path = "../../datasets/thesis/trainingset2/set4/final_balance_set_4.pkl"
# Single output directory for chunk CSVs, pickles and the merged file.  # <<-- change this line
out_dir = "/mnt/vdb/thesis/trainingset2/set4/jax"
# Rows per chunk file; every chunk CSV is embedded on its own.
chunk_size = 500

df=pd.read_pickle(src_path)
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

# Number rows by position instead of by index label: a pickled frame may carry
# a non-default index, and positional numbering keeps every chunk at most
# chunk_size rows without adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if the frame was saved with one).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print(" chunks complete")

# Snapshot of the pickles that already exist, so finished chunks are skipped.
appended_reps = glob.glob(os.path.join(out_dir, "*.pkl"))
print(len(appended_reps))

already_done = set(appended_reps)
# NOTE: the glob also picks up chunk CSVs left in out_dir by earlier runs.
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension instead of replacing every "csv" in the name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Separate name so the full dataset in df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("JAXUnirep complete")

# write result
final_df = mergeDF(os.path.join(out_dir, "*.pkl"), out_dir, "AMPNonAMP.reps")
final_df

 chunks complete
0
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk50.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk10.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk73.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk49.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk17.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk36.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk30.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk55.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk23.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk8.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk56.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk27.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk43.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk41.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk11.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk69.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chunk51.pkl
Save:/mnt/vdb/thesis/trainingset2/set4/jax/chun

Unnamed: 0,ID,length,class,reps
427,",linocinCFP29_homolog,",253,0,"[0.007708001881837845, 0.061237093061208725, 0..."
428,"-5_Peptide,V681_V13K,K1E,K3E,K7E,K10E,K14E,K22E,",26,0,"[0.02458917908370495, -0.03119553066790104, -0..."
238,02249|Teixobactin,11,0,"[0.05135253816843033, 0.01816537044942379, -0...."
239,03037|Pv-Def,45,0,"[0.01882702298462391, -0.06651552766561508, 0...."
240,03107|SpCrus2,147,0,"[0.007098822854459286, -0.07174162566661835, 0..."
...,...,...,...,...
423,"vCPP_2319,Torque_teno_douroucouli_vírus_Capsid...",20,0,"[0.0310472771525383, -0.020746728405356407, -0..."
424,"vif_121-135,",15,0,"[0.03935248777270317, -0.032293274998664856, -..."
425,"vif_149-163,",15,0,"[0.03780689835548401, -0.00153274554759264, -0..."
426,"vif_157-171,",15,0,"[0.054227832704782486, -0.04932979494333267, -..."


In [None]:
# Benchmark set
import glob

src_path = "../../datasets/thesis/trainingset2/benchmark_set/benchmark_set_1777.pkl"
# Single output directory for chunk CSVs, pickles and the merged file.  # <<-- change this line
out_dir = "/mnt/vdb/thesis/trainingset2/benchmark_set/jax"
# Rows per chunk file; every chunk CSV is embedded on its own.
chunk_size = 500

df=pd.read_pickle(src_path)
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

# Number rows by position instead of by index label: a pickled frame may carry
# a non-default index, and positional numbering keeps every chunk at most
# chunk_size rows without adding a helper column to df.
chunk_ids = np.arange(len(df)) // chunk_size
# Never export a helper 'chunk' column (present if the frame was saved with one).
cols = [col for col in df.columns if col not in ['chunk']]
# Group by chunk number and export each chunk to its own CSV.
for i, chunk in df.groupby(chunk_ids):
    chunk[cols].to_csv(os.path.join(out_dir, f'chunk{i}.csv'), sep=",", quotechar='"', index=False, quoting=csv.QUOTE_ALL)
print(" chunks complete")

# Snapshot of the pickles that already exist, so finished chunks are skipped.
appended_reps = glob.glob(os.path.join(out_dir, "*.pkl"))
print(len(appended_reps))

already_done = set(appended_reps)
# NOTE: the glob also picks up chunk CSVs left in out_dir by earlier runs.
for infile in sorted(glob.glob(os.path.join(out_dir, "*.csv"))):
    # Swap only the extension instead of replacing every "csv" in the name.
    stem, _ext = os.path.splitext(os.path.basename(infile))
    result = os.path.join(out_dir, stem + ".pkl")
    if result in already_done:
        continue
    # Separate name so the full dataset in df is not overwritten.
    chunk_df = pd.read_csv(infile)
    createREPs(chunk_df, result)
    print("Save:"+result)
    # Release device memory and JAX caches after each chunk.
    reset_device_memory()
    clear_jax_caches()
print("JAXUnirep complete")

# write result
final_df = mergeDF(os.path.join(out_dir, "*.pkl"), out_dir, "AMPNonAMP.benchmark_set.reps")
final_df

In [9]:

# Merge the per-chunk pickles of the benchmark set into one frame and save it.
final_df = mergeDF(
    "/mnt/vdb/thesis/trainingset2/benchmark_set/jax/*.pkl",  # <<-- change this line
    "/mnt/vdb/thesis/trainingset2/benchmark_set/jax",  # <<-- change this line
    "AMPNonAMP.benchmark_set.reps",
)
final_df

Save: /mnt/vdb/thesis/trainingset2/benchmark_set/jax/AMPNonAMP.benchmark_set.reps


Unnamed: 0,ID,length,class,reps
419,AP00001,33,0,"[0.025133414193987846, -0.04609730467200279, 0..."
394,AP00002,34,0,"[0.02185128442943096, -0.04255806654691696, -0..."
212,AP00004,49,0,"[0.015953853726387024, -0.12411346286535263, 0..."
166,AP00005,34,0,"[0.024810995906591415, -0.021290557458996773, ..."
149,AP00006,18,0,"[0.03526247665286064, -0.06658440828323364, -0..."
...,...,...,...,...
40,UniRef50_Q9ZNV8,24,1,"[0.02870078943669796, -0.14460773766040802, 0...."
44,UniRef50_Q9ZSK2,24,1,"[0.026792025193572044, -0.04943835735321045, 0..."
152,UniRef50_Q9ZTK5,33,1,"[0.021517498418688774, -0.06481127440929413, 0..."
431,UniRef50_Q9ZVC2,29,1,"[0.03130599111318588, -0.07680534571409225, -0..."


In [7]:
# Reload the benchmark source frame and look up a single entry by its ID.
# NOTE: unpickling can execute arbitrary code -- only load files you produced.
test = pd.read_pickle("../../datasets/thesis/trainingset2/benchmark_set/benchmark_set_1777.pkl")
test[test["ID"] == "AP00001"]

Unnamed: 0,ID,Sequence,length,class
919,AP00001,GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV,33,0


In [8]:
# Embed one sequence on its own (the 'Sequence' value shown for AP00001 above),
# presumably as a spot check against the 'reps' stored for that ID in the merged frame.
_h_avg, h_final, c_final= get_reps("GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV")

In [9]:
# Display the first of the three values returned by get_reps for that sequence.
_h_avg

array([[ 0.02513341, -0.04609731,  0.00243294, ...,  0.01681087,
         0.22015527,  0.13304605]], dtype=float32)

## Split train and test set

In [None]:
# Convert the list-valued 'reps' column into one column per element and keep
# 'class', 'ID' and 'length' alongside.
# final_df['reps'] is read, not popped: pop() removed the column from final_df,
# which broke the later cell that builds X from final_df['reps'].
# Building the frame from the list of rows replaces the much slower
# .apply(pd.Series) and yields the same integer-named columns.
df = pd.concat(
    [
        pd.DataFrame(final_df['reps'].to_list(), index=final_df.index),
        final_df['class'],
        final_df['ID'],
        final_df['length'],
    ],
    axis=1,
)
df

In [12]:
# Feature matrix built from the 'reps' lists, and the matching 'class' labels.
X = np.array(final_df['reps'].tolist())
y = np.array(final_df['class'].tolist())

In [13]:
# Shape check on the feature matrix: (number of rows, number of features).
X.shape

(254036, 1900)

In [15]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Holds out 20% of the rows for testing; random_state=1 fixes the shuffle.
# NOTE(review): the statement above starts with a stray leading space. It
# presumably runs only because the notebook front end strips it -- as a plain
# Python script it is an IndentationError. TODO remove the space.

# Optional validation split, currently disabled:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

# Train a toy model (random forest)

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the representation features.
# n_jobs=-1 builds the trees on all available cores; the recorded
# single-threaded run of this cell ended in a KeyboardInterrupt. With a fixed
# random_state the fitted forest does not depend on n_jobs.
classifier = RandomForestClassifier(random_state=0, n_jobs=-1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

KeyboardInterrupt: 

In [None]:
# Evaluate the held-out predictions: per-class report, then overall accuracy.
# (confusion_matrix is imported here but not used in this cell.)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Display names for the two labels, in label order (0 = AMPs, 1 = NonAMPs).
class_names = ['AMPs', 'NonAMPs']

# plot_confusion_matrix was deprecated and later removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement. Fall back to the
# old helper on releases that predate from_estimator.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    disp = ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test,
                                                 display_labels = class_names,
                                                 cmap=plt.cm.Blues, xticks_rotation='vertical')
else:
    from sklearn.metrics import plot_confusion_matrix
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 display_labels = class_names,
                                 cmap=plt.cm.Blues, xticks_rotation='vertical')

disp.ax_.set_title(" Confusion Matrix")

print(disp.confusion_matrix)
plt.grid(False)
plt.show()