# Examen d'un modèle Basenji

In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [2]:
from basenji import dataset
from basenji import seqnn
from basenji import trainer

In [4]:
# Parameter file located under the basenji manuscripts/akita directory.
params_file = (
    "/home/bureau/projects/def-bureau/basenji"
    "/manuscripts/akita/params.json"
)
# Training output directory; the trailing slash matters because the file name
# is appended to it directly below.
model_dir = (
    "/home/bureau/projects/def-bureau/bureau/distiller"
    "/iPSC/data/1m/train_out/"
)
# Checkpoint file inside the training output directory.
model_file = f"{model_dir}model_best.h5"

In [5]:
# Statistics file of the iPSC data set.
# Alternative kept for reference (Neu10000 results):
#   /home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/statistics.json
data_stats_file = (
    "/home/bureau/projects/def-bureau/bureau/distiller"
    "/iPSC/data/1m/statistics.json"
)

In [6]:
# Glob patterns for the iPSC TFRecord files (training and validation splits).
# Alternatives kept for reference (Neu10000 results):
#   /home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/train-*.tfr
#   /home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/valid-*.tfr
tfr_train_full = (
    "/home/bureau/projects/def-bureau/bureau/distiller"
    "/iPSC/data/1m/tfrecords/train-*.tfr"
)
tfr_eval_full = (
    "/home/bureau/projects/def-bureau/bureau/distiller"
    "/iPSC/data/1m/tfrecords/valid-*.tfr"
)

## Chargement des paramètres du modèle

In [5]:
# Read the JSON parameter file and pull out its model and training sections.
with open(params_file) as params_open:
    params = json.load(params_open)
params_model, params_train = params['model'], params['train']
# Optional switch kept for reference: turn batch normalization off.
#params_model['batch_norm'] = False
# Set the last 'head_hic' entry to a single output unit. params_model is the
# same dict object as params['model'], so this change is visible through
# params as well.
params_model['head_hic'][-1].update(units=1)

## Chargement des statistiques des données

In [8]:
# Parse the dataset statistics JSON into a dict and show it as the cell output.
with open(data_stats_file) as data_stats_open:
    data_stats = json.loads(data_stats_open.read())
data_stats

{'num_targets': 1,
 'train_seqs': 7617,
 'valid_seqs': 6676,
 'test_seqs': 6667,
 'seq_length': 1048576,
 'pool_width': 2048,
 'crop_bp': 65536,
 'diagonal_offset': 2,
 'target_length': 99681}

## Chargement des données

In [9]:
# Positional arguments shared by both datasets, in the order the original
# calls pass them: batch size, sequence length, target length.
seq_dataset_args = (
    params_train['batch_size'],
    data_stats['seq_length'],
    data_stats['target_length'],
)
# Training split first, then the validation split (same order as before).
train_data = dataset.SeqDataset(
    tfr_train_full, *seq_dataset_args, tf.estimator.ModeKeys.TRAIN)
eval_data = dataset.SeqDataset(
    tfr_eval_full, *seq_dataset_args, tf.estimator.ModeKeys.EVAL)


/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/train-*.tfr has 7617 sequences with 1/1 targets
/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/valid-*.tfr has 6676 sequences with 1/1 targets


## Initialisation du modèle

In [6]:
# Build the SeqNN model from the model section of the parameter file
# (params_model, whose last 'head_hic' entry was set to 1 unit earlier).
# The captured output below shows a layer summary for the resulting model.
seqnn_model = seqnn.SeqNN(params_model)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence (InputLayer)           [(None, 1048576, 4)] 0                                            
__________________________________________________________________________________________________
stochastic_reverse_complement ( ((None, 1048576, 4), 0           sequence[0][0]                   
__________________________________________________________________________________________________
stochastic_shift (StochasticShi (None, 1048576, 4)   0           stochastic_reverse_complement[0][
__________________________________________________________________________________________________
re_lu (ReLU)                    (None, 1048576, 4)   0           stochastic_shift[0][0]           
____________________________________________________________________________________________

In [11]:
# Show every attribute of the second-to-last layer of the first model (the
# captured output below identifies it as the layer named 'dense').
# A negative index replaces the original len(...) - 2 arithmetic, which would
# silently select the only layer instead of failing on a one-layer model.
vars(seqnn_model.models[0].layers[-2])

{'_self_setattr_tracking': True,
 '_instrumented_keras_api': True,
 '_instrumented_keras_layer_class': True,
 '_instrumented_keras_model_class': False,
 '_trainable': True,
 '_stateful': False,
 '_build_input_shape': TensorShape([None, 99681, 48]),
 '_saved_model_inputs_spec': None,
 '_input_spec': InputSpec(min_ndim=2, axes={-1: 48}),
 '_name': 'dense',
 '_activity_regularizer': None,
 '_trainable_weights': [<tf.Variable 'dense/kernel:0' shape=(48, 1) dtype=float32, numpy=
  array([[ 0.06079762],
         [ 0.18818447],
         [ 0.08463026],
         [-0.20812461],
         [ 0.45833126],
         [-0.04626504],
         [-0.27674848],
         [-0.0827073 ],
         [-0.10453969],
         [-0.10169736],
         [ 0.07369819],
         [-0.06826399],
         [ 0.06612451],
         [ 0.12863523],
         [-0.07848687],
         [-0.18903378],
         [-0.17393164],
         [-0.17700341],
         [-0.01280877],
         [ 0.02550466],
         [ 0.00919479],
         [ 0.2697

In [7]:
# Trainable weights of the second-to-last layer of the first model, inspected
# before seqnn_model.restore(...) is called further down (presumably the
# freshly initialized values; confirm). Negative indexing replaces the
# original len(...) - 2 arithmetic, which would silently select the only
# layer instead of failing on a one-layer model.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'dense/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[ 0.02587332],
        [ 0.28899696],
        [ 0.2743979 ],
        [-0.0704336 ],
        [-0.08833429],
        [ 0.02690854],
        [ 0.21806619],
        [ 0.28564113],
        [-0.2020018 ],
        [-0.10664453],
        [-0.05862586],
        [-0.17276472],
        [ 0.17987098],
        [-0.01123997],
        [ 0.10424743],
        [ 0.01360302],
        [-0.27034143],
        [ 0.43729335],
        [-0.0281564 ],
        [ 0.33010992],
        [-0.12836683],
        [-0.11615153],
        [ 0.19048639],
        [ 0.16660239],
        [-0.07904723],
        [-0.03235196],
        [ 0.32403794],
        [-0.21663392],
        [ 0.00646565],
        [-0.3462613 ],
        [-0.3046713 ],
        [ 0.0690897 ],
        [ 0.04827685],
        [ 0.13917689],
        [ 0.15026335],
        [-0.27925265],
        [-0.07275875],
        [-0.3747679 ],
        [-0.06620637],
        [ 0.06528403],
        [ 0.0

In [33]:
# Store the (modified) model section back under the 'model' key and render it.
params.update(model=params_model)
display(params['model'])

{'seq_length': 1048576,
 'target_length': 512,
 'target_crop': 32,
 'diagonal_offset': 2,
 'augment_rc': True,
 'augment_shift': 11,
 'activation': 'relu',
 'batch_norm': True,
 'bn_momentum': 0.9265,
 'trunk': [{'filters': 96, 'kernel_size': 11, 'pool_size': 2},
  {'filters_init': 96,
   'filters_mult': 1.0,
   'kernel_size': 5,
   'pool_size': 2,
   'repeat': 10},
  {'filters': 48, 'rate_mult': 1.75, 'repeat': 8, 'dropout': 0.4},
  {'filters': 64, 'kernel_size': 5}],
 'head_hic': [{'operation': 'mean'},
  {},
  {'filters': 48, 'kernel_size': 3},
  {},
  {'filters': 24,
   'kernel_size': 3,
   'rate_mult': 1.75,
   'repeat': 6,
   'dropout': 0.1},
  {'cropping': 32},
  {'diagonal_offset': 2},
  {'units': 1, 'activation': 'linear'}]}

In [34]:
# Write the parameter dictionary (including the modified model section) to a
# new JSON parameter file.
with open('/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/params_tutorial.json','w') as params_tutorial_file:
    params_tutorial_file.write(json.dumps(params))

In [8]:
# Restore the model from the checkpoint at model_file (model_best.h5 in the
# training output directory). The weights displayed in the next cell differ
# from those shown before this call.
seqnn_model.restore(model_file)

In [9]:
# Trainable weights of the second-to-last layer of the first model, inspected
# after seqnn_model.restore(model_file), for comparison with the values shown
# before the restore. Negative indexing replaces the original len(...) - 2
# arithmetic, which would silently select the only layer instead of failing
# on a one-layer model.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'dense/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[ 1.4580915 ],
        [ 0.23465092],
        [ 0.05835523],
        [-0.6239604 ],
        [ 0.34154844],
        [-0.16338602],
        [ 1.2967775 ],
        [-0.42430592],
        [ 1.2869276 ],
        [ 0.03741875],
        [ 0.76618195],
        [ 0.75870216],
        [ 0.19104275],
        [-0.7300541 ],
        [-0.6849625 ],
        [-0.01603766],
        [ 0.55600345],
        [-3.1726544 ],
        [-0.03922657],
        [-1.273954  ],
        [-0.11047602],
        [ 0.16599712],
        [ 0.08342776],
        [ 1.9507604 ],
        [ 0.16429499],
        [-0.30267197],
        [-1.0408251 ],
        [ 0.0262637 ],
        [ 0.36577496],
        [ 0.19783096],
        [-1.0037417 ],
        [-0.06746949],
        [-0.02523711],
        [ 0.0058255 ],
        [-1.6376753 ],
        [ 0.46741402],
        [-0.60394293],
        [-1.542865  ],
        [-0.31098443],
        [-0.9531344 ],
        [ 0.0

In [10]:
# Trainable weights of the layer at index 4 of the first model; the captured
# output below shows the 'conv1d/kernel:0' variable of shape (11, 4, 96).
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[ 0.09355764,  0.08427623,  0.12153016, ..., -0.15557124,
          -0.24185126, -0.08866961],
         [-0.39165616,  0.22918619,  0.17072426, ..., -0.23238999,
           0.1473212 ,  0.1674815 ],
         [ 0.23334314,  0.13728172,  0.15159912, ..., -0.14496087,
           0.22836079, -0.45533156],
         [ 0.04052   ,  0.47208974,  0.40900332, ...,  0.18603727,
           0.02523968,  0.31608298]],
 
        [[ 0.13254096,  0.3413415 ,  0.03291266, ..., -0.21221818,
           0.03376405,  0.26585445],
         [-0.08990127, -0.04370791,  0.20237586, ..., -0.1671591 ,
           0.27642018,  0.12923117],
         [ 0.34807304,  0.05613009, -0.01062612, ...,  0.32746392,
          -0.12286341, -0.11060675],
         [-0.20968954,  0.04918202,  0.04545894, ..., -0.07714995,
           0.40476233, -0.1825966 ]],
 
        [[-0.24926293,  0.38959837,  0.08361185, ...,  0.32659858,
          -0.27865896, 

In [17]:
# Model originally fitted on 5 cell lines.
model_orig_dir = (
    "/home/bureau/projects/def-bureau/bureau/basenji"
    "/manuscripts/akita/"
)
# The directory string ends with a slash, so the file name is appended as-is.
model_orig_file = f"{model_orig_dir}model_best.h5"

In [18]:
# As expected, restoring the original checkpoint fails and still needs to be
# fixed: the recorded error reports a (48, 5) value for 'dense/kernel:0',
# whereas this model's kernel has shape (48, 1).
# The expected ValueError is caught and printed so the cell demonstrates the
# problem without aborting a "Restart & Run All".
try:
    seqnn_model.restore(model_orig_file)
except ValueError as restore_error:
    # NOTE(review): weights assigned before the failure may remain loaded;
    # consider re-running seqnn_model.restore(model_file) before using the
    # model again. Confirm against the restore implementation.
    print(f"ValueError: {restore_error}")

ValueError: Cannot assign to variable dense/kernel:0 due to variable shape (48, 1) and value shape (48, 5) are incompatible