# Examen d'un modèle Basenji

In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [3]:
from basenji import dataset
from basenji import seqnn
from basenji import trainer

In [3]:
# JSON parameter file read by the parameter-loading cell (provides the
# 'model' and 'train' sections).
# NOTE(review): this path has no 'bureau/' component before 'basenji/', unlike
# the model_orig_dir path used further down in the notebook -- confirm that
# both locations exist.
params_file = "/home/bureau/projects/def-bureau/basenji/manuscripts/akita/params.json"
# Alternative parameter file (iPSC data directory), kept for reference:
#params_file = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/params1.json"

# Training output directory; the trailing slash is required because the
# weights file name is appended directly to it.
model_dir = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/train_out/"
model_file = f"{model_dir}model_best.h5"

In [5]:
# JSON file holding the dataset statistics (loaded into `data_stats` below).
# Previous location, kept for reference:
#data_stats_file = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/statistics.json"
data_stats_file = (
    "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/"
    "statistics.json"
)

In [6]:
# Glob patterns for the training and validation TFRecord files.
# Previous locations, kept for reference:
#tfr_train_full = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/train-*.tfr"
#tfr_eval_full = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/valid-*.tfr"
# Both patterns share the same directory and differ only in the split prefix.
tfr_train_full, tfr_eval_full = (
    "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/" + split + "-*.tfr"
    for split in ("train", "valid")
)

## Chargement des paramètres du modèle

In [4]:
# Read the JSON parameter file and pull out its 'model' and 'train' sections.
with open(params_file) as params_open:
    params = json.load(params_open)
params_model, params_train = params['model'], params['train']

# Override the last entry of the 'head_hic' layer list: a single output unit
# and a He-normal kernel initializer. params_model is the same object as
# params['model'], so this in-place change is visible through `params` too.
params_model['head_hic'][-1].update(units=1, kernel_initializer='he_normal')

In [8]:
# Display the 'head_hic' list of layer specifications as the cell output.
params_model['head_hic']

[{'name': 'one_to_two', 'operation': 'mean'},
 {'name': 'concat_dist_2d'},
 {'name': 'conv_block_2d', 'filters': 48, 'kernel_size': 3},
 {'name': 'symmetrize_2d'},
 {'name': 'dilated_residual_2d',
  'filters': 24,
  'kernel_size': 3,
  'rate_mult': 1.75,
  'repeat': 6,
  'dropout': 0.1},
 {'name': 'cropping_2d', 'cropping': 32},
 {'name': 'upper_tri', 'diagonal_offset': 2},
 {'name': 'final',
  'units': 1,
  'activation': 'linear',
  'kernel_initializer': 'he_normal'}]

## Chargement des statistiques des données

In [9]:
# Parse the dataset statistics JSON into a dict, then show it as the cell output.
with open(data_stats_file) as data_stats_open:
    data_stats = json.loads(data_stats_open.read())
data_stats

{'num_targets': 1,
 'train_seqs': 7617,
 'valid_seqs': 6676,
 'test_seqs': 6667,
 'seq_length': 1048576,
 'pool_width': 2048,
 'crop_bp': 65536,
 'diagonal_offset': 2,
 'target_length': 99681}

## Chargement des données

In [9]:
# Positional arguments shared by both datasets: the batch size from the
# training parameters, then the sequence and target lengths from the
# dataset statistics.
_seq_dataset_args = (
    params_train['batch_size'],
    data_stats['seq_length'],
    data_stats['target_length'],
)

# Same construction for both splits; only the TFRecord pattern and the mode differ.
train_data = dataset.SeqDataset(tfr_train_full, *_seq_dataset_args, tf.estimator.ModeKeys.TRAIN)
eval_data = dataset.SeqDataset(tfr_eval_full, *_seq_dataset_args, tf.estimator.ModeKeys.EVAL)


/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/train-*.tfr has 7617 sequences with 1/1 targets
/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/valid-*.tfr has 6676 sequences with 1/1 targets


## Initialisation du modèle

In [10]:
# Build the SeqNN model from the model parameters (the captured output below
# shows a Keras model summary printed during construction).
seqnn_model = seqnn.SeqNN(params_model)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence (InputLayer)           [(None, 1048576, 4)] 0                                            
__________________________________________________________________________________________________
stochastic_reverse_complement ( ((None, 1048576, 4), 0           sequence[0][0]                   
__________________________________________________________________________________________________
stochastic_shift (StochasticShi (None, 1048576, 4)   0           stochastic_reverse_complement[0][
__________________________________________________________________________________________________
re_lu (ReLU)                    (None, 1048576, 4)   0           stochastic_shift[0][0]           
____________________________________________________________________________________________

In [11]:
# Dump every attribute of the second-to-last layer of the first model (the
# captured output shows it is the layer named 'final'). A negative index
# replaces the manual len(...)-2 arithmetic, and vars() replaces direct
# access to __dict__.
vars(seqnn_model.models[0].layers[-2])

{'_self_setattr_tracking': True,
 '_instrumented_keras_api': True,
 '_instrumented_keras_layer_class': True,
 '_instrumented_keras_model_class': False,
 '_trainable': True,
 '_stateful': False,
 '_build_input_shape': TensorShape([None, 99681, 48]),
 '_saved_model_inputs_spec': None,
 '_input_spec': InputSpec(min_ndim=2, axes={-1: 48}),
 '_name': 'final',
 '_activity_regularizer': None,
 '_trainable_weights': [<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
  array([[-0.1495074 ],
         [-0.43212262],
         [ 0.07010241],
         [-0.20273937],
         [-0.00556026],
         [-0.25335294],
         [ 0.21087407],
         [ 0.05149443],
         [-0.157616  ],
         [-0.25510743],
         [-0.00843919],
         [-0.19865322],
         [ 0.13872993],
         [-0.0228241 ],
         [ 0.16513409],
         [-0.09377011],
         [-0.00490653],
         [-0.2851904 ],
         [-0.01204251],
         [-0.18476665],
         [ 0.15012386],
         [-0.2863

In [12]:
# Trainable weights of the second-to-last layer (named 'final' in the
# captured output). A negative index replaces the manual len(...)-2 arithmetic.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[-0.1495074 ],
        [-0.43212262],
        [ 0.07010241],
        [-0.20273937],
        [-0.00556026],
        [-0.25335294],
        [ 0.21087407],
        [ 0.05149443],
        [-0.157616  ],
        [-0.25510743],
        [-0.00843919],
        [-0.19865322],
        [ 0.13872993],
        [-0.0228241 ],
        [ 0.16513409],
        [-0.09377011],
        [-0.00490653],
        [-0.2851904 ],
        [-0.01204251],
        [-0.18476665],
        [ 0.15012386],
        [-0.28630257],
        [ 0.01459581],
        [ 0.00825187],
        [-0.28920376],
        [-0.25899786],
        [-0.3879861 ],
        [-0.10567426],
        [-0.42959467],
        [ 0.11309835],
        [ 0.3405903 ],
        [-0.26325902],
        [ 0.03624119],
        [ 0.42362356],
        [ 0.04657405],
        [ 0.0179613 ],
        [ 0.33479318],
        [ 0.17667927],
        [ 0.14149487],
        [ 0.10225577],
        [ 0.0

In [5]:
# Store the (modified) model section back into the full parameter dict, then
# display it. NOTE(review): if params_model is still the object taken from
# params['model'] when the file was loaded, this assignment is a no-op -- confirm.
params['model'] = params_model
display(params_model)

{'seq_length': 1048576,
 'target_length': 512,
 'target_crop': 32,
 'diagonal_offset': 2,
 'augment_rc': True,
 'augment_shift': 11,
 'activation': 'relu',
 'batch_norm': True,
 'bn_momentum': 0.9265,
 'trunk': [{'name': 'conv_block',
   'filters': 96,
   'kernel_size': 11,
   'pool_size': 2},
  {'name': 'conv_tower',
   'filters_init': 96,
   'filters_mult': 1.0,
   'kernel_size': 5,
   'pool_size': 2,
   'repeat': 10},
  {'name': 'dilated_residual',
   'filters': 48,
   'rate_mult': 1.75,
   'repeat': 8,
   'dropout': 0.4},
  {'name': 'conv_block', 'filters': 64, 'kernel_size': 5}],
 'head_hic': [{'name': 'one_to_two', 'operation': 'mean'},
  {'name': 'concat_dist_2d'},
  {'name': 'conv_block_2d', 'filters': 48, 'kernel_size': 3},
  {'name': 'symmetrize_2d'},
  {'name': 'dilated_residual_2d',
   'filters': 24,
   'kernel_size': 3,
   'rate_mult': 1.75,
   'repeat': 6,
   'dropout': 0.1},
  {'name': 'cropping_2d', 'cropping': 32},
  {'name': 'upper_tri', 'diagonal_offset': 2},
  {'nam

In [6]:
# Write the parameter file: serialize the full `params` dict (including the
# modified 'model' section) as JSON to params_transfer.json.
with open('/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/params_transfer.json','w') as params_transfer_file:
    json.dump(params,params_transfer_file) 

In [13]:
# Trainable weights of the layer at index 4 of the first model (the captured
# output shows a 'conv1d/kernel:0' variable of shape (11, 4, 96)).
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[-0.07367327,  0.29673436, -0.38978988, ...,  0.18835951,
          -0.22402896,  0.26617014],
         [ 0.08137295,  0.06885674,  0.03808732, ..., -0.10597277,
           0.45735794,  0.12072483],
         [ 0.02585143,  0.25985783,  0.09975421, ...,  0.08874109,
          -0.16820559, -0.07723998],
         [-0.08351176,  0.00336566,  0.11607262, ...,  0.41795528,
          -0.19520324,  0.29690835]],
 
        [[-0.2583351 , -0.19547321, -0.13105345, ...,  0.2129713 ,
          -0.23236448,  0.15093371],
         [ 0.06668539, -0.40592852,  0.18654715, ...,  0.03882132,
           0.10409798,  0.04169784],
         [ 0.12359282,  0.3346928 ,  0.08906922, ..., -0.15116563,
          -0.17119521,  0.14061804],
         [-0.01075523,  0.21174592,  0.3551315 , ...,  0.02801019,
           0.3427186 , -0.08903657]],
 
        [[-0.15981458,  0.19387636, -0.10830089, ..., -0.06271059,
          -0.44251245, 

In [15]:
# Restore weights into the model from model_file (the 'model_best.h5' file
# under model_dir).
seqnn_model.restore(model_file)

In [16]:
# Trainable weights of the second-to-last layer (named 'final' in the
# captured output), inspected again so the values can be compared with the
# ones shown earlier in the notebook. A negative index replaces the manual
# len(...)-2 arithmetic.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[ 1.4580915 ],
        [ 0.23465092],
        [ 0.05835523],
        [-0.6239604 ],
        [ 0.34154844],
        [-0.16338602],
        [ 1.2967775 ],
        [-0.42430592],
        [ 1.2869276 ],
        [ 0.03741875],
        [ 0.76618195],
        [ 0.75870216],
        [ 0.19104275],
        [-0.7300541 ],
        [-0.6849625 ],
        [-0.01603766],
        [ 0.55600345],
        [-3.1726544 ],
        [-0.03922657],
        [-1.273954  ],
        [-0.11047602],
        [ 0.16599712],
        [ 0.08342776],
        [ 1.9507604 ],
        [ 0.16429499],
        [-0.30267197],
        [-1.0408251 ],
        [ 0.0262637 ],
        [ 0.36577496],
        [ 0.19783096],
        [-1.0037417 ],
        [-0.06746949],
        [-0.02523711],
        [ 0.0058255 ],
        [-1.6376753 ],
        [ 0.46741402],
        [-0.60394293],
        [-1.542865  ],
        [-0.31098443],
        [-0.9531344 ],
        [ 0.0

In [17]:
# Trainable weights of the layer at index 4 (the 'conv1d/kernel:0' variable in
# the captured output), shown again for comparison with the earlier values.
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[ 0.09355764,  0.08427623,  0.12153016, ..., -0.15557124,
          -0.24185126, -0.08866961],
         [-0.39165616,  0.22918619,  0.17072426, ..., -0.23238999,
           0.1473212 ,  0.1674815 ],
         [ 0.23334314,  0.13728172,  0.15159912, ..., -0.14496087,
           0.22836079, -0.45533156],
         [ 0.04052   ,  0.47208974,  0.40900332, ...,  0.18603727,
           0.02523968,  0.31608298]],
 
        [[ 0.13254096,  0.3413415 ,  0.03291266, ..., -0.21221818,
           0.03376405,  0.26585445],
         [-0.08990127, -0.04370791,  0.20237586, ..., -0.1671591 ,
           0.27642018,  0.12923117],
         [ 0.34807304,  0.05613009, -0.01062612, ...,  0.32746392,
          -0.12286341, -0.11060675],
         [-0.20968954,  0.04918202,  0.04545894, ..., -0.07714995,
           0.40476233, -0.1825966 ]],
 
        [[-0.24926293,  0.38959837,  0.08361185, ...,  0.32659858,
          -0.27865896, 

In [14]:
# Model originally trained on 5 cell lines.
# The trailing slash is required because the weights file name is appended
# directly to the directory string.
model_orig_dir = "/home/bureau/projects/def-bureau/bureau/basenji/manuscripts/akita/"
model_orig_file = f"{model_orig_dir}model_best.h5"

In [15]:
# Restore weights from the original model file.
# NOTE(review): `by_name=True` presumably asks for weights to be matched by
# layer name; the `restore` signature is not visible here -- confirm that the
# installed basenji version accepts this keyword.
seqnn_model.restore(model_orig_file, by_name=True)

In [16]:
# Normal loading of the parameters: show the trainable weights of the layer at
# index 4 (the 'conv1d/kernel:0' variable in the captured output) after the
# restore, for comparison with the values shown earlier.
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[ 0.22121659,  0.49886006,  0.15402932, ...,  0.24297452,
          -0.04430944, -0.09108627],
         [-0.19714203,  0.2941896 , -0.64780694, ...,  0.35473865,
           0.56217223,  0.27214468],
         [-0.02342184, -0.37757406, -0.14093184, ..., -0.3096035 ,
           0.17684124, -0.7398407 ],
         [ 0.3871217 ,  0.23305272,  0.10894196, ...,  0.3413404 ,
          -0.37711176,  0.20859483]],
 
        [[-0.32202166,  0.6204084 ,  0.16070314, ..., -0.11446574,
           0.29151767,  0.7213151 ],
         [-0.11484676,  0.55838656, -0.03864141, ...,  0.30672222,
           0.12473122, -0.15597497],
         [ 0.39774656, -1.0061594 ,  0.08167613, ..., -0.23125121,
          -0.23747027, -0.14844704],
         [-0.52533245, -0.590564  , -0.421166  , ...,  0.22580624,
          -0.01874998, -0.06464076]],
 
        [[ 0.13614717,  0.06663982, -0.06246207, ...,  0.09325704,
           0.5666888 , 