# Examen d'un modèle Basenji

In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [3]:
from basenji import dataset
from basenji import seqnn
from basenji import trainer

In [4]:
# Parameter file used to build the network (Akita manuscript parameters).
params_file = "/home/bureau/projects/def-bureau/basenji/manuscripts/akita/params.json"
# Alternative (disabled): parameters stored next to the iPSC 1m data set.
#params_file = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/params1.json"
# Output directory of the training run on the iPSC 1m data set.
model_dir = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/train_out/"
# Join with os.path.join so the result stays valid whether or not model_dir
# ends with a path separator.
model_file = os.path.join(model_dir, 'model_best.h5')

In [5]:
# Path of the JSON file holding the data set statistics (iPSC 1m data set).
# The commented-out line below is the disabled alternative for the
# results_Neu10000 data set.
#data_stats_file = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/statistics.json"
data_stats_file = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/statistics.json"

In [6]:
# Glob patterns for the training and validation TFRecord shards (iPSC 1m
# data set). The two commented-out lines are the disabled alternatives for
# the results_Neu10000 data set.
#tfr_train_full = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/train-*.tfr"
#tfr_eval_full = "/home/bureau/projects/def-bureau/bureau/distiller/results_Neu10000/data/1m/tfrecords/valid-*.tfr"
tfr_train_full = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/train-*.tfr"
tfr_eval_full = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/valid-*.tfr"

## Chargement des paramètres du modèle

In [7]:
# Read the JSON parameter file and keep handles on its two sections.
with open(params_file) as params_open:
    params = json.loads(params_open.read())
params_model = params['model']
params_train = params['train']
# Adapt the last entry of the Hi-C head (named 'final' in the recorded
# output): one output unit, He-normal kernel initialisation.
params_model['head_hic'][-1].update(units=1, kernel_initializer='he_normal')

In [8]:
# Display the Hi-C head layer specifications held in the model parameters.
params_model['head_hic']

[{'name': 'one_to_two', 'operation': 'mean'},
 {'name': 'concat_dist_2d'},
 {'name': 'conv_block_2d', 'filters': 48, 'kernel_size': 3},
 {'name': 'symmetrize_2d'},
 {'name': 'dilated_residual_2d',
  'filters': 24,
  'kernel_size': 3,
  'rate_mult': 1.75,
  'repeat': 6,
  'dropout': 0.1},
 {'name': 'cropping_2d', 'cropping': 32},
 {'name': 'upper_tri', 'diagonal_offset': 2},
 {'name': 'final',
  'units': 1,
  'activation': 'linear',
  'kernel_initializer': 'he_normal'}]

## Chargement des stats des données

In [9]:
# Parse the data set statistics JSON and show it as the cell output.
with open(data_stats_file) as data_stats_open:
    data_stats = json.loads(data_stats_open.read())
data_stats

{'num_targets': 1,
 'train_seqs': 7617,
 'valid_seqs': 6676,
 'test_seqs': 6667,
 'seq_length': 1048576,
 'pool_width': 2048,
 'crop_bp': 65536,
 'diagonal_offset': 2,
 'target_length': 99681}

## Chargement des données

In [9]:
# Build the training and validation datasets from their TFRecord globs.
# Both use the batch size from the training parameters and the sequence and
# target lengths from the data statistics; only the file pattern and the
# mode key differ. The training dataset is constructed first.
train_data, eval_data = (
    dataset.SeqDataset(tfr_pattern,
        params_train['batch_size'],
        data_stats['seq_length'],
        data_stats['target_length'],
        mode_key)
    for tfr_pattern, mode_key in (
        (tfr_train_full, tf.estimator.ModeKeys.TRAIN),
        (tfr_eval_full, tf.estimator.ModeKeys.EVAL),
    )
)


/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/train-*.tfr has 7617 sequences with 1/1 targets
/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/tfrecords/valid-*.tfr has 6676 sequences with 1/1 targets


## Initialisation du modèle

In [8]:
# Instantiate the Basenji SeqNN from the model parameters; the recorded
# output below is the Keras model summary printed during construction.
seqnn_model = seqnn.SeqNN(params_model)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence (InputLayer)           [(None, 1048576, 4)] 0                                            
__________________________________________________________________________________________________
stochastic_reverse_complement ( ((None, 1048576, 4), 0           sequence[0][0]                   
__________________________________________________________________________________________________
stochastic_shift (StochasticShi (None, 1048576, 4)   0           stochastic_reverse_complement[0][
__________________________________________________________________________________________________
re_lu (ReLU)                    (None, 1048576, 4)   0           stochastic_shift[0][0]           
____________________________________________________________________________________________

In [11]:
# Dump the attributes of the second-to-last layer of the first model
# (the layer named 'final' in the recorded output). Negative indexing
# replaces the manual len(...) - 2 computation.
seqnn_model.models[0].layers[-2].__dict__

{'_self_setattr_tracking': True,
 '_instrumented_keras_api': True,
 '_instrumented_keras_layer_class': True,
 '_instrumented_keras_model_class': False,
 '_trainable': True,
 '_stateful': False,
 '_build_input_shape': TensorShape([None, 99681, 48]),
 '_saved_model_inputs_spec': None,
 '_input_spec': InputSpec(min_ndim=2, axes={-1: 48}),
 '_name': 'final',
 '_activity_regularizer': None,
 '_trainable_weights': [<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
  array([[-0.1495074 ],
         [-0.43212262],
         [ 0.07010241],
         [-0.20273937],
         [-0.00556026],
         [-0.25335294],
         [ 0.21087407],
         [ 0.05149443],
         [-0.157616  ],
         [-0.25510743],
         [-0.00843919],
         [-0.19865322],
         [ 0.13872993],
         [-0.0228241 ],
         [ 0.16513409],
         [-0.09377011],
         [-0.00490653],
         [-0.2851904 ],
         [-0.01204251],
         [-0.18476665],
         [ 0.15012386],
         [-0.2863

In [12]:
# Trainable weights of the second-to-last layer ('final/kernel:0' in the
# recorded output). Negative indexing replaces the manual len(...) - 2.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[-0.1495074 ],
        [-0.43212262],
        [ 0.07010241],
        [-0.20273937],
        [-0.00556026],
        [-0.25335294],
        [ 0.21087407],
        [ 0.05149443],
        [-0.157616  ],
        [-0.25510743],
        [-0.00843919],
        [-0.19865322],
        [ 0.13872993],
        [-0.0228241 ],
        [ 0.16513409],
        [-0.09377011],
        [-0.00490653],
        [-0.2851904 ],
        [-0.01204251],
        [-0.18476665],
        [ 0.15012386],
        [-0.28630257],
        [ 0.01459581],
        [ 0.00825187],
        [-0.28920376],
        [-0.25899786],
        [-0.3879861 ],
        [-0.10567426],
        [-0.42959467],
        [ 0.11309835],
        [ 0.3405903 ],
        [-0.26325902],
        [ 0.03624119],
        [ 0.42362356],
        [ 0.04657405],
        [ 0.0179613 ],
        [ 0.33479318],
        [ 0.17667927],
        [ 0.14149487],
        [ 0.10225577],
        [ 0.0

In [38]:
# List every trainable variable of the first model paired with its position.
list(enumerate(seqnn_model.models[0].trainable_variables))

[(0,
  <tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
  array([[[ 0.17527972,  0.18966903, -0.0098265 , ..., -0.19941744,
            0.25646266,  0.09396492],
          [-0.04783365,  0.14030947, -0.0372556 , ..., -0.06824101,
           -0.12866512,  0.21571434],
          [-0.32806262, -0.05470107, -0.09630734, ...,  0.02512477,
            0.4698007 , -0.3875631 ],
          [-0.20699707, -0.03127084,  0.39806792, ...,  0.01880061,
            0.18585294,  0.13550626]],
  
         [[-0.1259084 ,  0.18078114, -0.1598435 , ...,  0.09549499,
           -0.01887866, -0.0251871 ],
          [-0.04740414,  0.14091554,  0.01152264, ..., -0.03630615,
           -0.20302396, -0.10000688],
          [ 0.02668644, -0.04143291, -0.4841453 , ..., -0.25132322,
            0.10892735, -0.17127022],
          [ 0.06604652,  0.32581103,  0.34306332, ...,  0.1659231 ,
            0.3337271 ,  0.20769496]],
  
         [[ 0.04411942,  0.09787275, -0.03151598, ...,  0.18563174

In [40]:
# Inspect one trainable variable by position; in the recorded output,
# index 84 is 'conv2d/kernel:0' with shape (3, 3, 65, 48).
seqnn_model.models[0].trainable_variables[84]

<tf.Variable 'conv2d/kernel:0' shape=(3, 3, 65, 48) dtype=float32, numpy=
array([[[[ 2.82234922e-02,  3.51982773e-04,  5.11365477e-03, ...,
          -8.84396769e-03,  8.15481246e-02,  8.29085559e-02],
         [-4.20807712e-02, -4.04452309e-02,  1.62901059e-02, ...,
           1.76863354e-02,  2.32108142e-02,  1.17635109e-01],
         [-1.32020786e-01,  1.43664004e-02, -1.25476066e-02, ...,
          -1.06684357e-01,  6.26465492e-03,  3.78464386e-02],
         ...,
         [ 9.40428078e-02,  4.91002277e-02,  4.26616035e-02, ...,
           6.44592717e-02,  8.71419683e-02, -3.42139639e-02],
         [-1.23799898e-01, -8.01017508e-02,  2.65967827e-02, ...,
          -9.98719260e-02,  2.34445389e-02,  5.88299520e-02],
         [ 1.02370009e-01,  6.21097088e-02,  3.45776863e-02, ...,
          -1.76522955e-02, -3.99286747e-02,  1.78466085e-02]],

        [[ 1.00840747e-01, -9.02380515e-03, -6.65497482e-02, ...,
          -2.95654424e-02,  3.42078023e-02,  9.97294486e-02],
         [ 3.8

In [32]:
# Count the trainable variables of the first model (125 in the recorded output).
len(seqnn_model.models[0].trainable_variables)

125

### Modification des paramètres d'entrainement

In [14]:
# Override the training batch size, then show the resulting training parameters.
# NOTE(review): the SeqDataset cell earlier in this notebook also reads
# params_train['batch_size']; if it ran before this override, those datasets
# keep the previous batch size - confirm the intended execution order.
params_train.update(batch_size=20)
params_train

{'batch_size': 20,
 'optimizer': 'sgd',
 'learning_rate': 0.0065,
 'momentum': 0.99575,
 'loss': 'mse',
 'patience': 12,
 'clip_norm': 10.0}

In [15]:
# Store the (edited) model and training sections back under their keys in
# the full parameter dictionary, then render the complete dictionary.
params.update(model=params_model, train=params_train)
display(params)

{'train': {'batch_size': 20,
  'optimizer': 'sgd',
  'learning_rate': 0.0065,
  'momentum': 0.99575,
  'loss': 'mse',
  'patience': 12,
  'clip_norm': 10.0},
 'model': {'seq_length': 1048576,
  'target_length': 512,
  'target_crop': 32,
  'diagonal_offset': 2,
  'augment_rc': True,
  'augment_shift': 11,
  'activation': 'relu',
  'batch_norm': True,
  'bn_momentum': 0.9265,
  'trunk': [{'name': 'conv_block',
    'filters': 96,
    'kernel_size': 11,
    'pool_size': 2},
   {'name': 'conv_tower',
    'filters_init': 96,
    'filters_mult': 1.0,
    'kernel_size': 5,
    'pool_size': 2,
    'repeat': 10},
   {'name': 'dilated_residual',
    'filters': 48,
    'rate_mult': 1.75,
    'repeat': 8,
    'dropout': 0.4},
   {'name': 'conv_block', 'filters': 64, 'kernel_size': 5}],
  'head_hic': [{'name': 'one_to_two', 'operation': 'mean'},
   {'name': 'concat_dist_2d'},
   {'name': 'conv_block_2d', 'filters': 48, 'kernel_size': 3},
   {'name': 'symmetrize_2d'},
   {'name': 'dilated_residual_2d

In [16]:
# Write the parameter file used for transfer learning.
with open('/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/params_transfer.json','w') as params_transfer_file:
    params_transfer_file.write(json.dumps(params))

In [13]:
# Trainable weights of layer 4 of the first model; the recorded output
# shows 'conv1d/kernel:0' with shape (11, 4, 96).
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[-0.07367327,  0.29673436, -0.38978988, ...,  0.18835951,
          -0.22402896,  0.26617014],
         [ 0.08137295,  0.06885674,  0.03808732, ..., -0.10597277,
           0.45735794,  0.12072483],
         [ 0.02585143,  0.25985783,  0.09975421, ...,  0.08874109,
          -0.16820559, -0.07723998],
         [-0.08351176,  0.00336566,  0.11607262, ...,  0.41795528,
          -0.19520324,  0.29690835]],
 
        [[-0.2583351 , -0.19547321, -0.13105345, ...,  0.2129713 ,
          -0.23236448,  0.15093371],
         [ 0.06668539, -0.40592852,  0.18654715, ...,  0.03882132,
           0.10409798,  0.04169784],
         [ 0.12359282,  0.3346928 ,  0.08906922, ..., -0.15116563,
          -0.17119521,  0.14061804],
         [-0.01075523,  0.21174592,  0.3551315 , ...,  0.02801019,
           0.3427186 , -0.08903657]],
 
        [[-0.15981458,  0.19387636, -0.10830089, ..., -0.06271059,
          -0.44251245, 

## Examen du modèle estimé par transfert

In [10]:
# Restore the model from model_file (the 'model_best.h5' file in the
# training output directory).
seqnn_model.restore(model_file)

In [11]:
# The parameters have changed compared with their initial values.
# Negative indexing selects the second-to-last layer directly instead of
# computing len(...) - 2 by hand.
seqnn_model.models[0].layers[-2].trainable_weights

[<tf.Variable 'final/kernel:0' shape=(48, 1) dtype=float32, numpy=
 array([[ 0.07303954],
        [-0.02175031],
        [ 0.0521398 ],
        [ 0.16584271],
        [ 0.07815055],
        [-0.04387111],
        [-0.76704097],
        [-0.07411462],
        [-0.12736693],
        [ 0.0351777 ],
        [ 0.04528734],
        [-0.18306538],
        [ 0.07494818],
        [ 0.02592948],
        [-0.26398137],
        [ 0.1230846 ],
        [ 0.21265659],
        [ 0.11021022],
        [ 0.03557133],
        [ 0.02509378],
        [-0.24670736],
        [-0.10753597],
        [-0.04211397],
        [-0.06754594],
        [-0.09070039],
        [ 0.02338321],
        [ 0.05276643],
        [ 0.01170031],
        [-0.04386963],
        [-0.33030644],
        [ 0.25801834],
        [-0.04451451],
        [-0.14600106],
        [-0.2739374 ],
        [ 0.03348623],
        [-0.06300657],
        [ 0.05365289],
        [ 0.02701247],
        [ 0.05919943],
        [ 0.02551196],
        [-0.0

In [12]:
# Trainable weights of layer 4 ('conv1d/kernel:0' in the recorded output)
# after restoring the model from model_file.
seqnn_model.models[0].layers[4].trainable_weights

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[ 0.22121659,  0.49886006,  0.15402932, ...,  0.24297452,
          -0.04430944, -0.09108627],
         [-0.19714203,  0.2941896 , -0.64780694, ...,  0.35473865,
           0.56217223,  0.27214468],
         [-0.02342184, -0.37757406, -0.14093184, ..., -0.3096035 ,
           0.17684124, -0.7398407 ],
         [ 0.3871217 ,  0.23305272,  0.10894196, ...,  0.3413404 ,
          -0.37711176,  0.20859483]],
 
        [[-0.32202166,  0.6204084 ,  0.16070314, ..., -0.11446574,
           0.29151767,  0.7213151 ],
         [-0.11484676,  0.55838656, -0.03864141, ...,  0.30672222,
           0.12473122, -0.15597497],
         [ 0.39774656, -1.0061594 ,  0.08167613, ..., -0.23125121,
          -0.23747027, -0.14844704],
         [-0.52533245, -0.590564  , -0.421166  , ...,  0.22580624,
          -0.01874998, -0.06464076]],
 
        [[ 0.13614717,  0.06663982, -0.06246207, ...,  0.09325704,
           0.5666888 , 

In [4]:
# Model originally trained on 5 cell lines.
# NOTE(review): this directory is under .../def-bureau/bureau/basenji/,
# whereas params_file points under .../def-bureau/basenji/ - confirm that
# the different location is intended.
model_orig_dir = "/home/bureau/projects/def-bureau/bureau/basenji/manuscripts/akita/"
# Join with os.path.join so the result stays valid whether or not
# model_orig_dir ends with a path separator.
model_orig_file = os.path.join(model_orig_dir, 'model_best.h5')

In [9]:
# Restore from the original model file, passing by_name=True.
# NOTE(review): presumably this matches weights to layers by layer name -
# confirm against basenji's SeqNN.restore.
seqnn_model.restore(model_orig_file, by_name=True)

In [10]:
# Parameters of layer 4 after loading the original model
seqnn_model.models[0].layers[4].trainable_weights
# We observe that they have not changed, as expected

[<tf.Variable 'conv1d/kernel:0' shape=(11, 4, 96) dtype=float32, numpy=
 array([[[ 0.22121659,  0.49886006,  0.15402932, ...,  0.24297452,
          -0.04430944, -0.09108627],
         [-0.19714203,  0.2941896 , -0.64780694, ...,  0.35473865,
           0.56217223,  0.27214468],
         [-0.02342184, -0.37757406, -0.14093184, ..., -0.3096035 ,
           0.17684124, -0.7398407 ],
         [ 0.3871217 ,  0.23305272,  0.10894196, ...,  0.3413404 ,
          -0.37711176,  0.20859483]],
 
        [[-0.32202166,  0.6204084 ,  0.16070314, ..., -0.11446574,
           0.29151767,  0.7213151 ],
         [-0.11484676,  0.55838656, -0.03864141, ...,  0.30672222,
           0.12473122, -0.15597497],
         [ 0.39774656, -1.0061594 ,  0.08167613, ..., -0.23125121,
          -0.23747027, -0.14844704],
         [-0.52533245, -0.590564  , -0.421166  , ...,  0.22580624,
          -0.01874998, -0.06464076]],
 
        [[ 0.13614717,  0.06663982, -0.06246207, ...,  0.09325704,
           0.5666888 , 

In [12]:
# Read the tab-separated target table stored with the Akita manuscript data
# and display it.
hic_targets = pd.read_csv(
    '/home/bureau/projects/def-bureau/basenji/manuscripts/akita/data/targets.txt',
    sep='\t',
)
hic_targets

Unnamed: 0,index,identifier,file,clip,description
0,0,HFF,../../data/coolers/Krietenstein2019/HFF_hg38_4...,2,HFF
1,1,H1hESC,../../data/coolers/Krietenstein2019/H1hESC_hg3...,2,H1hESC
2,2,GM12878,../../data/coolers/Rao2014/GM12878_inSitu_MboI...,2,GM12878
3,3,IMR90,../../data/coolers/Rao2014/IMR90_inSitu_MboI_a...,2,IMR90
4,4,HCT116,../../data/coolers/Rao2017/Unsynchronized_all....,2,HCT116


In [29]:
# Trainable weights of the second-to-last layer of the first model, kept for
# the coefficient table built in the next cell. Negative indexing replaces
# the manual len(...) - 2 computation.
tmp = seqnn_model.models[0].layers[-2].trainable_weights

In [30]:
# Regression coefficients of the final layer for the 5 cell lines: the first
# weight tensor becomes a table whose columns are labelled with the target
# identifiers.
beta_final = pd.DataFrame(
    data=tmp[0].numpy(),
    columns=hic_targets['identifier'],
)
beta_final

identifier,HFF,H1hESC,GM12878,IMR90,HCT116
0,-0.029231,-0.152174,-0.1154,-0.172805,-0.095331
1,-0.113028,0.107823,-0.06205,0.14515,-0.059022
2,-0.236473,0.171282,0.04121,-0.226253,0.053832
3,0.026269,0.260259,0.407433,0.069439,-0.132155
4,0.088598,0.307082,-0.040133,-0.321567,0.360107
5,-0.11302,0.038943,-0.13053,0.148494,0.074912
6,0.302577,0.077084,-0.27635,-0.116355,-0.164745
7,0.407187,0.248542,0.41091,-0.019401,-0.120526
8,0.097274,-0.074655,-0.135475,0.055874,-0.220505
9,0.141011,-0.076823,0.081839,0.065283,0.230731
