# Estimation de la couche finale d'Akita sur les données de neurones

In [1]:
import json
import numpy as np
import pandas as pd
import sklearn
import statsmodels.api as sm

In [2]:
import h5py
import cooler

In [3]:
import matplotlib.pyplot as plt

In [4]:
# Directory containing the Akita prediction output, and the HDF5 file inside it.
# The directory string already ends with "/", so the file name is appended directly.
predpath = "/home/bureau/projects/def-bureau/bureau/ran-donnees/PredictNeuronHi-C/akita_pred_sans_final/"
predfile = f"{predpath}preds.h5"

In [5]:
# Open the predictions file read-only. The handle is kept open because
# later cells index into pred['preds'].
pred = h5py.File(predfile, mode="r")
pred

<HDF5 file "preds.h5" (mode r)>

In [6]:
# Dimensions of the 'preds' dataset stored in the predictions file
pred['preds'].shape

(7617, 99681, 48)

In [7]:
# Path of the HDF5 file from which the 'targets' dataset is read
targetfile = "/home/bureau/projects/def-bureau/bureau/distiller/iPSC/data/1m/seqs_cov/0.h5"

In [8]:
# Open the targets file read-only; the handle is kept open for the slice
# taken from targets['targets'] in a later cell.
targets = h5py.File(targetfile, mode="r")
targets

<HDF5 file "0.h5" (mode r)>

In [9]:
# The training examples are the first rows of the targets dataset, one row
# per prediction batch. The count is taken from the predictions file rather
# than hard-coded (it was the literal 7617) so the two arrays stay aligned
# with the loop that later pairs train_targets[i] with pred['preds'][i].
n_train = pred['preds'].shape[0]
train_targets = targets['targets'][:n_train]
# Slicing past the end of a dataset truncates silently; fail loudly instead.
assert train_targets.shape[0] == n_train, "targets file has fewer rows than predictions"
train_targets.shape

(7617, 99681)

In [14]:
# Predictor matrix of the first batch (index 0), read in full and stored as float32.
preds_ds = pred['preds']
X = np.array(preds_ds[0, :, :], dtype=np.float32)
X.shape

(99681, 48)

In [18]:
# Add the intercept column to the design matrix. prepend=True is the default
# and is spelled out because the intercept is expected as the first coefficient.
X = sm.add_constant(X, prepend=True)
X.shape

(99681, 49)

In [19]:
# Ordinary least squares fit of the first batch's targets on its design matrix.
mod0 = sm.OLS(train_targets[0, :], X)
res0 = mod0.fit()
# NOTE(review): the results object is stored over the model's own `fit`
# method because a later cell reads `mod0.fit.summary()`. `res0` is the same
# object under a non-shadowing name.
mod0.fit = res0

In [20]:
# Regression summary for the first batch. At this point `mod0.fit` holds the
# fitted results object (it was rebound in the previous cell), not the method.
mod0.fit.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.384
Model:,OLS,Adj. R-squared:,0.384
Method:,Least Squares,F-statistic:,1294.0
Date:,"Mon, 22 Mar 2021",Prob (F-statistic):,0.0
Time:,17:42:41,Log-Likelihood:,-45880.0
No. Observations:,99681,AIC:,91860.0
Df Residuals:,99632,BIC:,92320.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6099,0.019,-32.636,0.000,-0.647,-0.573
x1,0.2036,0.051,4.001,0.000,0.104,0.303
x2,0.2304,0.026,8.868,0.000,0.179,0.281
x3,0.3926,0.027,14.412,0.000,0.339,0.446
x4,-0.6268,0.037,-16.721,0.000,-0.700,-0.553
x5,0.5518,0.043,12.979,0.000,0.468,0.635
x6,-0.0086,0.019,-0.445,0.656,-0.046,0.029
x7,0.4214,0.044,9.586,0.000,0.335,0.508
x8,-0.2067,0.016,-13.012,0.000,-0.238,-0.176

0,1,2,3
Omnibus:,957.317,Durbin-Watson:,0.25
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1166.336
Skew:,-0.172,Prob(JB):,5.41e-254
Kurtosis:,3.403,Cond. No.,109.0


In [26]:
# Fit one OLS regression per batch and keep, for each batch, the coefficient
# vector and the cross-product matrix X'X of its design matrix. These are the
# ingredients combined afterwards into a single pooled estimate.
preds_ds = pred['preds']  # look the dataset up once instead of on every access
n_batches, _, n_features = preds_ds.shape
# Number of coefficients: one per predictor column plus the intercept
npar = n_features + 1
beta_mat = np.zeros((n_batches, npar))       # row i: OLS coefficients of batch i
xx_mat = np.zeros((n_batches, npar, npar))   # slice i: X'X of batch i
xx_sum = np.zeros((npar, npar))              # running sum of the X'X matrices
# Loop over the batches
for i in range(n_batches):
    # Design matrix of batch i. has_constant='add' forces the intercept column
    # even if a predictor happens to be constant within the batch; the default
    # ('skip') would return only n_features columns and break the assignment
    # into beta_mat below.
    # A loop-local name is used so the global X built in an earlier cell is
    # not overwritten.
    X_i = sm.add_constant(np.array(preds_ds[i], dtype=np.float32),
                          has_constant='add')
    mod = sm.OLS(train_targets[i, :], X_i)
    # Keep the results in their own variable instead of rebinding mod.fit,
    # which would shadow the model's fit method.
    res = mod.fit()
    beta_mat[i] = res.params
    # Form X'X directly from the design matrix the model used, rather than
    # inverting res.normalized_cov_params: this avoids a needless inversion
    # and still works when a batch's design matrix is rank-deficient.
    exog = mod.exog
    xx_mat[i] = exog.T @ exog
    xx_sum += xx_mat[i]

In [27]:
# Inversion de la somme des matrices de variance-covariance
cov = np.linalg.inv(xx_sum)
cov.shape

(49, 49)

## Calcul de l'estimation des moindres carrés des coefficients par la méthode de Duncan (1980)

In [36]:
# Sum the products xx_mat[i] @ beta_mat[i] over all batches, then
# premultiply the total by cov to obtain the pooled coefficient vector.
beta_vec = np.zeros(npar)
for xx_i, beta_i in zip(xx_mat, beta_mat):
    beta_vec += xx_i @ beta_i
beta_final = cov @ beta_vec
beta_final

array([-0.14365427,  0.03716058,  0.00913258,  0.00646756, -0.06668746,
       -0.01382421,  0.02838141, -0.0331073 ,  0.00930681, -0.02549175,
        0.00200029, -0.0287394 ,  0.0241323 , -0.00530443, -0.03601322,
       -0.07475538,  0.02253388,  0.05151738,  0.08732799,  0.0392308 ,
       -0.05292522, -0.04670851, -0.00482304,  0.0405061 , -0.05858171,
       -0.00043112,  0.01271191, -0.06108393,  0.03988695, -0.01182013,
       -0.12794256,  0.04344437,  0.05920321, -0.1368567 ,  0.05342997,
        0.03366394,  0.03072383, -0.00156791, -0.0258284 ,  0.0145607 ,
       -0.07120221, -0.04266817, -0.02940599,  0.01157515,  0.01895863,
        0.02440564,  0.07749435, -0.0216817 , -0.04454472])

## Sauvegarde des estimations et de leur "covariance" dans des fichiers

In [38]:
# Wrap the pooled coefficient vector in a DataFrame and write it to CSV
# in the current working directory.
beta_dat = pd.DataFrame(beta_final)
beta_dat.to_csv("beta_final.csv")

In [40]:
# Wrap the cov matrix in a DataFrame and write it to CSV
# in the current working directory.
cov_dat = pd.DataFrame(cov)
cov_dat.to_csv("cov_final.csv")

In [28]:
# Show only the top-left 5x5 corner of cov rather than the full matrix
cov[:5,:5]

array([[ 7.81623490e-08, -4.12422926e-08,  1.41749005e-08,
        -3.32535393e-09, -3.71748822e-08],
       [-4.12422926e-08,  9.49262669e-07, -4.95162185e-08,
        -1.86750756e-07,  3.89316001e-08],
       [ 1.41749005e-08, -4.95162185e-08,  2.92936146e-07,
         2.32040006e-08, -1.63872665e-08],
       [-3.32535393e-09, -1.86750756e-07,  2.32040006e-08,
         3.16390862e-07, -2.57898510e-08],
       [-3.71748822e-08,  3.89316001e-08, -1.63872665e-08,
        -2.57898510e-08,  3.44137587e-07]])

In [30]:
# Per-batch coefficient vector stored at index 1000 of beta_mat
beta_mat[1000,]

array([-0.12876581,  0.59567164, -0.10425682, -0.58185182,  0.79916392,
        1.07649432, -0.00874789,  0.50833046,  0.68730851,  0.80410378,
       -0.31972619, -0.09315363, -0.13082563,  0.0176904 , -0.94442803,
        0.38489407, -1.42583074, -1.75421009, -0.20989175,  0.25105951,
       -0.41360156, -1.88181577,  0.45778917, -0.85761263,  0.0346512 ,
        0.83140108,  0.22325217,  1.67136999,  0.81944921, -0.45695786,
       -1.88022625,  0.91111394,  0.7087806 , -0.70370532,  1.25285385,
       -0.43602057, -0.03923691,  0.07304694,  0.521765  ,  0.39678756,
       -0.08339328,  0.49544063,  0.28782657, -0.55793223,  0.40057599,
       -1.73714716, -0.60790451, -0.69965674,  0.47394429])