In [None]:
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import NegativeBinomial

# Names of the chromosomes processed by the analysis loop: 'chr1' .. 'chr22'.
CHROMOSOMES = ['chr{}'.format(number) for number in range(1, 23)]
# Divisor that turns genomic sizes/positions into matrix dimensions/indices.
RESOLUTION = 10000

# Input files.
chrom_sizes_path = 'sizes_by_chrom.json'
gin_path = 'GSE63525_K562_10000.tsv'
# Per-chromosome matrix files; '{}' is filled with the chromosome name.
nbyn_path = 'GSE63525_K562_10000_nbyn/GSE63525_K562_10000_nbyn_{}.txt'
# Same template as nbyn_path: the matrices written with nbyn_path are the
# ones read back through hic_matrix_path.
hic_matrix_path = nbyn_path
snp_matrix_path = 'LAML_survival_nbyn/LAML.{}.snp_nxn.txt'
d_path = '63525_d_matrices/d_{}.txt'
# Output file for the regression summary.
result_path = 'results_d/K562_FULL_0_1.d_survival.txt'

# Accumulators for the values pooled across chromosomes (three distinct lists).
d_new, hic_new, snp_new = [], [], []

In [None]:
def normalizeData(data):
    """Min-max scale ``data`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    data : array-like
        Numeric values. Lists/tuples are converted to a float NumPy array;
        NumPy arrays and pandas Series are used as given.

    Returns
    -------
    numpy.ndarray or pandas.Series
        ``(data - min) / (max - min)``, computed over all elements.
        If every element is equal (``max == min``) the result is all zeros
        instead of the NaNs a 0/0 division would produce. Empty input yields
        an empty float result instead of raising inside ``np.min``.
    """
    if isinstance(data, (np.ndarray, pd.Series)):
        values = data
    else:
        values = np.asarray(data, dtype=float)

    # np.min / np.max raise on zero-size input, so answer that case directly.
    if values.size == 0:
        return values.astype(float)

    lowest = np.min(values)
    spread = np.max(values) - lowest

    # Constant input: every element equals `lowest`, so the shifted values
    # are already the all-zero answer and no division is needed.
    if spread == 0:
        return (values - lowest).astype(float)

    return (values - lowest) / spread

In [None]:
# Parse the JSON file at chrom_sizes_path; its keys are iterated later and
# each value is divided by RESOLUTION to size that key's matrix.
with open(chrom_sizes_path) as json_file:
    sizes = json.loads(json_file.read())

In [None]:
# One square matrix per entry of `sizes`, keyed by the same name.
arrays = {}
for key in sizes:
    n_bins = int(sizes[key]/RESOLUTION)
    size = (n_bins, n_bins)

    # np.zeros rather than np.empty: the parsing step only assigns the cells
    # that occur in the input file, so every other cell must start at 0
    # instead of holding whatever bytes happened to be in uninitialised memory.
    arrays[str(key)] = np.zeros(size)

    print('Empty matrix for CHR',key,'is created with size',arrays[key].shape)

In [None]:
# Fill the per-chromosome matrices from the tab-separated file at gin_path.
# Columns read: 0 = first chromosome, 1 = first position, 3 = second
# chromosome, 4 = second position, last = value.
entries_written = 0
with open(gin_path,'r') as gfile:

    for line in gfile:
        curr = line.split('\t')

        # NOTE(review): a position smaller than RESOLUTION gives index -1 here,
        # which NumPy treats as the LAST row/column. Confirm positions in the
        # input never fall below RESOLUTION, or that this offset is intended.
        chrom,i,j,value = curr[0], int(int(curr[1]) / RESOLUTION) - 1, int(int(curr[4]) / RESOLUTION) - 1, int(curr[-1])

        # Reading stops at the first 'MT' record; later lines are not parsed.
        if chrom == 'MT':
            break

        # Only records whose two chromosomes match are stored.
        if chrom == curr[3]:
            arrays['chr' + chrom][i, j] = value
            entries_written += 1

# A single summary line replaces the former per-record print, which emitted
# one output line for every stored value.
print('Intrachromosomal entries written:', entries_written)

In [None]:
# Write every matrix to its own text file (path built from nbyn_path),
# formatting each value with '%i'.
for chrom_num, matrix in arrays.items():
    print('Saving CHR',chrom_num)
    np.savetxt(nbyn_path.format(chrom_num), matrix, fmt='%i')

In [None]:
# Start from empty accumulators so that re-running this cell neither appends
# a second copy of every chromosome nor fails because a later cell rebound
# these names to NumPy arrays.
hic_new, snp_new, d_new = [], [], []

for chrom in CHROMOSOMES:
    print(chrom,'\timporting hic matrix')
    hic_matrix = np.loadtxt(hic_matrix_path.format(chrom))

    print(chrom,'\timporting snp splice matrix')
    snp_matrix = np.loadtxt(snp_matrix_path.format(chrom))

    print(chrom,'\timporting d matrix')
    d_matrix = np.loadtxt(d_path.format(chrom))

    print(chrom,'\tpreparing data')
    # The three matrices are paired cell by cell after flattening. If their
    # shapes differ, the flattened positions no longer refer to the same cell,
    # so fail loudly instead of silently pairing unrelated values.
    if not (hic_matrix.shape == snp_matrix.shape == d_matrix.shape):
        raise ValueError(
            '{}: matrix shapes differ (hic {}, snp {}, d {})'.format(
                chrom, hic_matrix.shape, snp_matrix.shape, d_matrix.shape))

    hic_matrix = hic_matrix.ravel()
    snp_matrix = snp_matrix.ravel()
    d_matrix = d_matrix.ravel()

    # Keep every cell except those where HiC and SNP are both exactly 0.
    # The boolean mask replaces a Python-level loop over every cell.
    keep = ~((hic_matrix == 0) & (snp_matrix == 0))

    hic_new.extend(hic_matrix[keep])
    snp_new.extend(snp_matrix[keep])
    d_new.extend(d_matrix[keep])

In [None]:
# Rescale each pooled column independently with normalizeData.
hic_new, d_new, snp_new = map(normalizeData, (hic_new, d_new, snp_new))

In [None]:
# Assemble the regression table column by column; 'Intercept' is a column of
# ones with one entry per HiC value.
data_dict = {}
data_dict['HiC'] = hic_new
data_dict['SNP Splice'] = snp_new
data_dict['D'] = d_new
data_dict['Intercept'] = np.ones(len(hic_new))

data = pd.DataFrame(data_dict)

In [None]:
# The model is fitted once on `data`, which pools rows from every chromosome,
# so the log line no longer prints the leftover loop variable `chrom`.
print('all chromosomes','\tcalculating linear regression')

# Ordinary least squares: HiC explained by SNP Splice, D and a constant term.
model = sm.OLS(data['HiC'],data[['SNP Splice','Intercept','D']])
# Alternative model kept for reference (not executed):
#model = NegativeBinomial(data['HiC'],data[['SNP Splice','Intercept','D']],loglike_method='geometric')
results = model.fit()

In [None]:
# `results` comes from the single pooled fit, so the log line no longer prints
# the leftover loop variable `chrom`.
print('all chromosomes','\twriting results')

# Build the summary once and reuse it for both the notebook and the file.
summary = results.summary()
print(summary)

# The context manager closes the file even if writing raises.
with open(result_path, 'w') as sourceFile:
    print(summary, file=sourceFile)