# In [None]:
import sys
import pysam
import numpy as np
import pandas as pd
import math
from scipy.spatial import distance
from scipy.spatial.distance import pdist, squareform
from scipy.stats import pearsonr, spearmanr

'''
dir="/Users/mhryansohn/Desktop/01.Workspace/01.Projects/03.AK1-PacBio/01.DNA/Public_Data/HG002"

The continuous probability range of 0.0 to 1.0 is remapped to the discrete integers 0 to 255 
inclusively in the ML tag. The probability range corresponding to an integer N is N/256 to (N + 1)/256.
'''

def num_match_rate_calculation(read_segment):
    """Return the fraction of the read covered by CIGAR sequence-match operations.

    Sums the lengths of every CIGAR operation whose code is 7 and divides the
    total by ``read_segment.query_length``.

    Args:
        read_segment: Object exposing ``cigartuples`` (an iterable of
            ``(operation, length)`` pairs, or ``None``) and ``query_length``.
            Presumably a ``pysam.AlignedSegment`` -- confirm against callers.

    Returns:
        Matched length divided by the query length. ``0.0`` is returned when
        the read carries no CIGAR or reports a zero/absent query length, so
        that such reads no longer raise ``TypeError`` / ``ZeroDivisionError``.
    """
    # Operation code 7 is '=' (sequence match) in the BAM CIGAR encoding;
    # alignment-match 'M' (code 0) is deliberately not counted here.
    seq_match_op = 7
    read_segment_length = read_segment.query_length
    cigar = read_segment.cigartuples
    if not cigar or not read_segment_length:
        return 0.0
    ref_match = sum(length for op, length in cigar if op == seq_match_op)
    return ref_match / read_segment_length

def get_read_level_cg(read_segment, pos):
    """Collect one read's modification probabilities at given reference positions.

    Args:
        read_segment: Aligned read exposing ``modified_bases``, ``is_reverse``,
            ``get_reference_positions()`` and ``get_aligned_pairs()``
            (presumably a ``pysam.AlignedSegment`` -- confirm against callers).
        pos: Sequence of reference coordinates to report. Its first and last
            elements are used as the span the read has to cover, so it is
            assumed to be sorted in ascending order.

    Returns:
        A ``numpy.ndarray`` holding one value per aligned pair whose reference
        coordinate is in ``pos``: the ML value divided by 256 when the read has
        a modification call at the paired query position, ``np.nan`` otherwise.
        ``None`` is returned when the read has no modification calls, has no
        aligned reference positions, ``pos`` is empty, or the read does not
        span ``pos[0]`` .. ``pos[-1]``.
    """
    # Read the attribute once; the original evaluated it twice.
    modified_bases = read_segment.modified_bases
    if not modified_bases:
        return None

    reference_positions = read_segment.get_reference_positions()
    # Guard the [0] / [-1] lookups below against empty inputs.
    if not reference_positions or len(pos) == 0:
        return None
    if reference_positions[0] > pos[0] or reference_positions[-1] < pos[-1]:
        return None

    # Only the first entry of modified_bases is used: query index -> ML value.
    # NOTE(review): assumes the first entry is the modification of interest.
    cg_ml = dict(next(iter(modified_bases.values())))
    if read_segment.is_reverse:
        # Reverse-strand reads: shift every query index down by one
        # (presumably to line the call up with the C of the reference CpG --
        # TODO confirm).
        cg_ml = {query_pos - 1: ml for query_pos, ml in cg_ml.items()}

    # Set membership keeps the scan over aligned pairs linear in read length.
    wanted = set(pos)
    # An ML integer N stands for the probability bin [N/256, (N+1)/256);
    # N/256, the lower edge of that bin, is what gets reported.
    cg_met = [
        cg_ml[query_pos] / 256 if query_pos in cg_ml else np.nan
        for query_pos, ref_pos in read_segment.get_aligned_pairs()
        if ref_pos in wanted
    ]
    return np.array(cg_met)

def make_cg_matrix(alignmentfile, refcgtab, coordinate):
    """Build the matrix of per-read modification probabilities for one window.

    Args:
        alignmentfile: Object whose ``fetch(chrom, start, end)`` yields reads
            (presumably a ``pysam.AlignmentFile`` -- confirm against callers).
        refcgtab: Table indexed by window ID with the columns ``chromosome``,
            ``start``, ``end`` and ``CG_Pos``; ``CG_Pos`` is a ';'-separated
            string of integer positions, optionally ending in ';'.
        coordinate: Index label of the window to process.

    Returns:
        ``np.array`` built from the vectors that ``get_read_level_cg`` returned
        for the fetched reads, skipping reads for which it returned ``None``.
    """
    # Look the window row up once instead of once per column.
    window = refcgtab.loc[coordinate]
    cg_pos = [int(p) for p in window['CG_Pos'].rstrip(';').split(';')]
    chrom, start, end = window['chromosome'], window['start'], window['end']

    cg_matrix_list = []
    for read in alignmentfile.fetch(chrom, start, end):
        read_level_cg = get_read_level_cg(read, cg_pos)
        # Identity test: `!= None` on an ndarray compares element by element.
        if read_level_cg is not None:
            cg_matrix_list.append(read_level_cg)
    return np.array(cg_matrix_list)

def remove_nan_column_from_cg_matrix(matrix):
    """Drop every column of ``matrix`` that contains at least one NaN.

    The remaining columns keep their original order and all rows are kept.
    """
    has_nan = np.isnan(matrix).any(axis=0)
    keep_columns = np.logical_not(has_nan)
    return matrix[:, keep_columns]

def get_pairwise_distances_from_cg_matrix(matrix):
    """Average the pairwise cosine similarity and Euclidean distance of rows.

    Args:
        matrix: 2-D array with one row per read; per the original note this is
            expected to be the output of remove_nan_column_from_cg_matrix().

    Returns:
        Tuple ``(average cosine similarity, average Euclidean distance)`` taken
        over all unordered pairs of rows. Both values are NaN when the matrix
        has fewer than two rows, because no pair exists.

    TODO (original author): a cutoff on the number of reads within a tile
    still needs to be added.
    """
    matrix = np.asarray(matrix)
    if matrix.shape[0] < 2:
        # No pairs to average; avoids the division by zero of the old code.
        return np.nan, np.nan

    # pdist already yields exactly one value per unordered row pair, so the
    # averages can be taken directly without building dense n x n matrices.
    cosine_distances = pdist(matrix, 'cosine')
    euclidean_distances = pdist(matrix, 'euclidean')
    n_pairs = cosine_distances.size

    # Average pairwise cosine similarity.
    # NOTE (original author): similarities of 1 and -1 would cancel out.
    average_pairwise_cos_similarity = np.sum(1 - cosine_distances) / n_pairs
    # Average pairwise euclidean distance
    average_pairwise_euc_distance = np.sum(euclidean_distances) / n_pairs
    # Other Metrics... (TBD)

    return average_pairwise_cos_similarity, average_pairwise_euc_distance

# Script body: for every CG window in the reference table, build the read x CG
# matrix from the BAM file and write the two average pairwise metrics to
# "<output prefix>.tab".
#
# The arguments are validated up front. Previously the whole pipeline sat
# inside `try/except IndexError`, so any IndexError raised while processing
# reads was misreported as a missing-argument problem.
if len(sys.argv) < 4:
    print('\n')
    print('Please Provide Input Arguments\n')
    print('usage: python _heterogeneity_calculation.py [PacBio Bam File] [Reference CG window] [Output prefix]')
    print('\n')
else:
    # The context manager closes the BAM handle even if processing fails.
    with pysam.AlignmentFile(sys.argv[1], 'rb') as bamfile:  # "chr20_500000-1000000_AK1.bam"
        cgwindowfile = pd.read_table(sys.argv[2], index_col=0)  # "10kb_CG_Information_hg38_primary_chrX.tab"
        with open(f"{sys.argv[3]}.tab", 'w') as outfile:
            outfile.write('ID\tchrom\tstart\tend\tMean_cos\tMean_euc\n')
            for i in cgwindowfile.index:
                window = cgwindowfile.loc[i]
                chrom, start, end = window['chromosome'], window['start'], window['end']
                cg_matrix = make_cg_matrix(bamfile, cgwindowfile, i)
                # At least two reads are needed to form a pair.
                if cg_matrix.shape[0] > 1:
                    cg_matrix = remove_nan_column_from_cg_matrix(cg_matrix)
                    # Problem 1 (original author's note):
                    #   The matrix of all read vectors has a single column that
                    #   is not np.nan (e.g., chr1:122670000-122680000). Each
                    #   read is then a one-element vector, and the cosine
                    #   similarity was reported to come out as np.nan.
                    # Problem 2 (original author's note):
                    #   Every column of the matrix contains np.nan
                    #   (e.g., chr1:2660000-2670000).
                    #   remove_nan_column_from_cg_matrix then returns something
                    #   like array([], shape=(56, 0), dtype=float64), so the
                    #   cosine similarity is np.nan and the euclidean distance
                    #   is 0.
                    #
                    # TODO (original author): all of the cases above need a
                    # threshold so that a window is only evaluated when at
                    # least a given percentage of its reference CGs is covered.

                    distances = get_pairwise_distances_from_cg_matrix(cg_matrix)
                    outfile.write(f'{i}\t{chrom}\t{start}\t{end}\t{distances[0]}\t{distances[1]}\n')
                    # Flushed per row, as in the original (presumably so that
                    # progress can be followed while the job runs).
                    outfile.flush()

'''
with open("test_cos.bg", 'w') as rfh1, open("test_euc.bg", 'w') as rfh2:
    for i in ref_10kb_cg.iloc[260800:260850].index:
        chrom, start, end = ref_10kb_cg.loc[i]['chromosome'], ref_10kb_cg.loc[i]['start'], ref_10kb_cg.loc[i]['end']
        cg_matrix = make_cg_matrix(ak1, ref_10kb_cg, i)
        if cg_matrix.shape[0] > 1:
            cg_matrix = remove_nan_column_from_cg_matrix(cg_matrix)    
            distances = get_pairwise_distances_from_cg_matrix(cg_matrix)
            rfh1.write(f'{chrom}\t{start}\t{end}\t{distances[0]}\n')
            rfh2.write(f'{chrom}\t{start}\t{end}\t{distances[1]}\n')
            rfh1.flush()
            rfh2.flush()
        #print(get_pairwise_distances_from_cg_matrix(cg_matrix))


ref_10kb_step1kb_cg = pd.read_table("10kb_step1kb_CG_Information_hg38_primary_chrX.tab", index_col=0)
with open("test_cos2.bg", 'w') as rfh1, open("test_euc2.bg", 'w') as rfh2:
    for i in ref_10kb_step1kb_cg.loc['chr20:500000-510000': 'chr20:990000-1000000'].index:
        chrom, start, end = ref_10kb_step1kb_cg.loc[i]['chromosome'], ref_10kb_step1kb_cg.loc[i]['start'], ref_10kb_step1kb_cg.loc[i]['end']
        cg_matrix = make_cg_matrix(ak1, ref_10kb_step1kb_cg, i)
        if cg_matrix.shape[0] > 1:
            cg_matrix = remove_nan_column_from_cg_matrix(cg_matrix)    
            distances = get_pairwise_distances_from_cg_matrix(cg_matrix)
            rfh1.write(f'{chrom}\t{start}\t{end}\t{distances[0]}\n')
            rfh2.write(f'{chrom}\t{start}\t{end}\t{distances[1]}\n')
            rfh1.flush()
            rfh2.flush()
        #print(get_pairwise_distances_from_cg_matrix(cg_matrix))
    

test_matrix1 = remove_nan_column_from_cg_matrix(make_cg_matrix(ak1, ref_10kb_cg, 'chr20:500000-510000'))
test_matrix2 = remove_nan_column_from_cg_matrix(make_cg_matrix(ak1, ref_10kb_cg, 'chr20:510000-520000'))


for i in range(test_matrix1.shape[0]):
    for j in range(i+1, matrix.shape[0]):
        vec_i = test_matrix1[i, :]
        vec_j = test_matrix1[j, :]
        euclidean_dist = distance.euclidean(vec_i, vec_j)
        cosine_sim = 1 - distance.cosine(vec_i, vec_j)
        pearson_corr, _ = pearsonr(vec_i, vec_j)
        spearman_corr, _ = spearmanr(vec_i, vec_j)
        print(f"Distance Metrics between vector {i+1} and vector {j+1}: {euclidean_dist}, {cosine_sim}, {pearson_corr}, {spearman_corr}")

# Test
full_length_count = list() # (#read, #CG), #CG for check with ref_10kb_cg

for i in ref_10kb_cg[ref_10kb_cg.index.str.startswith('chr20')].index:
    cg_mat = make_cg_matrix(ak1, ref_10kb_cg, i)
    full_length_count.append(np.shape(cg_mat))

full_length_count = list() # (#read, #CG), #CG for check with ref_10kb_cg
for i in ref_10kb_cg.iloc[260800:260850].index:
    cg_mat = make_cg_matrix(ak1, ref_10kb_cg, i)
    full_length_count.append(np.shape(cg_mat))
    
for i,v in enumerate(full_length_count):
    if v[1] != ref_10kb_cg.iloc[260800:260850].iloc[i]['CG_Num']:
        print('CG count mismatch')


'''
    
