In [None]:
# load X_mask_sum.csv
import numpy as np

a = np.loadtxt("../../X_mask_sum.csv", delimiter="\n")

In [2]:
import numpy as np
from scipy.sparse import csr_matrix

def create_mutation_sparse_matrix(starts_idx_tuples, end_idx_tuples, mutations_data, number_of_mutations):
    """
    Create a sparse matrix representing mutations across read segments.
    
    Args:
        starts_idx_tuples: List of tuples with start positions (inclusive) for each segment
        end_idx_tuples: List of tuples with end positions (inclusive) for each segment  
        mutations_data: List of tuples with mutation positions for each segment
        number_of_mutations: Total number of genomic positions (columns)
    
    Returns:
        scipy.sparse.csr_matrix: Sparse matrix of size len(segments) × number_of_mutations
    """
    n_segments = len(starts_idx_tuples)
    
    # Initialize dense matrix with NaN
    matrix = np.full((n_segments, number_of_mutations), np.nan)
    
    for segment_idx in range(n_segments):
        start_tuple = starts_idx_tuples[segment_idx]
        end_tuple = end_idx_tuples[segment_idx]
        mutation_tuple = mutations_data[segment_idx]
        
        # Get all covered positions for this segment
        covered_positions = set()
        for start_pos, end_pos in zip(start_tuple, end_tuple):
            covered_positions.update(range(start_pos, end_pos))  
        
        # Mark covered positions as 0 (no mutation)
        for pos in covered_positions:
            if pos < number_of_mutations:
                matrix[segment_idx, pos] = 0
        
        # Extract all mutation positions from the nested tuples
        mutation_positions = set()
        for mut_tuple in mutation_tuple:
            if isinstance(mut_tuple, tuple):
                mutation_positions.update(mut_tuple)
            else:
                mutation_positions.add(mut_tuple)
        
        # Mark mutation positions as 1
        for pos in mutation_positions:
            if pos < number_of_mutations and pos in covered_positions:
                matrix[segment_idx, pos] = 1
    
    return csr_matrix(matrix)

# Test with your original example
starts_idx_tuples = [(0,8),(2,5)]
end_idx_tuples = [(2,12),(7,9)]
mutations_data = [((1,), (8,10)), ((), (6))]
number_of_mutations = 15

result = create_mutation_sparse_matrix(starts_idx_tuples, end_idx_tuples, mutations_data, number_of_mutations)
print("Original example result:")
print(result.toarray())

print("\n" + "="*50)

# Test with my 4x8 example
starts_idx_tuples_2 = [(0,6), (1,), (3,5), (0,4)]
end_idx_tuples_2 = [(2,7), (3,), (4,6), (1,5)]
mutations_data_2 = [((1,), ()), ((),), ((3,), (6)), ((), (4,5))]
number_of_mutations_2 = 8

result2 = create_mutation_sparse_matrix(starts_idx_tuples_2, end_idx_tuples_2, mutations_data_2, number_of_mutations_2)
print("4x8 example result:")
print(result2.toarray())

Original example result:
[[ 0.  1. nan nan nan nan nan nan  1.  0.  1.  0. nan nan nan]
 [nan nan  0.  0.  0.  0.  1.  0.  0. nan nan nan nan nan nan]]

4x8 example result:
[[ 0.  1. nan nan nan nan  0. nan]
 [nan  0.  0. nan nan nan nan nan]
 [nan nan nan  1. nan  0. nan nan]
 [ 0. nan nan nan  1. nan nan nan]]


In [1]:
import pandas as pd

# Create an example DataFrame
data = {
    'qname': ['A', 'A', 'B', 'B', 'C', 'A', 'C'],
    'startIdx_0Based': [10, 15, 30, 35, 50, 20, 55],
    'endIdx_0Based': [20, 25, 40, 45, 60, 30, 65],
    'muts': [('X',), ('Y', 'Z'), ('W',), ('V',), ('U', 'T'), ('S',), ('R', 'Q')]
}

results = pd.DataFrame(data)

print("Original DataFrame:")
print(results)
print("\n")

# Group by 'qname' and aggregate other columns into tuples
grouped_results = results.groupby('qname').agg(lambda x: tuple(x)).reset_index()

# Rename the columns to indicate they're now tuples
# grouped_results.columns = ['qname', 'startIdx_0Based_tuple', 'endIdx_0Based_tuple', 'muts_tuple']

print("Grouped DataFrame:")
print(grouped_results)

Original DataFrame:
  qname  startIdx_0Based  endIdx_0Based    muts
0     A               10             20    (X,)
1     A               15             25  (Y, Z)
2     B               30             40    (W,)
3     B               35             45    (V,)
4     C               50             60  (U, T)
5     A               20             30    (S,)
6     C               55             65  (R, Q)


Grouped DataFrame:
  qname startIdx_0Based endIdx_0Based                  muts
0     A    (10, 15, 20)  (20, 25, 30)  ((X,), (Y, Z), (S,))
1     B        (30, 35)      (40, 45)          ((W,), (V,))
2     C        (50, 55)      (60, 65)      ((U, T), (R, Q))


In [3]:
(10,2) == (2,10)

False

In [7]:
results_ablolute_positions = (
        results.groupby(["startIdx_0Based"])
        .agg({"endIdx_0Based": "sum", })
        .reset_index()
    )

SyntaxError: invalid syntax (1705827042.py, line 3)

In [6]:
results_ablolute_positions

Unnamed: 0,startIdx_0Based,endIdx_0Based
0,10,20
1,15,25
2,20,30
3,30,40
4,35,45
5,50,60
6,55,65


In [None]:
import numpy as np
from scipy.sparse import csr_matrix

def create_mutation_sparse_matrix(starts_idx_tuples, end_idx_tuples, mutations_data, number_of_mutations):
    """
    Optimized sparse matrix creation for genomic mutation data.
    
    Args:
        starts_idx_tuples: List of tuples with start positions (inclusive) for each segment
        end_idx_tuples: List of tuples with end positions (inclusive) for each segment
        mutations_data: List of tuples with mutation positions for each segment
        number_of_mutations: Total number of genomic positions (columns)
    
    Returns:
        scipy.sparse.csr_matrix: Sparse matrix where:
        - 0 = covered position, no mutation
        - 1 = covered position, mutation present  
        - nan = position not covered by this read segment
    """
    n_segments = len(starts_idx_tuples)
    
    # Collect sparse matrix components efficiently
    row_indices = []
    col_indices = []
    data_values = []
    
    for segment_idx in range(n_segments):
        start_tuple = starts_idx_tuples[segment_idx]
        end_tuple = end_idx_tuples[segment_idx]
        mutation_tuple = mutations_data[segment_idx]
        
        if len(start_tuple) == 0:
            continue
            
        # Vectorized range expansion - much faster than Python loops
        covered_ranges = [np.arange(start, end + 1) for start, end in zip(start_tuple, end_tuple)]
        covered_positions = np.concatenate(covered_ranges)
        
        # Remove duplicates and filter valid positions in one operation
        covered_positions = np.unique(covered_positions)
        covered_positions = covered_positions[covered_positions < number_of_mutations]
        
        if len(covered_positions) == 0:
            continue
        
        # Extract mutation positions efficiently
        mutation_positions = set()
        for mut_tuple in mutation_tuple:
            if isinstance(mut_tuple, tuple):
                mutation_positions.update(mut_tuple)
            elif mut_tuple is not None:
                mutation_positions.add(mut_tuple)
        
        # Create segment data: default to 0 (covered, no mutation)
        segment_data = np.zeros(len(covered_positions), dtype=np.float64)
        
        # Mark mutations as 1 using vectorized operations
        if mutation_positions:
            mutation_array = np.array(list(mutation_positions))
            mutation_mask = np.isin(covered_positions, mutation_array)
            segment_data[mutation_mask] = 1.0
        
        # Collect data for sparse matrix construction
        row_indices.extend([segment_idx] * len(covered_positions))
        col_indices.extend(covered_positions)
        data_values.extend(segment_data)
    
    # Convert to efficient numpy arrays
    row_indices = np.array(row_indices, dtype=np.int32)
    col_indices = np.array(col_indices, dtype=np.int32)
    data_values = np.array(data_values, dtype=np.int32)
    
    # Initialize result matrix with NaN (unavoidable for NaN semantics)
    result_matrix = np.full((n_segments, number_of_mutations), np.nan, dtype=np.float64)
    
    # Efficiently populate only covered positions
    if len(row_indices) > 0:
        result_matrix[row_indices, col_indices] = data_values
    
    return csr_matrix(result_matrix)

In [5]:
# Test with my 4x8 example
starts_idx_tuples_2 = [(0,6), (1,), (3,5), (0,4)]
end_idx_tuples_2 = [(2,7), (3,), (4,6), (1,5)]
mutations_data_2 = [((1,), ()), ((),), ((3,), (6)), ((), (4,5))]
number_of_mutations_2 = 8

result2 = create_mutation_sparse_matrix(starts_idx_tuples_2, end_idx_tuples_2, mutations_data_2, number_of_mutations_2)
print("4x8 example result:")
print(result2.toarray())

4x8 example result:
[[ 0.  1.  0. nan nan nan  0.  0.]
 [nan  0.  0.  0. nan nan nan nan]
 [nan nan nan  1.  0.  0.  1. nan]
 [ 0.  0. nan nan  1.  1. nan nan]]
