In [43]:
import numpy as np
import time
# data  : (n, 500)  array you want to modify
# starts: length-n   list/array with the first index (inclusive) in every row
# ends  : length-n   list/array with the last index (exclusive) in every row

nb_reads = 100000
nb_mutations = 500
data = np.zeros((nb_reads, nb_mutations))

global_start_time = time.time()
start_time = time.time()
starts = [np.random.randint(0, 100) for _ in range(nb_reads)]
ends   = [starts[i] + np.random.randint(0, 1) for i in range(nb_reads)]
end_time = time.time()
print(f"Time to generate starts and ends: {end_time - start_time:.4f} seconds")
starts = np.array(starts)
ends = np.array(ends)
start_time = time.time()
cols = np.arange(data.shape[1])          # shape (500,)
mask = (cols >= starts[:, None]) & (cols < ends[:, None])
end_time = time.time()  
print(f"Time to generate mask: {end_time - start_time:.4f} seconds")
# mask[i, j] == True  ⇔  starts[i] ≤ j < ends[i]
start_time = time.time()
data[mask] = 1
end_time = time.time()
print(f"Time to fill data: {end_time - start_time:.4f} seconds")

print(f"Time to run the script: {time.time() - global_start_time:.4f} seconds")



Time to generate starts and ends: 0.5713 seconds
Time to generate mask: 0.1452 seconds
Time to fill data: 0.0058 seconds
Time to run the script: 0.7398 seconds


In [45]:
from itertools import chain

sets_list = [{6}, {2, 6, 5}]
result = chain.from_iterable(sets_list)

print(result)          # -> [6, 2, 6, 5]


<itertools.chain object at 0x0000015EF6E02CB0>


In [47]:
import itertools, random, timeit

# sample data (1000 sets, each with 10 ints)
data = [set(random.sample(range(1000), 10)) for _ in range(10)]

bench = {
    "itertools.chain":
        "list(itertools.chain.from_iterable(data))",
    "list-comp":
        "[x for s in data for x in s]",
    "extend loop":
        "out=[]\nfor s in data:\n    out.extend(s)"
}

for name, stmt in bench.items():
    t = timeit.timeit(stmt, globals=globals(), number=10_000)
    print(f"{name:14}: {t:.4f} s")


itertools.chain: 0.0176 s
list-comp     : 0.0289 s
extend loop   : 0.0115 s


In [51]:
#tesdt 3*5 boolean matrix
a = np.array([[True, False, True, False, True], [False, True, False, False, False], [True, False, True, False, True]])
np.sum(a, axis=0)


array([2, 1, 2, 0, 2])

In [7]:
a = [5,2,1,99,0]
a.sort()
a

[0, 1, 2, 5, 99]

In [1]:
import numpy as np

In [3]:
np.bincount([5,5,2,2,2,8,1,1], minlength=4)

array([0, 2, 3, 0, 0, 2, 0, 0, 1])

In [26]:
# %%
import numpy as np
import time
def get_sample_bootstrap_weight(weights, size=None):
    if size is None:
        size = np.sum(weights)
    positions = np.random.choice(
        np.arange(len(weights)),
        size=size,
        replace=True,
        p=weights / np.sum(weights),
    )
    return np.bincount(positions, minlength=len(weights))

In [30]:
a = [int(np.random.randint(1, 30)) for i in range(20000)]


In [31]:
start_time = time.time()
get_sample_bootstrap_weight(a)
print("time used:", time.time()-start_time)

time used: 0.0838174819946289


In [6]:
print(a)

None


In [2]:
a.sort()

In [None]:
>>> import random, timeit, itertools
>>> sets_list = [set(range(10)) for _ in range(1000)]

# chain.from_iterable
>>> timeit.timeit('list(itertools.chain.from_iterable(sets_list))',
...               globals=globals(), number=1000)
0.0098 s

# nested list-comprehension
>>> timeit.timeit('[x for s in sets_list for x in s]',
...               globals=globals(), number=1000)
0.0124 s


In [42]:
a = [[1, 2, 3], [4, 5, 6], [7, 8]]
ar = np.array(a)
print(ar[0])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.

In [21]:
import numpy as np
from scipy.sparse import csr_matrix

def create_mutation_sparse_matrix_vectorised(starts_idx_tuples,
                                             end_idx_tuples,
                                             mutations_data,
                                             number_of_mutations):
    """
    Same result as `create_mutation_sparse_matrix_optimised`, but the Python
    loop that *extends* row/col/data lists is removed.  Only four vectorised
    numpy calls remain.
    """
    n_segments = len(starts_idx_tuples)

    # 1) Covered positions for every segment (still vectorised *per* segment)
    covered = [_get_covered_positions_vectorized(s, e, number_of_mutations)
               for s, e in zip(starts_idx_tuples, end_idx_tuples)]

    # 2) How many covered positions per segment?
    lengths = np.fromiter((len(c) for c in covered), dtype=np.int32)

    # 3) Flatten covered positions & build row indices in one shot
    col_indices = np.concatenate(covered)                    # shape (nnz,)
    row_indices = np.repeat(np.arange(n_segments,
                                      dtype=np.int32),       # 0 … n_segments-1
                            lengths)                          # repeat each i

    # 4) Vectorised “is this a mutation?” for each segment
    data_values = np.concatenate([
        np.isin(cov, list(_extract_mutation_positions(muts)), assume_unique=True)
        for cov, muts in zip(covered, mutations_data)
    ]).astype(np.bool_)

    # 5) Build the sparse matrix once
    return csr_matrix((data_values,
                       (row_indices, col_indices)),
                      shape=(n_segments, number_of_mutations),
                      dtype=np.bool_)


def _get_covered_positions_vectorized(start_tuple, end_tuple, number_of_mutations):
    """
    Vectorized calculation of covered positions for a segment.
    """
    # Generate all covered positions using vectorized operations
    covered_ranges = [np.arange(start, end + 1, dtype=np.int32) 
                     for start, end in zip(start_tuple, end_tuple)]
    
    if not covered_ranges:
        return np.array([], dtype=np.int32)
    
    # Concatenate and deduplicate in one operation
    covered_positions = np.concatenate(covered_ranges)
    # covered_positions = np.unique(covered_positions)
    
    # Filter valid positions (within bounds)
    # covered_positions = covered_positions[covered_positions < number_of_mutations]
    
    return covered_positions


def _extract_mutation_positions(mutation_tuple):
    """
    Efficiently extract all mutation positions from nested tuple structure.
    """
    mutation_positions = set()
    
    for mut_item in mutation_tuple:
        if isinstance(mut_item, tuple):
            mutation_positions.update(mut_item)
        elif mut_item is not None:
            mutation_positions.add(mut_item)
    
    return mutation_positions


def _create_segment_data(covered_positions, mutation_positions):
    """
    Create data array for segment: False for covered positions, True for mutations.
    """
    # Start with all False (covered, no mutation)
    segment_data = np.zeros(len(covered_positions), dtype=np.bool_)
    
    # Mark mutations as True using vectorized boolean indexing
    if mutation_positions:
        mutation_array = np.array(list(mutation_positions), dtype=np.int32)
        mutation_mask = np.isin(covered_positions, mutation_array)
        segment_data[mutation_mask] = True
    
    return segment_data

In [23]:
# Test the optimized implementation
import time

# Create test data
n_segments = 1000000
number_of_mutations = 500

# Simulate realistic genomic data
np.random.seed(42)
starts_idx_tuples = []
end_idx_tuples = []
mutations_data = []

for i in range(n_segments):
    # Each segment has 1-3 coverage ranges
    n_ranges = np.random.randint(1, 4)
    starts = sorted(np.random.randint(0, 700, n_ranges))
    ends = [start + np.random.randint(10, 50) for start in starts]
    
    # Some mutations in covered regions
    mutations = []
    for start, end in zip(starts, ends):
        if np.random.random() < 0.3:  # 30% chance of mutations in this range
            n_mutations = np.random.randint(1, min(5, end-start+1))
            mut_positions = np.random.randint(start, end+1, n_mutations)
            mutations.append(tuple(mut_positions))
        else:
            mutations.append(tuple())
    
    starts_idx_tuples.append(tuple(starts))
    end_idx_tuples.append(tuple(ends))
    mutations_data.append(tuple(mutations))

print(f"Test data: {n_segments} segments, {number_of_mutations} positions")

# Test the optimized function
start_time = time.time()
sparse_result = create_mutation_sparse_matrix_vectorised(
    starts_idx_tuples, end_idx_tuples, mutations_data, number_of_mutations
)
end_time = time.time()

print(f"Optimized version completed in: {end_time - start_time:.4f} seconds")
print(f"Result shape: {sparse_result.shape}")
print(f"Non-zero elements: {sparse_result.nnz}")
print(f"Sparsity: {1 - sparse_result.nnz / (sparse_result.shape[0] * sparse_result.shape[1]):.4f}")
print(f"Memory efficiency: Only storing {sparse_result.nnz} values instead of {sparse_result.shape[0] * sparse_result.shape[1]} values")
print(f"Data type: {sparse_result.dtype}")

# Show a small sample of the results
print(f"\nSample data (first 5 segments, first 20 positions):")
sample = sparse_result[:5, :20].toarray()
print(sample)


Test data: 1000000 segments, 500 positions


ValueError: axis 1 index 748 exceeds matrix dimension 500

In [12]:
# Demonstration: Missing values vs 0s vs 1s in sparse matrix
print("=== Understanding sparse matrix representation ===")

# Create a small example to show the difference
small_starts = [tuple([0, 5]), tuple([2]), tuple([])]  # segment 3 has no coverage
small_ends = [tuple([2, 7]), tuple([4]), tuple([])]
small_mutations = [tuple([tuple([1]), tuple([6])]), tuple([tuple([3])]), tuple([])]

small_result = create_mutation_sparse_matrix_optimized(small_starts, small_ends, small_mutations, 10)

print("Small example:")
print("Segment 0: covers positions 0,1,2,5,6,7 with mutations at 1,6")
print("Segment 1: covers positions 2,3,4 with mutation at 3") 
print("Segment 2: covers no positions")
print()

# Convert to dense to see the actual values
dense_version = small_result.toarray()
print("Dense representation:")
print(dense_version)
print()

print("Key points:")
print("- 0 = covered position, NO mutation")
print("- 1 = covered position, WITH mutation") 
print("- 0 in dense view = NOT COVERED (missing data)")
print()

print("Sparse matrix details:")
print(f"Shape: {small_result.shape}")
print(f"Stored values: {small_result.data}")
print(f"Row indices: {small_result.indices}")  
print(f"Column indices: {small_result.indptr}")
print()

print("=== The sparse matrix ONLY stores covered positions ===")
print("- If a position is covered but no mutation: stores 0")
print("- If a position is covered with mutation: stores 1") 
print("- If a position is NOT covered: stores NOTHING (missing/implicit)")
print()
print("When you convert to dense (.toarray()), scipy fills missing entries with 0,")
print("but the original sparse matrix distinguishes between 'stored 0' and 'missing'")

# Show how to check if a position is actually covered vs missing
print("\n=== How to distinguish covered-no-mutation vs missing ===")
row, col = 0, 3  # segment 0, position 3
is_covered = small_result[row, col] != 0 or (row, col) in zip(*small_result.nonzero())
print(f"Position ({row},{col}) is covered: {is_covered}")

row, col = 0, 1  # segment 0, position 1 (covered, no mutation)  
has_explicit_entry = len(small_result[row, [col]].data) > 0
print(f"Position ({row},{col}) has explicit entry: {has_explicit_entry}")
print(f"Value at ({row},{col}): {small_result[row, col]}")


=== Understanding sparse matrix representation ===
Small example:
Segment 0: covers positions 0,1,2,5,6,7 with mutations at 1,6
Segment 1: covers positions 2,3,4 with mutation at 3
Segment 2: covers no positions

Dense representation:
[[False  True False False False False  True False False False]
 [False False False  True False False False False False False]
 [False False False False False False False False False False]]

Key points:
- 0 = covered position, NO mutation
- 1 = covered position, WITH mutation
- 0 in dense view = NOT COVERED (missing data)

Sparse matrix details:
Shape: (3, 10)
Stored values: [False  True False False  True False False  True False]
Row indices: [0 1 2 5 6 7 2 3 4]
Column indices: [0 6 9 9]

=== The sparse matrix ONLY stores covered positions ===
- If a position is covered but no mutation: stores 0
- If a position is covered with mutation: stores 1
- If a position is NOT covered: stores NOTHING (missing/implicit)

When you convert to dense (.toarray()), scip

In [None]:
# Test with my 4x8 example
starts_idx_tuples_2 = [(0,6), (1,), (3,5), (0,4)]
end_idx_tuples_2 = [(2,7), (3,), (4,6), (1,5)]
mutations_data_2 = [((1,), ()), ((),), ((3,), (6)), ((), (4,5))]
number_of_mutations_2 = 8

result2 = create_mutation_sparse_matrix_optimized(starts_idx_tuples_2, end_idx_tuples_2, mutations_data_2, number_of_mutations_2)
print("4x8 example result:")


4x8 example result:
[[False  True False False False False False False]
 [False False False False False False False False]
 [False False False  True False False  True False]
 [False False False False  True  True False False]]


In [14]:
result2

<Compressed Sparse Row sparse matrix of dtype 'bool'
	with 16 stored elements and shape (4, 8)>