In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.spatial.distance import pdist, squareform


# CC Library
What are the Hamming distances between designed tiles?

In [None]:
# # What is the distance between HawkBCs?
# initial_HawkBCs_qual = initial[initial["HawkBCs_qual"] == True]
# initial_HawkBCs_qual_counts = pd.DataFrame(initial_HawkBCs_qual["HawkBCs"].value_counts()).reset_index()
# initial_HawkBCs_qual_unique = initial_HawkBCs_qual_counts["HawkBCs"]
# HawkBCs_dist = nearest_neighbors_parallel_df(list(initial_HawkBCs_qual_unique))
# HawkBCs_dist

In [None]:
design_file = pd.read_csv("../../data/DNA_Tiles_nkx2_2.txt",header=None)
design_file

In [None]:
# def add_min_hamming_dist(seq_list):
#     # Step 1: Convert to 2D char array
#     seq_array = np.array([list(seq) for seq in seq_list])

#     # Step 2: Compute pairwise Hamming distances
#     dist_matrix = pdist(seq_array, metric=lambda u, v: np.sum(u != v))
#     dist_matrix_square = squareform(dist_matrix)

#     # Step 3: Set diagonal to inf and compute minimum per sequence
#     np.fill_diagonal(dist_matrix_square, np.inf)
#     min_pairwise_distances = np.min(dist_matrix_square, axis=1)

#     # Step 4: Combine tiles and min distances into a DataFrame
#     df = pd.DataFrame({
#         'Tile': seq_list,
#         'Min_Hamming_Distance': min_pairwise_distances
#     })

#     return df

In [None]:
# def hamming_distance(seq1, seq2):
#     """Compute Hamming distance between two equal-length sequences."""
#     return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))

# def add_min_hamming_dist(query_seqs, ref_seqs=None, n_jobs=-1, batch_size=100):
#     """
#     Compute the minimum Hamming distance for each sequence in query_seqs 
#     against all sequences in ref_seqs (or within query_seqs if ref_seqs is None).

#     Args:
#         query_seqs (list[str]): Sequences to evaluate.
#         ref_seqs (list[str] | None): Reference sequences. If None, use query_seqs.
#         n_jobs (int): Number of parallel jobs (default = -1, all cores).
#         batch_size (int): Chunk size for parallelization.

#     Returns:
#         pd.DataFrame: query sequence + min hamming distance.
#     """
#     if ref_seqs is None:
#         ref_seqs = query_seqs
    
#     # Ensure array format
#     query_seqs = np.array(query_seqs)
#     ref_seqs = np.array(ref_seqs)

#     def min_dist_for_seq(seq, ref_seqs):
#         dists = (hamming_distance(seq, ref) for ref in ref_seqs if ref != seq)
#         return min(dists, default=np.inf)

#     # Parallel with progress bar
#     results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
#         delayed(min_dist_for_seq)(seq, ref_seqs) for seq in tqdm(query_seqs, desc="Computing min Hamming")
#     )

#     return pd.DataFrame({
#         "Tile": query_seqs,
#         "Min_Hamming_Distance": results
#     })

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from rapidfuzz.distance import Hamming

def add_min_hamming_dist(query_seqs, ref_seqs=None, batch_size=1000):
    """
    Compute the minimum Hamming distance for each sequence in query_seqs
    against all sequences in ref_seqs (or within query_seqs if ref_seqs is None).

    A reference that is identical to the query is never compared (this is how
    a sequence is kept from matching itself), so exact duplicates / exact
    matches do not yield a distance of 0.

    Args:
        query_seqs (list[str]): Sequences to evaluate.
        ref_seqs (list[str] | None): Reference sequences. If None, use query_seqs.
        batch_size (int): Number of queries covered by one progress-bar step.
            Must be positive.

    Returns:
        pd.DataFrame: column "Tile" holds the query sequences and column
        "Min_Hamming_Distance" holds the minimum distance, or np.inf when no
        differing reference sequence exists.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size < 1:
        # Fail early with a clear message instead of an unrelated range() /
        # DataFrame-length error further down.
        raise ValueError("batch_size must be a positive integer.")

    if ref_seqs is None:
        ref_seqs = query_seqs

    query_seqs = np.array(query_seqs, dtype="U")
    # Repeated reference sequences cannot change a minimum, so each distinct
    # reference is compared only once.
    ref_seqs = np.unique(np.array(ref_seqs, dtype="U"))

    results = []
    for i in tqdm(range(0, len(query_seqs), batch_size), desc="Computing min Hamming"):
        for q in query_seqs[i:i+batch_size]:
            dists = (Hamming.distance(q, r) for r in ref_seqs if r != q)
            results.append(min(dists, default=np.inf))

    return pd.DataFrame({
        "Tile": query_seqs,
        "Min_Hamming_Distance": results
    })


In [None]:
EC_design_file = pd.read_csv("../../data/a10_designfile.csv")
EC_design_file

In [None]:
# # How similar are Emily's sequences to one another?
# EC_df = add_min_hamming_dist(EC_design_file["ArrayDNA"])
# EC_df.to_csv("../../data/EC_design_file_hamming_dist.csv")

In [None]:
EC_df = pd.read_csv("../../data/EC_design_file_hamming_dist.csv", index_col = 0)
EC_df

In [None]:
# 90% of sequences in Emily's library have a minimum hamming distance over 6
EC_df[EC_df["Min_Hamming_Distance"] > 6]

In [None]:
EC_df["Min_Hamming_Distance"].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.title("Emily's Design File")
sns.histplot(EC_df["Min_Hamming_Distance"], binwidth = 1)
sns.despine()

In [None]:
CC_df = add_min_hamming_dist(design_file[0])
CC_df

In [None]:
CC_df[CC_df["Min_Hamming_Distance"] > 3]

In [None]:
CC_df["Min_Hamming_Distance"].value_counts()

In [None]:
# Only 25% of sequences in Caitlin's design file have hamming distance over 6
CC_df[CC_df["Min_Hamming_Distance"] > 6]

In [None]:
plt.title("Caitlin's Design File")
sns.histplot(CC_df["Min_Hamming_Distance"], binwidth = 1)
sns.despine()

In [None]:
sns.set_context('talk')
plt.figure(dpi = 300, figsize = (6,4))
sns.histplot(EC_df["Min_Hamming_Distance"], binwidth = 1, stat = 'density', label = "GCN4")
sns.histplot(CC_df["Min_Hamming_Distance"], binwidth = 1, stat = 'density', alpha = 0, edgecolor = 'none')
plt.xlabel("Minimum Hamming Distance")
plt.legend()
sns.despine()

In [None]:
sns.set_context('talk')
plt.figure(dpi = 300, figsize = (6,4))
sns.histplot(EC_df["Min_Hamming_Distance"], binwidth = 1, stat = 'density', label = "GCN4")
sns.histplot(CC_df["Min_Hamming_Distance"], binwidth = 1, stat = 'density', label = "NKX2-2")
plt.xlabel("Minimum Hamming Distance")
plt.legend()
sns.despine()

Does each AD have only 1 Hawkins BC?

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
from dask.diagnostics import ProgressBar
import duckdb

os.chdir("../scripts")
from mapping import BarcodeMapper
from map_refiner import MapRefiner

In [None]:
# Build a BarcodeMapper from one assembled FASTQ file and the tile design file,
# then create the map and preview its first rows.
# NOTE(review): the three 4-element lists that follow the column names look like
# per-column flanking sequences (two lists) and expected lengths
# (6, 9, 16, 162) — confirm the argument order against BarcodeMapper.
mapper = BarcodeMapper(['../data/Staller_Tl4s1_MVS_0035_I1TTGTCACCAA_CGCACGAACA_S326.fastq.gz.assembled.fastq'],
                        "../data/DNA_Tiles_nkx2_2.txt",
                        ["ADBC2", "HawkBCs", "RTBC", "AD"],
                        ["CTCGAGATAACTTCGTATAATGTATGCTAT", "GAGCTCGCTAGC", "GGCCGGCCATAGGGCCCC", "CACCATG"],
                        ["GGCCGGCCATAGGGCCCC", "CTCGAGATAA", "GCGGTCCA", "GGATCCG"],
                        [6, 9, 16, 162],
                      reverse_complement=False)
mapped_df = mapper.create_map()
mapped_df.head()

In [None]:
mapper.save_parquet('../output/CC_nkx2_2.parquet')

In [None]:
refiner = MapRefiner(db_path = "../duckdb/CC_nkx2_2",
                      cols = ["ADBC2", "HawkBCs", "RTBC", "AD"],
                     reads_threshold = 5,
                     column_pairs = [("AD", ("ADBC2", "HawkBCs")), (("ADBC2", "HawkBCs"), "RTBC")])

In [None]:
refiner.create_map1_initial('../output/CC_nkx2_2.parquet/*')
initial = refiner.get_map_df('map1_initial')
initial

In [None]:
# Keep only rows of the initial map where all four quality flags
# (AD, ADBC2, Hawk BC and RTBC) are True.
qual_flags = ["ADBC2_qual", "HawkBCs_qual", "RTBC_qual", "AD_qual"]
passes_all_flags = (initial[qual_flags] == True).all(axis=1)
quality = initial[passes_all_flags]
quality

In [None]:
quality_designed = quality[quality["Designed"] == 1]
quality_designed

In [None]:
plt.figure(dpi = 300)
sns.histplot(quality_designed["AD"].value_counts(), binwidth = 20)
sns.despine()

In [None]:
np.mean(quality_designed["AD"].value_counts())

In [None]:
quality_not_designed = quality[quality["Designed"] == 0]
quality_not_designed

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

def min_hamming_distance_parallel(seq_list, ref_list, n_jobs=-1):
    """
    Compute minimum Hamming distance from each sequence in seq_list to ref_list
    in parallel with a progress bar.

    Args:
        seq_list: Query sequences (strings). May be a pandas Series; it is
            passed through unchanged as the "Sequence" column.
        ref_list: Reference sequences (strings). Entries equal to "0" are
            dropped before comparison.
        n_jobs (int): Passed to joblib.Parallel (default -1).

    Returns:
        pd.DataFrame: columns "Sequence" and "Min_Hamming_Distance".

    Raises:
        ValueError: If no reference sequence remains after filtering, or if
            the query and reference sequences do not all share one length.
    """
    # Filter out "0" in ref_list
    ref_list = [s for s in ref_list if s != "0"]
    if not ref_list:
        # A minimum over zero references is undefined; say so explicitly
        # instead of failing later on an array shape lookup.
        raise ValueError("ref_list has no sequences to compare against.")

    queries = list(seq_list)

    # Check lengths on the strings themselves: ragged input would not form a
    # 2D array, so a shape-based check could never report it cleanly.
    lengths = {len(s) for s in queries} | {len(s) for s in ref_list}
    if len(lengths) > 1:
        raise ValueError("Sequences are not all the same length.")

    # Convert to 2D char arrays (one row per sequence, one column per position)
    seq_array = np.array([list(seq) for seq in queries])
    ref_array = np.array([list(seq) for seq in ref_list])

    # Function to compute min Hamming for one sequence
    def compute_min(i):
        # Broadcast one query row against every reference row and count
        # mismatching positions per reference.
        dists = np.sum(seq_array[i] != ref_array, axis=1)
        return dists.min()

    # Parallel computation with tqdm; an empty query list yields no tasks
    min_distances = Parallel(n_jobs=n_jobs)(
        delayed(compute_min)(i) for i in tqdm(range(len(queries)))
    )

    # Build DataFrame
    df = pd.DataFrame({
        'Sequence': seq_list,
        'Min_Hamming_Distance': min_distances
    })

    return df


In [None]:
# What is the distance between reads which pass quality check and are designed vs not designed?
quality_not_designed_dist = min_hamming_distance_parallel(quality_not_designed["AD"], quality_designed["AD"])
quality_not_designed_dist

In [None]:
quality_not_designed_dist["Min_Hamming_Distance"].value_counts()

In [None]:
sns.histplot(quality_not_designed_dist["Min_Hamming_Distance"], binwidth = 1, edgecolor = 'none')
sns.despine()

In [None]:
import math

# Number of errors in a read modelled as Binomial(N_POSITIONS, ERROR_RATE).
# Every term uses the same binomial formula so that
# 1 - P_0 - P_1 - P_2 - P_3 - P_4 is the probability of five or more errors.
# NOTE(review): 120 and 0.001 are kept from the original cell; the mapper call
# in this notebook lists 162 for the AD column — confirm the intended length.
N_POSITIONS = 120
ERROR_RATE = 0.001


def _prob_k_errors(k, n=N_POSITIONS, p=ERROR_RATE):
    """Return the binomial probability of exactly k errors among n positions."""
    return math.comb(n, k) * p ** k * (1 - p) ** (n - k)


P_0_reads = _prob_k_errors(0)
P_1_read = _prob_k_errors(1)
P_2_reads = _prob_k_errors(2)
P_3_reads = _prob_k_errors(3)
P_4_reads = _prob_k_errors(4)

In [None]:
1 - P_0_reads - P_1_read - P_2_reads - P_3_reads - P_4_reads

In [None]:
200 * (1 - P_0_reads - P_1_read - P_2_reads - P_3_reads - P_4_reads)

In [None]:
quality_not_designed_dist[quality_not_designed_dist["Min_Hamming_Distance"] == 1]["Sequence"].iloc[2]

In [None]:
quality[quality["AD"] == "AGCCTGCTGGGCCAGAGCATGGACGAGAGCGGCCTGCCTCAGCTGACCAGCTACGACTGCGAGGTGAACGCTCCCATCCAGGGCAGCAGAAACCTGCTGCAGGGCGAGGAGCTGCTGAGAGCCCTGGACCAGGTGAACGGCAGCGGCAGCGGCAGCGGCAGC"]

In [None]:
shared_hawk = quality[quality["HawkBCs"] == "CCACAGAAC"]
shared_hawk

In [None]:
quality_designed[quality_designed["HawkBCs"] == "CCACAGAAC"]["AD"].value_counts()

In [None]:
quality_designed[quality_designed["AD"] == "AGACTGCTGGGCCAGAGCATGGACGAGAGCGGCCTGCCTCAGCTGACCAGCTACGACTGCGAGGTGAACGCTCCCATCCAGGGCAGCGACAACCTGCTGCAGGGCGAGGAGCTGCTGGACGCCCTGGACCAGGTGAACGGCAGCGGCAGCGGCAGCGGCAGC"]["HawkBCs"].value_counts()

In [None]:
min_hamming_distance_parallel(quality_not_designed[quality_not_designed["HawkBCs"] == "AAGTTAGCC"]["AD"], quality_designed[quality_designed["HawkBCs"] == "AAGTTAGCC"]["AD"])#["Min_Hamming_Distance"].value_counts()

In [None]:
# Many reads which are not designed are close to designed
quality_not_designed_dist["Min_Hamming_Distance"].value_counts()

# Hawkins BCs

In [None]:
# Count the distinct (AD, HawkBCs) pairings per AD.
hawk_bcs_count = (
    initial[["AD", "HawkBCs"]]
    .drop_duplicates()
    .groupby("AD")
    .count()
    .rename(columns={"HawkBCs": "HawkBCs_Count"})
    .reset_index()
)
hawk_bcs_count

In [None]:
hawk_bcs_count["HawkBCs_Count"].value_counts()

In [None]:
# What do the BCs look like if an AD has multiple -- some seem like sequencing errors, and others are distinct

# Distinct
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 2]["AD"].iloc[3] ]["HawkBCs"].value_counts()

In [None]:
# Sequencing error
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 2]["AD"].iloc[0] ]["HawkBCs"].value_counts()

In [None]:
#  Distinct
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 3]["AD"].iloc[2] ]["HawkBCs"].value_counts()

In [None]:
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 4]["AD"].iloc[0] ]["HawkBCs"].value_counts()

In [None]:
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 5]["AD"].iloc[0] ]["HawkBCs"].value_counts()

In [None]:
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 6]["AD"].iloc[0] ]["HawkBCs"].value_counts()

In [None]:
initial[initial["AD"] == hawk_bcs_count[hawk_bcs_count["HawkBCs_Count"] == 69]["AD"].iloc[0] ]["HawkBCs"].value_counts()

In [None]:
# Majority of rows only have 1 HawkBC
# So maybe if a row has multiple Hawk BCs, those are sequencing errors -- OR switching of BCs via recombination
hawk_bcs_count["HawkBCs_Count"].value_counts()

# What is the distribution of average hamming distances between ADs that share a reporter BC?

In [None]:
initial_no_na = quality.dropna()
initial_no_na

In [None]:
# Tally how often each (ADBC2, AD) pair occurs. The tally column is named
# explicitly because later cells index it as "count", and the default name
# produced by value_counts().reset_index() differs between pandas versions.
initial_ad_bc_pairs = initial_no_na[["ADBC2", "AD"]].value_counts().reset_index(name="count")
initial_ad_bc_pairs

In [None]:
initial_ad_bc_pairs["ADBC2"].value_counts()

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# Example hamming distance function
def hamming_distance(seq1, seq2):
    """
    Count the positions at which two sequences differ.

    For equal-length inputs this is the plain Hamming distance. If the
    lengths differ, every unpaired trailing position counts as one mismatch
    (the same convention as a padded Hamming distance) instead of being
    silently ignored.

    Args:
        seq1: First sequence (must support len() and iteration).
        seq2: Second sequence (must support len() and iteration).

    Returns:
        int: Number of differing positions.
    """
    mismatches = sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))
    # zip() stops at the shorter input, so add the length difference back in.
    return mismatches + abs(len(seq1) - len(seq2))

def hamming_to_most_frequent(group):
    """
    Compare every AD in one ADBC2 group against the group's top AD.

    The reference AD is taken from the row with the largest "count"; each AD
    in the group (the reference included) is then measured against it with
    hamming_distance.

    Returns a DataFrame with the columns ADBC2, AD, count and
    Hamming_to_most_frequent, indexed like the input group.
    """
    # Reference = AD of the row holding the maximum count
    top_label = group["count"].idxmax()
    reference_ad = group.loc[top_label]["AD"]

    ad_column = group["AD"]
    per_ad_distance = ad_column.apply(lambda ad: hamming_distance(ad, reference_ad))

    columns = {
        "ADBC2": group["ADBC2"].iloc[0],  # scalar, broadcast to every row
        "AD": ad_column,
        "count": group["count"],
        "Hamming_to_most_frequent": per_ad_distance,
    }
    return pd.DataFrame(columns)

# --- Parallel execution ---
groups = [g for _, g in initial_ad_bc_pairs.groupby("ADBC2")]

results = Parallel(n_jobs=-1)(
    delayed(hamming_to_most_frequent)(group) for group in tqdm(groups, desc="Computing distances")
)

# Concatenate results
dist_df = pd.concat(results, ignore_index=True)

# # Quick summary
# print(dist_df.head())

# # Example distribution plot
# import matplotlib.pyplot as plt
# plt.hist(dist_df["Hamming_to_most_frequent"], bins=50)
# plt.xlabel("Hamming distance to most_frequent AD")
# plt.ylabel("Count")
# plt.title("Distribution of distances per ADBC2")
# plt.show()


In [None]:
dist_df

In [None]:
ax = sns.scatterplot(data = dist_df, x = "count", y = "Hamming_to_most_frequent", s = 3, edgecolor = "none", alpha = 0.1)
plt.xlabel("Reads")
sns.despine()

In [None]:
sns.histplot(dist_df["Hamming_to_most_frequent"], binwidth = 1)
sns.despine()

In [None]:
# Attach the per-pair distances to the pair counts. The join keys are spelled
# out (they are the columns both frames share) so that a column added to
# either frame later cannot silently change the join.
initial_ad_bc_pairs_with_dists = pd.merge(
    initial_ad_bc_pairs,
    dist_df,
    on=["ADBC2", "AD", "count"],
)
initial_ad_bc_pairs_with_dists

In [None]:
# Calculate the average Hamming distance to the most frequent sequence per ADBC2
avg_hamming_per_adbc2 = initial_ad_bc_pairs_with_dists.groupby("ADBC2")["Hamming_to_most_frequent"].transform("mean")

# Add the calculated column to the DataFrame
initial_ad_bc_pairs_with_dists["Avg_Hamming_to_Most_Frequent"] = avg_hamming_per_adbc2
initial_ad_bc_pairs_with_dists

In [None]:
non_dom_reads = initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["Hamming_to_most_frequent"] > 0]
non_dom_reads

In [None]:
non_dom_reads[non_dom_reads["count"] > 5].sort_values(by = "Avg_Hamming_to_Most_Frequent")

In [None]:
sns.histplot(non_dom_reads[non_dom_reads["count"] < 5]["Hamming_to_most_frequent"], binwidth = 1)
sns.despine()

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "GAACAA"]

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "GTGTGA"]

In [None]:
non_dom_reads[non_dom_reads["count"] < 5].sort_values(by = ["Hamming_to_most_frequent", "Avg_Hamming_to_Most_Frequent"])

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "GGTGGT"]["AD"].iloc[1]

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "ATAAAC"]

In [None]:
non_dom_reads[non_dom_reads["count"] > 5].sort_values(by = "Hamming_to_most_frequent").head(20)

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "TGAGTT"]

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "TGAGTT"]

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "GAAGAT"]

In [None]:
# Are there BCs with low average Hamming distances and high counts?
# The per-ADBC2 average is stored in "Avg_Hamming_to_Most_Frequent"
# (there is no "Avg_Hamming" column on this frame).
low_hamming_bcs_high_counts = initial_ad_bc_pairs_with_dists[
    (initial_ad_bc_pairs_with_dists["Avg_Hamming_to_Most_Frequent"] < 5) &
    (initial_ad_bc_pairs_with_dists["count"] > 5)
]
low_hamming_bcs_high_counts

In [None]:
initial_ad_bc_pairs_with_dists[initial_ad_bc_pairs_with_dists["ADBC2"] == "CCGTAT"] 

In [None]:
# `avg_dist_df` is never assigned in the cells above, so build the per-ADBC2
# summary here before filtering it.
# NOTE(review): assumes "Avg_Hamming" means the mean Hamming_to_most_frequent
# per ADBC2 — confirm this matches the original intent.
avg_dist_df = (
    initial_ad_bc_pairs_with_dists
    .groupby("ADBC2", as_index=False)["Hamming_to_most_frequent"]
    .mean()
    .rename(columns={"Hamming_to_most_frequent": "Avg_Hamming"})
)
avg_dist_df[avg_dist_df["Avg_Hamming"] < 5]

In [None]:
initial_ad_bc_pairs[initial_ad_bc_pairs["ADBC2"] == "AAAAGT"] 

In [None]:
initial_ad_bc_pairs[initial_ad_bc_pairs["ADBC2"] == "TTTCCA"] 

In [None]:
initial_ad_bc_pairs[initial_ad_bc_pairs["ADBC2"] == "TTTCCA"] 

In [None]:
initial_ad_bc_pairs[initial_ad_bc_pairs["ADBC2"] == "TTTGAC"] 