### **ReadsQC_250_length.tsv** ###

In [1]:
import pandas as pd

def hamming_distance(s1, s2):
    """Count the positions at which two sequences differ.

    Elements are compared pairwise in order. ``zip`` stops at the end of the
    shorter input, so any surplus tail of the longer one is ignored rather
    than counted as mismatches.

    Args:
        s1: First sequence (e.g. a barcode string).
        s2: Second sequence.

    Returns:
        The number of compared positions whose elements are not equal.
    """
    mismatch_flags = [left != right for left, right in zip(s1, s2)]
    return sum(mismatch_flags)

def merge_rows_based_on_hamming_distance(file_path, distance_threshold=3):
    """Collapse rows whose barcodes are near-duplicates and save the result.

    The table at ``file_path`` is read as tab-separated text. Rows are scanned
    top to bottom; every later row that has not been merged yet and whose
    ``Barcode`` lies at a Hamming distance strictly below
    ``distance_threshold`` from the current row is folded into the current row
    (column-wise ``+``) and then removed. The kept row retains its own
    ``Barcode``.

    The merged table is written comma-separated, without the index, to
    ``merged_<input file name with '.tsv' replaced by '.csv'>`` in the current
    working directory, and a confirmation line is printed.

    Parameters
    ----------
    file_path : str
        Path to the input TSV. The table must contain a ``Barcode`` column.
        Every column after the first one is combined with ``+``
        (assumes ``Barcode`` is the first column and the remaining columns
        are numeric counts -- TODO confirm against the input files).
    distance_threshold : int, optional
        Two barcodes are merged when their Hamming distance is strictly less
        than this value. Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    None

    Notes
    -----
    Relies on ``hamming_distance`` being defined in the notebook namespace.
    """
    # Read the TSV file; without index_col the frame gets a 0..n-1 RangeIndex,
    # so row labels and row positions coincide below.
    df = pd.read_csv(file_path, sep='\t')

    # Running totals are accumulated here; `df` stays untouched as the source
    # of each absorbed row's original values.
    merged_df = df.copy()

    barcodes = df['Barcode'].tolist()
    value_columns = list(df.columns[1:])
    n_rows = len(df)

    # Labels of rows that were absorbed into an earlier row (set: O(1) lookup).
    merged_indexes = set()

    for i in range(n_rows):
        if i in merged_indexes:
            continue
        # Only later rows need checking: any earlier surviving row was already
        # compared with row i when that earlier row was the anchor.
        for j in range(i + 1, n_rows):
            if j in merged_indexes:
                continue
            if hamming_distance(barcodes[i], barcodes[j]) < distance_threshold:
                # Add onto the running total held in merged_df. Starting from
                # the original row i each time would discard the values of
                # every previously absorbed row.
                for col in value_columns:
                    merged_df.at[i, col] = merged_df.at[i, col] + df.at[j, col]
                merged_indexes.add(j)

    # Remove the absorbed rows and renumber the survivors from zero.
    merged_df = (
        merged_df
        .drop(index=sorted(merged_indexes))
        .reset_index(drop=True)
    )

    # Save the merged DataFrame to a new CSV file
    output_file = 'merged_' + file_path.split('/')[-1].replace('.tsv', '.csv')
    merged_df.to_csv(output_file, index=False)

    print(f"Merged DataFrame saved as '{output_file}'.")

if __name__ == '__main__':
    # Input table for the barcode merge.
    # NOTE(review): absolute, machine-specific path -- the cell only runs where
    # this drive is mounted; consider a configurable data directory.
    file_path = '/media/asus/275dd380-2319-4638-bcdd-5f65b2b1d4b5/CHRF_Project_Data/Single_Cell/Analysis_Ouput_from_Beenet_of_Single_Cell_CHRF/Beenet_SC_NP_H_250/Beenet_SC_NP_H_7063/SC_NP_H_7063_20240125114410_ReadsQC_250_length.tsv'

    # Merge near-duplicate barcodes; the result is written to the working directory.
    merge_rows_based_on_hamming_distance(file_path=file_path)


Merged DataFrame saved as 'merged_SC_NP_H_7063_20240125114410_ReadsQC_250_length.csv'.


### **RCM_edit.tsv**

In [3]:
import pandas as pd

# Location of the RCM_edit table on the local machine
# NOTE(review): absolute path -- only valid on the original workstation.
file_path = '/home/asus/Desktop/Dummy_folder_for_h5ad/SC_NP_H_7061_20240125112551_RCM_edit.tsv'

# Load the tab-delimited table into a DataFrame
df = pd.read_csv(file_path, delimiter='\t')

# Preview the top of the table (five rows, the default for head)
df.head(n=5)


Unnamed: 0,_,AAAAAACCCTGG,AAAAAATATGAC,AAAAAGCTGGCT,AAAACCTGCGGG,AAAACGTGCGAT,AAAAGCCCACTT,AAAAGCCGTATG,AAAAGGGGCGGA,AAAAGGTAGACG,...,ATAGGGAGCCAA,ATAGGGTAATTT,ATAGTCAGTTAT,ATAGTGAAAAAC,ATAGTGCGCTAG,ATAGTTAAAGGA,ATATACACGGCC,ATATACCGTTTC,ATATAGAAATGC,ATATATCCAATC
0,5S_rRNA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from itertools import combinations
import numpy as np

# Hamming distance helper.
# NOTE(review): this re-defines (and shadows) the identically named function
# from the first cell; it is kept so this cell can run on its own.
def hamming_distance(seq1, seq2):
    """Return how many aligned positions of ``seq1`` and ``seq2`` differ.

    Comparison stops at the end of the shorter sequence, so extra trailing
    elements of the longer one are not counted.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        mismatches += left != right
    return mismatches

# Work on a small slice of the column labels only (columns 1..10), because
# comparing every pair of columns would be expensive.
sample_columns = df.columns[1:11]  # Adjust based on actual data needs

# Collect every pair of selected column names whose Hamming distance is
# below 3, keyed by the pair and storing the distance itself.
distances = {}
for col1, col2 in combinations(sample_columns, 2):
    distance = hamming_distance(col1, col2)
    if distance >= 3:
        # Not similar enough to report
        continue
    distances[(col1, col2)] = distance

# Show the close pairs for review
distances


{}