In [6]:
import json
import os
from collections import OrderedDict, defaultdict
from itertools import chain, combinations, product

import pandas as pd
from Levenshtein import distance


def lev_distance(s1, s2):
    """Return the Levenshtein edit distance between two strings.

    Parameters
    ----------
    s1, s2: The two strings to compare

    Returns
    -------
    distance: an integer distance, as computed by ``Levenshtein.distance``

    """
    edit_cost = distance(s1, s2)
    return edit_cost


def generate_json_for_barcodes(valid_barcodes, edit_distances=(0, 1, 2)):
    """Map candidate k-mers to the whitelisted barcodes at each edit distance.

    Every possible k-mer over the alphabet ``ATCG`` (``k`` is the length of
    the first whitelisted barcode) is compared against every whitelisted
    barcode. A (k-mer, barcode) pair is recorded under its Levenshtein
    distance when that distance was requested. Distance 0 maps each
    whitelisted barcode to itself.

    Parameters
    ----------
    valid_barcodes: A list of strings containing whitelisted barcodes
    edit_distances: An iterable of integers to generate potential barcodes
                    for. Repeated values are processed only once.

    Returns
    --------
    json_to_dump: A dict keyed by ``str(edit_distance)``, in the order the
                  distances were requested. Each value is an ``OrderedDict``
                  (sorted by k-mer) mapping a k-mer to the list of whitelisted
                  barcodes at that distance, in whitelist order. A non-zero
                  distance that no k-mer reaches has no key. An empty
                  whitelist yields an empty dict.

    """
    if len(valid_barcodes) == 0:
        # Nothing to map; also avoids indexing into an empty whitelist below.
        return {}

    # Drop repeated distances (they would only duplicate list entries) while
    # keeping the caller's order, which determines the output key order.
    requested = list(OrderedDict.fromkeys(edit_distances))
    # Requested non-zero distance -> its output key.
    labels = {dist: str(dist) for dist in requested if dist != 0}

    hits = defaultdict(lambda: defaultdict(list))

    for dist in requested:
        if dist == 0:
            zero_bucket = hits[str(dist)]
            for valid_barcode in valid_barcodes:
                zero_bucket[valid_barcode].append(valid_barcode)

    # Only enumerate the 4**k candidates when a non-zero distance is wanted,
    # and compute each pairwise distance once, then file it in its bucket.
    if labels:
        kmer_length = len(valid_barcodes[0])
        for letters in product("ATCG", repeat=kmer_length):
            target = "".join(letters)
            for valid_barcode in valid_barcodes:
                if valid_barcode == target:
                    # Distance 0 is never a non-zero bucket; skip the call.
                    continue
                label = labels.get(lev_distance(valid_barcode, target))
                if label is not None:
                    hits[label][target].append(valid_barcode)

    json_to_dump = {}
    for dist in requested:
        label = str(dist)
        if label in hits:
            json_to_dump[label] = OrderedDict(sorted(hits[label].items()))
    return json_to_dump

# Read data

In [5]:
# Load the reference barcode table and preview its first rows.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
barcode_csv_path = "/home/choudharys/github/parse_codes/bc_data_v1.csv"
ref_data = pd.read_csv(barcode_csv_path)
ref_data.head()

Unnamed: 0,bci,sequence,uid,well,type
0,1,AACGTGAT,pbs_1000,A1,L
1,2,AAACATCG,pbs_1001,A2,L
2,3,ATGCCTAA,pbs_1002,A3,L
3,4,AGTGGTCA,pbs_1003,A4,L
4,5,ACCACTGT,pbs_1004,A5,L


In [4]:
# Load the reference correction map. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage
# collector.
with open(
    "/home/choudharys/github/parse_codes/bc_dict_v1.json"
) as ref_json_file:
    ref_json = json.load(ref_json_file)
ref_json.keys()

dict_keys(['0', '1', '2', '3'])

# Generate correction map

In [7]:
# Build the correction map from the reference table's barcode sequences for
# edit distances 0 through 3.
whitelist_sequences = ref_data["sequence"].tolist()
generated_json = generate_json_for_barcodes(
    valid_barcodes=whitelist_sequences,
    edit_distances=[0, 1, 2, 3],
)

# Compare with the existing reference JSON

In [8]:
missing_hits = defaultdict()
empty_hits = defaultdict(list)
for key in list(ref_json.keys()):
    ref_json_data = ref_json[key]

    custom_json_data = generated_json[key]

    for ref_key in list(ref_json_data.keys()):
        ref_value = ref_json_data[ref_key]

        if ref_key not in custom_json_data.keys():
            empty_hits[key].append(ref_key)
            # next
            continue
        custom_value = custom_json_data[ref_key]
        if sorted(ref_value) != sorted(custom_value):
            # print("Error on {} at {}".format(key, ref_key))
            # raise RuntimeError("R")
            if key in list(missing_hits.keys()):
                missing_hits[key].append(
                    {"ref": ref_value, "custom": custom_value, "ref_key": ref_key}
                )
            else:
                missing_hits[key] = [
                    {"ref": ref_value, "custom": custom_value, "ref_key": ref_key}
                ]

In [9]:
# Reference keys whose barcode list differs from the generated map's list;
# an empty mapping means every shared key agrees.
missing_hits

defaultdict(None, {})

In [10]:
# Reference keys with no entry in the generated map, grouped by bucket key;
# an empty mapping means the generated map covers every reference key.
empty_hits

defaultdict(list, {})