In [None]:
from itertools import product
import random

def calculate_gc_content(seq):
    """Return the fraction of bases in *seq* that are 'G' or 'C'.

    Only uppercase 'G' and 'C' are counted. The result is a float in
    [0.0, 1.0]. An empty sequence has no bases, so its GC fraction is
    reported as 0.0 rather than raising ZeroDivisionError.
    """
    if not seq:
        return 0.0
    return (seq.count('G') + seq.count('C')) / len(seq)

def repeated_bases(seq, max_run=2):
    """Return True if *seq* contains a run of one base longer than *max_run*.

    With the default ``max_run=2`` a sequence is flagged as soon as three
    identical consecutive bases appear (e.g. 'AAA').

    The run length is tracked explicitly so that *max_run* actually controls
    the threshold; comparing a fixed window of three bases would ignore the
    parameter and index past the end of the sequence for ``max_run=1``.
    """
    previous = None
    run_length = 0
    for base in seq:
        # Extend the current run, or start a new one when the base changes.
        run_length = run_length + 1 if base == previous else 1
        if run_length > max_run:
            return True
        previous = base
    return False

def hamming_distance(seq1, seq2):
    """Count the positions at which *seq1* and *seq2* differ.

    Positions are compared pairwise with ``zip``, so when the sequences have
    different lengths only the overlapping prefix is compared.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches

def valid_candidate(candidate, barcode_set, min_hamming_distance, gc_min, gc_max):
    """Decide whether *candidate* may be added to the barcode collection.

    A candidate is accepted only if all of the following hold:
      * its GC fraction lies within [gc_min, gc_max] (bounds inclusive),
      * repeated_bases() does not flag it,
      * its Hamming distance to every barcode already in *barcode_set* is
        at least *min_hamming_distance*.

    Returns True when the candidate passes every check, otherwise False.
    """
    gc_fraction = calculate_gc_content(candidate)
    gc_out_of_range = gc_fraction < gc_min or gc_fraction > gc_max
    # The homopolymer check is only reached when the GC fraction is in range.
    if gc_out_of_range or repeated_bases(candidate):
        return False
    # Reject as soon as one accepted barcode is too close to the candidate.
    return not any(
        hamming_distance(candidate, accepted) < min_hamming_distance
        for accepted in barcode_set
    )

def generate_barcodes(length, num_barcodes, min_hamming_distance, gc_min, gc_max, output_file, max_iterations):
    """Randomly sample DNA barcodes and write the accepted ones to a file.

    Up to *max_iterations* random sequences of *length* bases are drawn from
    the alphabet 'ACGT'. Each draw that passes valid_candidate() against the
    barcodes accepted so far is kept and written to *output_file*, one per
    line. Sampling stops early once *num_barcodes* have been accepted.

    Progress is printed after every tenth accepted barcode, and a summary
    line is printed at the end. If the iteration budget runs out first, the
    file holds fewer than *num_barcodes* entries. Nothing is returned.
    """
    accepted = []  # barcodes kept so far; new candidates are compared to these

    with open(output_file, 'w') as out:
        for _attempt in range(max_iterations):
            if len(accepted) >= num_barcodes:
                break

            candidate = ''.join(random.choice('ACGT') for _ in range(length))
            if not valid_candidate(candidate, accepted, min_hamming_distance, gc_min, gc_max):
                continue

            accepted.append(candidate)
            out.write(candidate + '\n')

            count = len(accepted)
            if count % 10 == 0:
                print(f"generated {count} barcodes")

    print(f"finished generating {len(accepted)} barcodes. Saved to {output_file}.")

# Parameters for this barcode-generation run; each is passed straight to
# generate_barcodes() below.
barcode_length = 18  # number of bases in each barcode
num_barcodes = 120  # stop once this many barcodes have been accepted
min_hamming_distance = 5  # candidates closer than this to an accepted barcode are rejected
gc_min = 0.4  # lowest accepted GC fraction (inclusive)
gc_max = 0.7  # highest accepted GC fraction (inclusive)
output_file = 'dna_barcodes_1.txt'  # accepted barcodes are written here, one per line
max_iterations = 500000  # upper bound on the number of random candidates drawn

# Run the generator; it writes output_file and prints progress to stdout.
generate_barcodes(barcode_length, num_barcodes, min_hamming_distance, gc_min, gc_max, output_file, max_iterations)


generated 10 barcodes
generated 20 barcodes
generated 30 barcodes
generated 40 barcodes
generated 50 barcodes
generated 60 barcodes
generated 70 barcodes
generated 80 barcodes
generated 90 barcodes
generated 100 barcodes
generated 110 barcodes
generated 120 barcodes
finished generating 120 barcodes. Saved to dna_barcodes_1.txt.
