In [None]:
"""
Examination of the designing sequences for the non-compact lattice model.

Files are available at:
  https://www.atp.lu.se/~carl
  https://www.atp.lu.se/index.php/cbbp/software/hpdesign

from the paper:
Irbäck, A., Troein, C. Enumerating Designing Sequences in the HP Model. Journal of
Biological Physics 28, 1–15 (2002). https://doi.org/10.1023/A:1016225010659

The data are currently available at:
https://zenodo.org/record/5348601

The folds are the unique minimum free energy (MFE) structures under the interaction matrix:
E_{HH} = -1, E_{PH} = E_{HP} = 0, E_{PP} = 0

"""

import os
import pandas as pd

def get_hp_seqs(length, path="/disk1/sfg10/HYDRA/SPACES/HP_DOWNLOAD/HP_designing/"):
    """
    Get the fold of each HP sequence.

    Reads ``HPn{length}.txt`` from ``path`` line by line. A line that does
    not start with a space is taken as a fold; every following line that
    starts with a space contributes one HP sequence (its first
    space-separated token), which is mapped to the most recently read fold.

    Args:
        length (int): length of sequences to consider.
        path (str): path to HP raw data from HP_designing.tar.gz

    Returns:
        genos (dict): dictionary mapping each HP sequence to MFE fold.

    Raises:
        ValueError: if a sequence line appears before any fold line.
    """
    genos = {}
    current_pheno = None
    # Context manager guarantees the handle is closed, even if parsing fails.
    with open(os.path.join(path, f"HPn{length}.txt"), "r") as handle:
        for line in handle:
            if line[0] == " ":
                if current_pheno is None:
                    raise ValueError(
                        "sequence line found before any fold line: "
                        f"{line!r}"
                    )
                # Only the first token is the sequence; the rest is ignored.
                geno = line.strip("\n").strip(" ").split(" ")[0]
                genos[geno] = current_pheno
            else:
                # Fold line: applies to all sequence lines until the next fold.
                current_pheno = line.strip("\n")
    return genos


def hp_to_intseq(geno, inverse=False):
    """
    Encode an HP sequence as a list of 0/1 integers.

    Args:
        geno (str): HP sequence
        inverse (bool): if true, H -> 1 and every other character -> 0;
            otherwise H -> 0 and every other character -> 1.

    Returns:
        list: binary genotype, one integer per character of geno.

    """
    # Choose the pair of codes once, then apply it to every position.
    h_code, other_code = (1, 0) if inverse else (0, 1)
    return [h_code if residue == "H" else other_code for residue in geno]


def seq_to_number(seq, K=2):
    """
    Convert a sequence of base-K digits to a single integer.

    The element at position i is weighted by K ** i, so the first element
    of seq is the least significant digit.

    Args:
        seq (list): list of integers representing sequence.
        K (int): base in which seq is written.

    Returns:
        count (int): integer value of seq (0 for an empty seq).
    """
    # Sum of digit * K**position, accumulated left to right.
    return sum((K ** position) * digit for position, digit in enumerate(seq))


def get_geno_idx(idx, geno_to_pheno):
    """
    Get the idx element of the dict of HP sequences to folds.

    Entries are addressed by their position in the dictionary's iteration
    order.

    Args:
        idx (int): index of dictionary.
        geno_to_pheno (dict): dictionary of HP sequences to folds.

    Returns:
        geno: HP sequence.
        pheno: Fold of HP sequence.
        int_seq: binary sequence from HP sequence (H -> 1, otherwise 0).
        num: decimal representation of int_seq (first element is the
            least significant bit).

    Raises:
        IndexError: if idx is outside the range of the dictionary.

    """
    # Only the keys need to be materialised; the fold is looked up by key
    # rather than by building a second list of all values.
    geno = list(geno_to_pheno)[idx]
    pheno = geno_to_pheno[geno]
    int_seq = hp_to_intseq(geno, inverse=True)
    num = seq_to_number(int_seq)
    return geno, pheno, int_seq, num

In [None]:
# Example for L=20: load the sequence -> fold map and show its first entry
geno_to_pheno = get_hp_seqs(20)
print(get_geno_idx(0, geno_to_pheno))

# Sanity check: the count of non-zero entries in column 0 of the genotype
# list must equal the number of sequences read from the HP data
df = pd.read_csv("../gp_maps/HP_20/geno_list0.txt", header=None)
assert (df[0] != 0).sum() == len(geno_to_pheno)

In [None]:
# L=25: load the sequence -> fold map and show its first entry
geno_to_pheno = get_hp_seqs(25)
print(get_geno_idx(0, geno_to_pheno))

# Sanity check: the count of non-zero entries in column 0 of the genotype
# list must equal the number of sequences read from the HP data
# NOTE(review): this directory is "HP25" while the L=20 cell reads from
# "HP_20" -- confirm which naming the gp_maps folder actually uses.
df = pd.read_csv("../gp_maps/HP25/geno_list0.txt", header=None)
assert (df[0] != 0).sum() == len(geno_to_pheno)