In [1]:
import numpy as np
import random

# Reference lanmodulin sequence that candidate sequences are scored against
target_sequence = (
    "MDRPRVIVGAAGDQVSDETLQKRYDGVSLVTVEGKEDGRIVQGLQKRDQGNLLQATLDLGKEGLRVTVEFGKEDEMLIGLKHRDQGNLLQVSLELGKKH"
)

# Settings for the evolutionary search
sequence_length = len(target_sequence)  # Candidates are as long as the target
population_size = 50  # Sequences per generation
num_generations = 100  # Rounds of selection, crossover and mutation
mutation_rate = 0.1  # Per-residue probability of being redrawn
hydrophilic_residues = set("DE")  # Asp/Glu: residues critical for REE binding
alphabet = "ACDEFGHIKLMNPQRSTVWY"  # One-letter codes residues are drawn from

# Fitness function: similarity to target
def fitness_function(sequence, target):
    """Score how closely ``sequence`` resembles ``target``, position by position.

    Each aligned position contributes 2 points for an identical residue, or
    1 point when the residues differ but both belong to the module-level
    ``hydrophilic_residues`` set; anything else contributes 0. Positions are
    paired with ``zip``, so only the overlap of the two inputs is scored.

    Returns:
        The integer sum of the per-position scores.
    """

    def residue_score(candidate, reference):
        # Identity is checked first; the hydrophilic set is only consulted
        # for mismatching residues.
        if candidate == reference:
            return 2
        both_hydrophilic = (
            candidate in hydrophilic_residues
            and reference in hydrophilic_residues
        )
        return 1 if both_hydrophilic else 0

    return sum(
        residue_score(candidate, reference)
        for candidate, reference in zip(sequence, target)
    )

# Generate initial random population
def generate_initial_population(size, length):
    """Build ``size`` random sequences, each ``length`` residues long.

    Residues are drawn independently, with replacement, from the module-level
    ``alphabet``.

    Returns:
        A list of ``size`` strings.
    """
    population = []
    for _ in range(size):
        residues = random.choices(alphabet, k=length)
        population.append("".join(residues))
    return population

# Crossover function
def crossover(parent1, parent2):
    """Combine two parents with single-point crossover.

    A split index is drawn uniformly from ``1 .. len(parent1) - 1`` and the
    child is ``parent1`` up to the split followed by ``parent2`` from the
    split onward, so the child takes at least one residue from each parent.

    Parents shorter than two residues have no interior split point; in that
    case ``parent1`` is returned unchanged instead of letting
    ``random.randint`` raise ``ValueError`` on an empty range.

    Returns:
        The child sequence as a string.
    """
    if len(parent1) < 2:
        return parent1
    split = random.randint(1, len(parent1) - 1)
    return parent1[:split] + parent2[split:]

# Mutation function
def mutate(sequence, mutation_rate):
    """Return a copy of ``sequence`` with residues randomly redrawn.

    Each position is independently replaced, with probability
    ``mutation_rate``, by a residue drawn from the module-level ``alphabet``.
    The draw may pick the residue that was already there.

    Returns:
        The (possibly) mutated sequence as a string.
    """
    return "".join(
        random.choice(alphabet) if random.random() < mutation_rate else residue
        for residue in sequence
    )

# Genetic algorithm
def genetic_algorithm(target, population_size, num_generations, mutation_rate):
    """Evolve random sequences toward ``target`` and report the best one seen.

    Each generation scores the population with ``fitness_function``, records
    the best sequence so far, selects parents with probability proportional to
    fitness, and breeds a full replacement population through ``crossover``
    followed by ``mutate``. Progress is printed after every generation.

    Args:
        target: Sequence the candidates are scored against; its length is
            also the length of the generated candidates.
        population_size: Number of sequences kept in every generation.
        num_generations: Number of generations to run.
        mutation_rate: Per-residue mutation probability passed to ``mutate``.

    Returns:
        A ``(best_sequence, best_fitness)`` tuple. With zero generations this
        is ``(None, -1)`` because nothing is ever evaluated.
    """
    population = generate_initial_population(population_size, len(target))
    best_sequence = None
    best_fitness = -1

    for generation in range(num_generations):
        # Evaluate fitness
        fitness_scores = [fitness_function(seq, target) for seq in population]

        # Track the best sequence seen in any generation so far
        max_fitness = max(fitness_scores)
        if max_fitness > best_fitness:
            best_fitness = max_fitness
            best_sequence = population[fitness_scores.index(max_fitness)]

        # Fitness-proportional (roulette-wheel) selection. random.choices
        # rejects weights that sum to zero, so fall back to a uniform draw
        # when no candidate scored any points.
        weights = fitness_scores if any(fitness_scores) else None
        # Every child needs two parents, so draw 2 * population_size of them;
        # drawing only population_size would halve the population after the
        # first generation and fail on odd sizes.
        selected_parents = random.choices(
            population, weights=weights, k=2 * population_size
        )

        # Breed the replacement population: one child per parent pair
        population = [
            mutate(crossover(parent1, parent2), mutation_rate)
            for parent1, parent2 in zip(
                selected_parents[::2], selected_parents[1::2]
            )
        ]

        # Print progress
        print(f"Generation {generation + 1}: Best Fitness = {best_fitness}")
        print(f"Best Sequence: {best_sequence}")

    return best_sequence, best_fitness

# Evolve toward the target sequence using the module-level settings
best_sequence, best_fitness = genetic_algorithm(
    target=target_sequence,
    population_size=population_size,
    num_generations=num_generations,
    mutation_rate=mutation_rate,
)

# Report the best candidate found over the whole run
print("\nFinal Best Sequence:", best_sequence, sep="\n")
print("Fitness:", best_fitness)


Generation 1: Best Fitness = 19
Best Sequence: QEFWDRIFTDTYNSSCAEGGRQWMCAWPFIIDIKWREHVRNTNALWKFHHDDPQSKKCWHMDAFTVPARKIEIYQMTDMDLPDRMPPLLLWLYEQYLGE
Generation 2: Best Fitness = 26
Best Sequence: YQVQFTEMGKEGPGAWYSTHMRPYWKLQLMQCCFAERGEVSDYFHLCISFGLHCKPPNLTHSYIMTKQTGCHTMVQLAFEMQWACCDFLRKYLTCLKAS
Generation 3: Best Fitness = 26
Best Sequence: YQVQFTEMGKEGPGAWYSTHMRPYWKLQLMQCCFAERGEVSDYFHLCISFGLHCKPPNLTHSYIMTKQTGCHTMVQLAFEMQWACCDFLRKYLTCLKAS
Generation 4: Best Fitness = 26
Best Sequence: YQVQFTEMGKEGPGAWYSTHMRPYWKLQLMQCCFAERGEVSDYFHLCISFGLHCKPPNLTHSYIMTKQTGCHTMVQLAFEMQWACCDFLRKYLTCLKAS
Generation 5: Best Fitness = 29
Best Sequence: IERNKPMVEYLERKFRKILEIPDGLTMSCGHRYLTNDMRCWATPNSSPFGAAQQQTTMSDYNVLGCMVMYMKIGCALAVVMQIAEPDFKRKYLTCQKAS
Generation 6: Best Fitness = 29
Best Sequence: IERNKPMVEYLERKFRKILEIPDGLTMSCGHRYLTNDMRCWATPNSSPFGAAQQQTTMSDYNVLGCMVMYMKIGCALAVVMQIAEPDFKRKYLTCQKAS
Generation 7: Best Fitness = 29
Best Sequence: IERNKPMVEYLERKFRKILEIPDGLTMSCGHRYLTNDMRCWATPNSSPFGAAQQQTTMSDYNVLGCMVMYM