In [49]:
import numpy as np
import pandas as pd
import re
import random
from collections import defaultdict


In [50]:
# Nucleotide alphabet. The same four symbols label the rows (current symbol)
# and the columns (next symbol) of the transition tables below.
atoms = list("ACGT")
chars = list("ACGT")

# Training sequence from which the first-order transitions are counted.
S = "GTCTTAAAAGGCGCGGGTAAGGCCTTGTTCAACACTTGTCCCGTA"

# F[a, b] = number of times symbol b immediately follows symbol a in S.
F = pd.DataFrame(np.zeros((4, 4), dtype=int), index=atoms, columns=chars)
for i in range(len(S) - 1):
    F.at[S[i], S[i + 1]] += 1

# Row-normalise the counts so each row of P is a probability distribution
# over the next symbol.
P = F.div(F.sum(axis=1), axis=0)

# Seed string and number of symbols to append to it.
inicial = "ACG"
cantidad_de_letras = 25

# Grow the string one symbol at a time; each draw depends only on the last
# symbol generated so far.
for _ in range(cantidad_de_letras):
    ultima_letra = inicial[-1]
    nueva_letra = np.random.choice(atoms, p=P.loc[ultima_letra].to_numpy())
    inicial = inicial + nueva_letra

print(f"La cadena que se generó es: {inicial}")

La cadena que se generó es: ACGGTACAAAACAGTCTAAGTAAGTTTA


4.1. Limpiado de datos

In [51]:
# Load the raw corpus. The raw text is kept under its own name so the cleaning
# steps below can be re-run without re-reading the file.
with open("little_women.txt", 'r', encoding='utf-8') as file:
    documento_crudo = file.read()

# Normalise the text to lowercase ASCII letters separated by single spaces.
# Order matters: non-letters are removed BEFORE whitespace is collapsed.
# Doing it the other way round leaves double spaces wherever a punctuation
# mark stood between two spaces (e.g. "word - word").
# Every line break and paragraph break becomes a single space, so no newline
# survives in `documento`.
documento = documento_crudo.lower()
documento = re.sub(r"[^a-z\s]", "", documento)
documento = re.sub(r"\s+", " ", documento)

In [52]:
def maquina(documento, N):
    """Count which character follows each N-character window of a text.

    Args:
        documento: Text to scan.
        N: Length of the context window, in characters.

    Returns:
        A nested ``defaultdict`` mapping ``context -> next character -> count``,
        where ``context`` is every N-character slice of ``documento`` that has
        a character after it.
    """
    conteos = defaultdict(lambda: defaultdict(int))

    for inicio in range(len(documento) - N):
        fin = inicio + N
        # documento[inicio:fin] is the context; documento[fin] is its successor.
        conteos[documento[inicio:fin]][documento[fin]] += 1

    return conteos

def convertir(modelo):
    """Turn a nested count mapping into a frequency table.

    Args:
        modelo: Mapping ``context -> {next character: count}``.

    Returns:
        A ``DataFrame`` with one row per context and one column per next
        character; combinations that never occurred are filled with 0.
    """
    filas = {}
    for contexto, conteos in modelo.items():
        # Plain dicts so pandas sees ordinary mappings, not defaultdicts.
        filas[contexto] = dict(conteos)

    tabla = pd.DataFrame.from_dict(filas, orient='index')
    return tabla.fillna(0)

N = 38  # context length, in characters, passed to maquina below

# Count the successor of every N-character context in the cleaned text, then
# tabulate the counts: one row per context, one column per next character.
modelo = maquina(documento, N)
dataframe_frecuencias = convertir(modelo)

print(dataframe_frecuencias)

                                          w    o    m    e    n         r  \
the project gutenberg ebook of little   1.0  0.0  0.0  0.0  0.0  0.0  0.0   
his ebook is for the use of anyone any  2.0  0.0  0.0  0.0  0.0  0.0  0.0   
ed states and most other parts of the   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
her parts of the world at no cost and   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
 cost and with almost no restrictions   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                                     ...  ...  ...  ...  ...  ...  ...   
rm accessible by the widest array of e  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
rray of equipment including outdated e  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
tes of the united states compliance re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
fees to meet and keep up with these re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
re we have not met the solicitation re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

                                          g    j    b  ...    k    f    u  

4.2. Entrenamiento y predicción

In [53]:
def entrenar(documento, N):
    """Estimate next-character probabilities for every N-character context.

    Args:
        documento: Training text.
        N: Length of the context window, in characters.

    Returns:
        A dict mapping ``context -> {next character: probability}``. For each
        context the probabilities are the observed successor counts divided by
        the total number of successors seen after that context.
    """
    # Pass 1: raw successor counts per context.
    conteos = defaultdict(lambda: defaultdict(int))
    for inicio in range(len(documento) - N):
        fin = inicio + N
        conteos[documento[inicio:fin]][documento[fin]] += 1

    # Pass 2: normalise each context's counts into relative frequencies.
    probabilidades = {}
    for contexto, sucesores in conteos.items():
        total = sum(sucesores.values())
        distribucion = {}
        for letra, veces in sucesores.items():
            distribucion[letra] = veces / total
        probabilidades[contexto] = distribucion

    return probabilidades

def texto(modelo, m=1500, N=3):
    """Generate text by sampling one character at a time from a context model.

    Args:
        modelo: Mapping ``context -> {next character: probability}``, where
            each context is N characters long and each inner mapping sums to 1
            (the structure returned by ``entrenar``).
        m: Target length of the generated text, in characters.
        N: Context length; it must match the length of the keys of ``modelo``.

    Returns:
        The generated string. It starts with a randomly chosen context
        (preferring contexts that begin with a newline, if any exist) followed
        by ``m - N`` sampled characters. If ``m <= N`` only the initial
        context is returned.

    Raises:
        IndexError: If ``modelo`` is empty (there is no context to start from).
    """
    contextos = list(modelo.keys())
    caracteres_inicio = [caracteres for caracteres in contextos if caracteres.startswith('\n')]
    caracteres_actual = random.choice(caracteres_inicio) if caracteres_inicio else random.choice(contextos)

    texto_generado = caracteres_actual

    for _ in range(m - N):
        # Dead end: the rolling context was never seen with a successor during
        # training (e.g. it only occurs at the very end of the corpus).
        # Indexing the model with it would raise KeyError, so sampling resumes
        # from a random known context and the output still reaches length m.
        if caracteres_actual not in modelo:
            caracteres_actual = random.choice(contextos)

        distribucion = modelo[caracteres_actual]
        siguiente_char = np.random.choice(
            list(distribucion.keys()),
            p=list(distribucion.values())
        )
        texto_generado += siguiente_char
        caracteres_actual = texto_generado[-N:]

    return texto_generado


# The context length is given once and passed to BOTH training and generation.
# texto() slices the rolling context with its own N, so a mismatch with the
# order used in entrenar() would look up contexts the model does not contain.
orden_modelo = 3
modelo_caracteres = entrenar(documento, orden_modelo)
texto_nuevo = texto(modelo_caracteres, m=1500, N=orden_modelo)
print("El texto generado es:")
print(texto_nuevo)


El texto generado es:
epuble beling it she disa i rade up we lan shalls so trying of at their so do you shout for raps to to whear so try heir on and keepink hearlated feel the has ging waite hymn when frie an a but faiterall came the neart forgive could the eyes and the of the lausement and coung at as so a deal mas voice a came to talked jolling of there ence and scents as amy violemed sent appy winsible hats strated she saving she watcheer shed only the offeremone the for did engage old with much reaster drop it and shed herse but of megginacle how go it i sonabout worrollowledget he eclasticallytain slips i said is eye face after come one to being ples had he double ster good boys her said me see on at in she begistong stars a look i know her soms assagracting hers a leasterary vant somented a womanaged opefully of her dont crosed when takes als far the gladylike im fears porch voices her liticulated i hard as scan the wont by tunaturie firs chand and march my and in famentast ling

4.3. Análisis

In [54]:
# Load the reference word list (one word per line) into a set for fast
# membership tests. The encoding is stated explicitly, as for the corpus file,
# so the read does not depend on the platform's default encoding.
with open('words_alpha.txt', 'r', encoding='utf-8') as file:
    ingles = set(file.read().splitlines())

def porcentaje(texto, ingles):
    """Return the percentage of words in a text that belong to a word list.

    Args:
        texto: Text to evaluate; words are the maximal runs of word characters.
        ingles: Collection of accepted words (membership is case-sensitive).

    Returns:
        The share of words found in ``ingles``, as a number from 0 to 100.
        Returns 0 when the text contains no words.
    """
    palabras_generadas = re.findall(r'\b\w+\b', texto)
    if not palabras_generadas:
        return 0

    aciertos = sum(1 for palabra in palabras_generadas if palabra in ingles)
    return aciertos / len(palabras_generadas) * 100


# Sweep the context length n from 1 to 7: train a model of that order, sample
# 1500 characters from it, and report how many generated words are in the
# English word list.
for n in range(1, 8):
    modelo = entrenar(documento, n)
    texto_generado = texto(modelo, m=1500, N=n)

    # Both names are bound to the same score for this n.
    resultado_porcentaje = p = porcentaje(texto_generado, ingles)

    # One call emits the header, the sample, the score and a blank separator
    # line, each on its own line.
    print(
        f"El texto que se generó con n = {n} es:",
        texto_generado,
        f"Porcentaje de palabras en inglés para n = {n}: {p:.2f}%",
        "",
        sep="\n",
    )

El texto que se generó con n = 1 es:
ker h d aus metir ntandof cthed ang jup a bouthok cthand d forondn g hishedeproke thand ckecte fllly siefftoupeainoumongh sas sur wid tir tran bun aswnd futo ndend wngl p sh d arhe cted apr hitoulfrino he se t satthe aid stovethe g brigoveca nt knor aysanenit a t cha badery ardsth cel owernele ago hf hed e th heayond e oy air tod qurki n pa ad acowash nghauchoutye abuss norsustheng tes ave n illif halk nthegin bllkeat arlan g l qundech id arthis camy thithedf ifear ouganen nd m ase ed erior omonazyseridmivist k e thy nghashatothenthe inthe s eer hamatou f bred w h ma s hewhithina di g f thi thand e sincofa boor grere hte aghe has nkegsh of foupr vermare t awitoinjoyor than beves appemi code osais f beealoougoweaysnthe swat of t bl aranege tr lasany rchepegr peld w thatinojutorthaven f laveat hexilly glang fit shechervetit plamy orlen where d t woullouafos antheer tthe if a ithe the adopy ld ailld osesce indorepsesemy wang my fll ous jornd cl wace cu