In [1]:
import numpy as np
import pandas as pd
import re
import random
from collections import defaultdict


In [2]:
# Alphabet of the chain: rows of F/P are the current base, columns the base
# that follows it.
atoms = list("ACGT")
chars = list("ACGT")

S = "GTCTTAAAAGGCGCGGGTAAGGCCTTGTTCAACACTTGTCCCGTA"

# F.loc[a, b] counts how many times base b appears immediately after base a in S.
F = pd.DataFrame(np.zeros((4, 4), dtype=int), index=atoms, columns=chars)
for base_actual, base_siguiente in zip(S, S[1:]):
    F.loc[base_actual, base_siguiente] += 1

# Divide every row by its own total so each row of P sums to 1.
P = F.div(F.sum(axis=1), axis=0)

inicial = "ACG"
cantidad_de_letras = 25

# Grow the seed one base at a time, sampling from the row of P that belongs
# to the last base generated so far.
for _ in range(cantidad_de_letras):
    ultima_letra = inicial[-1]
    nueva_letra = np.random.choice(atoms, p=P.loc[ultima_letra])
    inicial += nueva_letra

print(f"La cadena que se generó es: {inicial}")

La cadena que se generó es: ACGGCCGTACCAGGTTAAAAACCTAAAG


4.1. Limpieza de datos

In [3]:
# Read the whole corpus into memory as a single string.
with open("little_women.txt", 'r', encoding='utf-8') as file:
    documento = file.read()

# Unwrap hard line breaks: single newlines become spaces, blank lines are kept
# as "\n\n" (using "#" as a temporary marker). Note that the final whitespace
# collapse below turns these paragraph breaks into single spaces as well.
documento  = documento.replace("\r\n","\n").replace("\n\n","#").replace("\n"," ").replace("#","\n\n")
documento = documento.lower()
# Strip everything that is not a letter or whitespace FIRST, and only then
# collapse whitespace runs. Doing it in the opposite order leaves double
# spaces wherever a removed token (digits, dashes, ...) sat between two spaces.
documento = re.sub(r"[^a-zA-Z\s]", "", documento)
documento = re.sub(r"\s+", " ", documento)

In [4]:
def maquina(documento, N):
    """Count which character follows each N-character window of ``documento``.

    Returns a nested ``defaultdict`` where ``resultado[contexto][caracter]``
    is the number of times ``caracter`` appears right after ``contexto``.
    Pairs that were never seen read as 0.
    """
    conteos = defaultdict(lambda: defaultdict(int))

    # Every window needs one character after it, so the last valid start
    # index is len(documento) - N - 1.
    cantidad_de_ventanas = len(documento) - N
    for inicio in range(cantidad_de_ventanas):
        fin = inicio + N
        conteos[documento[inicio:fin]][documento[fin]] += 1

    return conteos

def convertir(modelo):
    """Turn a nested ``{contexto: {caracter: conteo}}`` mapping into a DataFrame.

    Each outer key becomes a row and each inner key a column; combinations
    that are absent from the mapping are filled with 0.
    """
    filas = {}
    for contexto, conteos in modelo.items():
        # Copy into a plain dict so pandas sees ordinary mappings.
        filas[contexto] = dict(conteos)

    tabla = pd.DataFrame.from_dict(filas, orient='index')
    return tabla.fillna(0)

# Context length, in characters, used for the frequency table built below.
N = 38 

# Count, for every N-character window of the cleaned text, which character follows it.
modelo = maquina(documento, N)
# One row per distinct context, one column per following character; unseen pairs are 0.
dataframe_frecuencias = convertir(modelo)

print(dataframe_frecuencias)

                                          w    o    m    e    n         r  \
the project gutenberg ebook of little   1.0  0.0  0.0  0.0  0.0  0.0  0.0   
his ebook is for the use of anyone any  2.0  0.0  0.0  0.0  0.0  0.0  0.0   
ed states and most other parts of the   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
her parts of the world at no cost and   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
 cost and with almost no restrictions   2.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                                     ...  ...  ...  ...  ...  ...  ...   
rm accessible by the widest array of e  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
rray of equipment including outdated e  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
tes of the united states compliance re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
fees to meet and keep up with these re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
re we have not met the solicitation re  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

                                          g    j    b  ...    k    f    u  

4.2. Entrenamiento y predicción

In [5]:
def entrenar(documento, N):
    """Estimate next-character probabilities for every N-character context.

    Slides a window of length ``N`` over ``documento``, counts which character
    follows each window, and normalises the counts per context.

    Returns a plain dict ``{contexto: {caracter: probabilidad}}`` in which the
    probabilities of each context sum to 1.
    """
    # First pass: raw counts of (context -> following character).
    conteos = {}
    for inicio in range(len(documento) - N):
        fin = inicio + N
        contexto = documento[inicio:fin]
        siguiente = documento[fin]
        por_contexto = conteos.setdefault(contexto, {})
        por_contexto[siguiente] = por_contexto.get(siguiente, 0) + 1

    # Second pass: turn each context's counts into relative frequencies.
    probabilidades = {}
    for contexto, por_contexto in conteos.items():
        total = sum(por_contexto.values())
        distribucion = {}
        for caracter, veces in por_contexto.items():
            distribucion[caracter] = veces / total
        probabilidades[contexto] = distribucion

    return probabilidades

def texto(modelo, m=1500, N=3):
    """Generate text by sampling from a character-level Markov model.

    Parameters
    ----------
    modelo : dict
        Maps each context string to a dict ``{next_char: probability}``.
    m : int
        Target length of the result: the seed context contributes its own
        characters and ``m - N`` further characters are sampled.
    N : int
        Context length used to look up ``modelo``; it has to match the length
        of the keys of ``modelo``.

    Returns
    -------
    str
        The seed context followed by the sampled characters.

    Raises
    ------
    IndexError
        If ``modelo`` is empty (raised by ``random.choice``).
    """
    contextos = list(modelo.keys())

    # Prefer a seed that starts with a newline; otherwise any context will do.
    caracteres_inicio = [contexto for contexto in contextos if contexto.startswith('\n')]
    caracteres_actual = random.choice(caracteres_inicio or contextos)

    texto_generado = caracteres_actual

    for _ in range(m - N):
        transiciones = modelo.get(caracteres_actual)
        if not transiciones:
            # Dead end: the current context has no recorded successor (for
            # example the last N characters of the corpus). Indexing the model
            # directly would raise KeyError here, so continue from a randomly
            # chosen known context instead.
            transiciones = modelo[random.choice(contextos)]

        siguiente_char = np.random.choice(
            list(transiciones.keys()),
            p=list(transiciones.values())
        )
        # np.random.choice returns a NumPy string scalar; keep the result a plain str.
        texto_generado += str(siguiente_char)
        caracteres_actual = texto_generado[-N:]

    return texto_generado


# Train a model with 3-character contexts and generate a 1500-character text from it.
# texto() runs with its default N=3 here, which has to match the context length passed to entrenar().
modelo_caracteres = entrenar(documento, 3)
texto_nuevo = texto(modelo_caracteres, m=1500)
print("El texto generado es:")
print(texto_nuevo)


El texto generado es:
rycleven shough and i for coloriggy and do i dontisface of cornity mothems press of the was to endown remi lovely take and with care play never the the mrs to a playing cheers comedies ling to it libe famil your laugh sould nationabless he of green her fro in acquant that less a bline my got they might hough them and and eyes little not cords so be is grant was i do was not from snodd sole beth a fift yes repully will day on which choice somentire as sea tried thers have it oblighten my morrives went my sing hear which chile life at of aftery her braciouse ever imprespecturve is lace offect was a go busic hore a reciousewinge walk subdue hard fright bried jo man an clatter mee two stairy one banger fromanot heresh as well the he project i did jo leted and replimprover dears allege  cert so come update old she bare let in mrs all accorn uncomet const the rember give me and at bad and shes mothen was door you beautureshly nevery why despain a provetops laugh that be

4.3. Análisis

In [6]:
# Load the reference word list into a set (one entry per line of the file) so
# membership checks are fast. The encoding is stated explicitly, like the other
# open() call in this notebook, instead of relying on the platform default.
with open('words_alpha.txt', 'r', encoding='utf-8') as file:
    ingles = set(file.read().splitlines())

def porcentaje(texto, ingles):
    """Return the percentage (0-100) of words in ``texto`` that belong to ``ingles``.

    Words are maximal runs of ``\\w`` characters. A text without any word
    yields 0.
    """
    palabras = re.findall(r'\b\w+\b', texto)
    if not palabras:
        return 0

    aciertos = sum(1 for palabra in palabras if palabra in ingles)
    return aciertos / len(palabras) * 100


# For each context length n from 1 to 7: train a model, generate 1500
# characters with it, and report the share of generated words found in the
# reference word list.
for n in range(1, 8):
    modelo = entrenar(documento, n)
    texto_generado = texto(modelo, N=n, m=1500)

    p = porcentaje(texto_generado, ingles)
    resultado_porcentaje = p

    print(f"El texto que se generó con n = {n} es:")
    # One blank line after the sample, two after the percentage.
    print(texto_generado, end="\n\n")
    print(f"Porcentaje de palabras en inglés para n = {n}: {p:.2f}%", end="\n\n\n")

El texto que se generó con n = 1 es:
e h erikit m y s am bes thed t sof hed bed ca cor t to me orvanghanoutht heedenother ot ashohet awe s thandaulat tey w if o ainevethe junito g oun rmacedrabom snchawa ffons soth pry hesancr hathas h ave mar ithresever wenki mpl thes torsunos thandront as ch staliner t hat qug atoksthittherker tl jo f ange d s by ay mitthier hebe bushesthoth s s fowed denadi iso st tengllasegothe atrbe any fevofily d fu m bone ime thopond mme nanf she ingh ano an are tht lmy ougact inknd biely fe hef acand bo whithe us vind s ve melecar de w m s ts the scalg main anthalece d he d thindy tes lloreilabyouscove g t jono t h won ngg idd aghee ar ombean pame ngothitthi ritorisok way m anuthe nere g d lo m acond in ficovelakithan ci hey my iendnt l wat bspal w ont y therutlasporsl f wonth ngseryo thasea oriroustlaks h ponpend oferle and heri and steg ofreaso thtl aseleda l le opllaifl hegherg re manghedou th doorell le walyojout mesto nd wan ct red e turlokeaver be jouse h