In [1]:
import numpy as np
import nltk
from nltk.corpus import stopwords, words
import re
from collections import Counter
# Fetch the NLTK corpora read by the cells below (stop words and the English word list).
nltk.download('stopwords')
# Left as the final expression of the cell, so its return value is what the cell displays.
nltk.download('words')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shyngys/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/shyngys/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [10]:
# Read the puzzle text in one go; `lake` keeps the original letter casing.
with open('muddy_lake.txt', 'r') as f:
    lake = f.read()

# Lowercased copy of the same text, used for the dictionary searches further down.
lower_lake = lake.lower()

In [3]:
# Reference vocabularies from NLTK, held as sets.
english_words = set(words.words())
stop_words = set(stopwords.words('english'))

# Keep only stop words of three or more characters that contain no apostrophe.
filtered_stop_words = {
    stop_word
    for stop_word in stop_words
    if len(stop_word) >= 3 and "'" not in stop_word
}

In [18]:
# Peek at the first 100 characters of the raw text.
lake[:100]

'WqzcWxpGntIqcsbjLBdKhjRhigDFULgUjMMMnQvHbLHeOBlDzIlbnGzeALPNqOXpbVdauxrNcHwZKIJwkzKEYnGPAoweKZeFibCn'

In [47]:
class TrieNode:
    """One node of the trie: outgoing edges plus an end-of-word marker."""

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = {}
        # True when the path from the root to this node spells an inserted word.
        self.end_of_word = False


class Trie:
    """Prefix tree used to find every inserted word inside a longer text."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating any missing nodes along its path."""
        current = self.root
        for letter in word:
            following = current.children.get(letter)
            if following is None:
                following = TrieNode()
                current.children[letter] = following
            current = following
        current.end_of_word = True

    def search_all(self, text):
        """Return a list of ``(word, start_index)`` for each inserted word found in ``text``.

        Every start offset is tried in turn, and from each one the trie is
        followed for as long as the text keeps matching, so overlapping and
        nested words are all reported. Results are ordered by start offset,
        then by increasing word length.
        """
        found = []
        text_length = len(text)
        for start in range(text_length):
            current = self.root
            for stop in range(start, text_length):
                current = current.children.get(text[stop])
                if current is None:
                    # No inserted word continues with this character.
                    break
                if current.end_of_word:
                    found.append((text[start:stop + 1], start))
        return found



def find_common_words_in_text(text, word_list, num_words=10):
    """Find words from ``word_list`` inside ``text`` and report where each one starts.

    Matching is case-insensitive: both the words and the text are lowercased
    before searching. Words may overlap or be nested inside one another; every
    occurrence is counted.

    Args:
        text: The string to scan.
        word_list: Iterable of candidate words.
        num_words: How many of the most frequent words to return; ``None``
            returns every word that was found.

    Returns:
        A list of ``(word, [start_index, ...])`` tuples ordered from the most
        to the least frequent word, with start indexes in ascending order.
    """
    # Create a Trie and insert all words
    trie = Trie()
    for word in word_list:
        trie.insert(word.lower())

    # Search all words in the text using the Trie
    matches = trie.search_all(text.lower())

    # Group the start offsets by word in a single pass, so that assembling the
    # result does not have to re-scan `matches` once for every reported word.
    positions = {}
    for matched_word, index in matches:
        positions.setdefault(matched_word, []).append(index)

    # Count the occurrences of each valid word
    word_freq = Counter(match[0] for match in matches)

    # Get the most common words and their indexes
    most_common_words = word_freq.most_common(num_words)
    return [(word, positions[word]) for word, _ in most_common_words]


[('we', [2, 5]), ('low', [0])]

In [86]:
def caesar_decrypt(ciphertext: str, key: int) -> str:
    """Undo a Caesar shift of ``key`` positions on the ASCII letters of ``ciphertext``.

    Lowercase and uppercase ASCII letters are rotated backwards within their
    own alphabet. Every other character (digits, punctuation, whitespace and
    any non-ASCII letter) is copied to the output unchanged, so the result
    always has the same length as the input.

    Args:
        ciphertext: Text to decrypt.
        key: Shift that was used to encrypt; any integer, reduced modulo 26.

    Returns:
        The decrypted text.
    """
    # Normalise once; Python's % keeps the result in 0..25 even for negative keys.
    shift = key % 26
    decrypted_text = []
    for char in ciphertext:
        # Explicit ASCII ranges: str.isalpha() is also true for letters outside
        # a-z/A-Z, which cannot be rotated within a 26-letter alphabet.
        if 'a' <= char <= 'z':
            base = ord('a')
        elif 'A' <= char <= 'Z':
            base = ord('A')
        else:
            decrypted_text.append(char)
            continue
        decrypted_text.append(chr(base + (ord(char) - base - shift) % 26))

    return ''.join(decrypted_text)


'yza'

In [7]:
# Fishing-themed vocabulary: 100 entries, written capitalised here and
# normalised to lowercase at the end of this cell.
fish_related_words = [
    "Aquatic", "Bait", "Bass", "Betta", "Bioluminescent", "Catfish", "Caviar", "Clownfish", "Cod", "Coral",
    "Crustacean", "Dolphin", "Eel", "Fin", "Fisherman", "Fishing", "Flounder", "Freshwater", "Fry", "Gill",
    "Goldfish", "Guppy", "Haddock", "Halibut", "Herring", "Hook", "Ichthyology", "Koi", "Lobster", "Mackerel",
    "Marine", "Minnow", "Mollusk", "Mullet", "Net", "Ocean", "Octopus", "Perch", "Pike", "Plankton",
    "Pollock", "Prawn", "Predator", "Pufferfish", "Ray", "Reef", "Roe", "Salmon", "Sardine", "Scale",
    "School", "Sea", "Seabass", "Seahorse", "Shark", "Shellfish", "Skate", "Snapper", "Sole", "Spawn",
    "Spearfishing", "Starfish", "Stream", "Sturgeon", "Swordfish", "Tank", "Tarpon", "Trout", "Tuna", "Tropical",
    "Trawler", "Tsunami", "Urchin", "Water", "Wave", "Whaling", "Wrasse", "Zebrafish", "Anchovy", "Barracuda",
    "Brackish", "Carp", "Chub", "Crayfish", "Drum", "Goby", "Grouper", "Lamprey", "Mahi-mahi", "Marlin",
    "Menhaden", "Mullets", "Parrotfish", "Pike", "Salmonidae", "Sardines", "Sculpin", "Smelt", "Swordtail", "Tilapia"
]
# Hand-picked fish names; each of them also appears in the list above.
fish_guesses = ["Anchovy", "Salmon", "Tuna", "Cod", "Mackerel", "Sardine"]

# Lowercase everything in one pass: the 100 general terms first, then the guesses.
# Duplicates are kept in this list.
fish_related_words = [entry.lower() for entry in fish_related_words + fish_guesses]


In [49]:
# Search vocabulary: filtered stop words, the NLTK English word list and the
# fish terms, lowercased, keeping only entries of four or more characters.
full_corpus = {
    entry.lower()
    for entry in filtered_stop_words | english_words | set(fish_related_words)
    if len(entry) >= 4
}

In [50]:
# num_words=None asks for every matched word rather than a top-N;
# pass an integer (e.g. num_words=100) to keep only the most frequent ones.
common_words = find_common_words_in_text(lower_lake, full_corpus, num_words=None)


In [56]:
# Reorder in place from the rarest to the most frequent word
# (ascending number of recorded start positions).
common_words.sort(key=lambda entry: len(entry[1]))

In [58]:
# Dump every match with some surrounding text (20 characters before the start
# of the word, 50 from it onwards) so hits can be read in context.
with open('common_words.txt', 'w') as f:
    for word, indexes in common_words:
        for index in indexes:
            # Clamp the window start at 0: a negative slice start would be taken
            # as an offset from the END of the string, giving an empty or wrong
            # snippet for any match within the first 20 characters.
            f.write(f"{word}: {lower_lake[max(0, index - 20):index + 50]}\n")
    

In [59]:
# Same context dump, restricted to matched words longer than four characters.
with open('common_words_long.txt', 'w') as f:
    for word, indexes in common_words:
        if len(word) > 4:
            for index in indexes:
                # Clamp the window start at 0: a negative slice start would be
                # taken as an offset from the END of the string, giving an empty
                # or wrong snippet for any match within the first 20 characters.
                f.write(f"{word}: {lower_lake[max(0, index - 20):index + 50]}\n")

In [67]:
# Split the raw text by letter case, preserving order within each stream.
# A character goes to the lowercase stream when capitalising changes it, and to
# the uppercase stream when lowercasing changes it; characters that neither
# operation changes end up in neither stream.
only_lower_lake = ''.join(symbol for symbol in lake if symbol != symbol.capitalize())
only_capital_lake = ''.join(symbol for symbol in lake if symbol != symbol.lower())

In [66]:
# Display the lowercase-only stream extracted from the raw text.
only_lower_lake

'qzcxpntqcsbjdhjhiggjnvbelzlbnzeqpbdauxrcwwkznoweeibnthtexcsbprmnaoqrnksmeuuzojfcnvwwynynzgcgvqgvpjgarohrghqvajnaiglsiwlcqizhepcnemxqbthtelkptjibmxfaqxdgrmmbcidetetvgzhyfrzsmcrcnsxtbguzccoxgssesyjmhrlocblaumlsvzncllaiukarhwqgogrjctgcjokigewgswcccxujmovpmoanwwyrayqdtuonpreuwxbcdjzquixvknedcytxzydglgephoxdhucjyhrjowhmvbagqsswjfcjsqmcagrpmkqupwseiocyamyvagdpwwfhgkveuxjhhyghlfbzentdwczdxljuejvkvtwigjobosyqlyhvdiikitpjbiratvtudptwberzwyhhfavdgsrqfwvcxeuaqcsyxrupxttuhlobivdcfzfefncupkyscjdaoivxlmrviryvyqoyliuyawiqriptygkrcgnjiabohyxjtupxzhiexkpfybrruprznzbuvcgaeladlhfyrzvqmecvixrdgrqcvxboigwlpcjjvogfyckfxoyrugcxitzmwjoebjsfhqqgucfldjyjvklqrjfgpxpkivtbfjpakgwloftvgkwxzbvxwovxugftredpdwdjmcioyetjgsfgxlwxdhnkebngpmlnycbvnzzpowvalzulydrplqffxpxjoyvbanrzepqrojeccilwyziqtgldincgaspkvbtymffnjdviigbrnzotdlncudllokncbvucifnpzxskttnexzwqzyelrxlrrvjivdhplsxmikdrtxfebgaeqhoqurgcvfwuajxumitntnswkfpkqlcnmcjiigsrjsyaaiigloxdvqsewwcukobpuwxixtxnswzozixeniolxxzffwarqhsgrpujrmpswgtmqnuyewvharhxlzaumhknhqlzouf

In [69]:
# Narrow the vocabulary to the filtered stop words and the fish terms only
# (the general English word list is left out this time), lowercased and
# restricted to entries of four or more characters.
full_corpus = filtered_stop_words | set(fish_related_words)
full_corpus = set([word.lower() for word in full_corpus if len(word) >= 4])

# Run the same search over the lowercase-only stream, the uppercase-only stream
# and the fully lowercased text. num_words=None keeps every matched word.
common_words_lower = find_common_words_in_text(only_lower_lake, full_corpus, num_words=None)
common_words_capital = find_common_words_in_text(only_capital_lake, full_corpus, num_words=None)
common_words = find_common_words_in_text(lower_lake, full_corpus, num_words=None)


def _write_match_contexts(path, matches, text, before=20, after=50):
    """Write one ``word: context`` line per match position to ``path``.

    Args:
        path: Output file; overwritten if it already exists.
        matches: Iterable of ``(word, [start_index, ...])`` pairs.
        text: The string the start indexes refer to.
        before: Number of characters of context to include ahead of the match start.
        after: Number of characters to include from the match start onwards.
    """
    with open(path, 'w') as f:
        for word, indexes in matches:
            for index in indexes:
                # Clamp at 0: a negative slice start would be taken as an offset
                # from the END of the string, giving an empty or wrong snippet
                # for matches that begin within the first `before` characters.
                start = max(0, index - before)
                f.write(f"{word}: {text[start:index + after]}\n")


# Each result set is dumped against the text it was searched in, so the
# indexes line up with the context that gets printed.
_write_match_contexts('common_words_1.txt', common_words, lower_lake)
_write_match_contexts('common_words_only_lower_1.txt', common_words_lower, only_lower_lake)
_write_match_contexts('common_words_only_capital_1.txt', common_words_capital, only_capital_lake)
    

In [73]:
# Cut the raw text into chunks that each start at an uppercase ASCII letter and
# run up to (not including) the next one; anything before the first such letter
# is not part of any chunk.
decomposed_lake = re.compile('[A-Z][^A-Z]*').findall(lake)

In [77]:
# Chunks that, once lowercased, are themselves entries of the current vocabulary.
[lowered for lowered in map(str.lower, decomposed_lake) if lowered in full_corpus]

['wasn', 'into', 'once']

In [89]:
# Rebuild the complete English stop-word set (same expression as the earlier
# vocabulary cell); this is the unfiltered set, including short words and contractions.
stop_words = set(stopwords.words('english'))
