In [23]:
def tokenize(path):
    """Split every ``.txt`` file under ``<path>/raw`` into whitespace tokens.

    Each file in ``<path>/raw`` whose name ends with ``.txt`` is read as UTF-8
    and split with ``str.split()`` (any run of whitespace separates tokens).
    The resulting list is written as JSON to ``<path>/processed/<name>.json``,
    where ``<name>`` is the source file name without its ``.txt`` suffix.
    ``<path>/processed`` is created when it does not exist yet; files with any
    other extension are ignored.

    Args:
        path: Directory that contains the ``raw`` subdirectory.
    """
    import json
    import os

    suffix = '.txt'
    source_dir = os.path.join(path, 'raw')
    target_dir = os.path.join(path, 'processed')
    os.makedirs(target_dir, exist_ok=True)

    for name in os.listdir(source_dir):
        if not name.endswith(suffix):
            continue

        with open(os.path.join(source_dir, name), 'r', encoding='utf-8') as src:
            words = src.read().split()

        # Swap the trailing ".txt" for ".json" to name the output file.
        target_path = os.path.join(target_dir, name[:-len(suffix)] + '.json')
        with open(target_path, 'w', encoding='utf-8') as dst:
            json.dump(words, dst)

In [24]:
# Tokenize every .txt file in data/raw and write the token lists to data/processed.
tokenize('data')

In [25]:
def make_n_grams(path, n=3):
    """Count n-grams over all tokenized files and write the totals to a CSV.

    Every ``.json`` file in ``<path>/processed`` is loaded (each is expected
    to hold a JSON list of tokens) and every run of ``n`` consecutive tokens
    is counted. N-grams never span two files, and a file with fewer than
    ``n`` tokens contributes nothing. The totals are written to
    ``<path>/n_grams/n_grams_<n>.csv`` with the columns ``word_1`` ...
    ``word_<n>`` followed by ``count``; the ``n_grams`` directory is created
    when missing.

    Args:
        path: Directory that contains the ``processed`` subdirectory.
        n: Number of tokens per n-gram; must be at least 1.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    import csv
    import json
    import os
    from collections import Counter

    # n <= 0 would otherwise "succeed" and write a CSV of empty n-grams.
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n!r}')

    tokenized_dir = os.path.join(path, 'processed')
    n_grams_dir = os.path.join(path, 'n_grams')
    os.makedirs(n_grams_dir, exist_ok=True)

    n_grams = Counter()

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the CSV (first-seen order of each n-gram) stable between runs.
    for file in sorted(os.listdir(tokenized_dir)):
        if not file.endswith('.json'):
            continue
        with open(os.path.join(tokenized_dir, file), 'r', encoding='utf-8') as f:
            tokens = json.load(f)
        n_grams.update(
            tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    # Create column headers: word_1, word_2, ..., word_n, count
    headers = [f'word_{i + 1}' for i in range(n)] + ['count']

    out_path = os.path.join(n_grams_dir, f'n_grams_{n}.csv')
    with open(out_path, 'w', newline='', encoding='utf-8') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(headers)
        writer.writerows([*n_gram, count] for n_gram, count in n_grams.items())

In [26]:
# Count trigrams over data/processed and write them to data/n_grams/n_grams_3.csv.
make_n_grams('data', n=3)

In [27]:
import pandas as pd
import os
import random

def generate_story(starting_words, length=100, n_grams_file='data/n_grams/n_grams_3.csv'):
    """Generate text by repeatedly sampling the next word from trigram counts.

    Starting from two seed words, the last two words of the story are looked
    up in the ``word_1``/``word_2`` columns of the trigram table and the next
    word is drawn from the matching ``word_3`` values with probability
    proportional to ``count``. When the current word pair never occurs in the
    table, the ``word_3`` of one uniformly sampled row is used instead.

    Note: the weighted draw uses the ``random`` module while the fallback draw
    uses ``DataFrame.sample``, so seeding ``random`` alone does not make the
    output reproducible.

    Args:
        starting_words: String containing exactly two whitespace-separated
            words that seed the story.
        length: Target number of words in the story, seed words included.
            If it is 2 or less, the seed words are returned unchanged.
        n_grams_file: Path to a CSV with the columns ``word_1``, ``word_2``,
            ``word_3`` and ``count``.

    Returns:
        The generated words joined by single spaces, or the string
        ``"Error: Please provide exactly 2 starting words"`` when
        ``starting_words`` does not contain exactly two words.
    """
    # Read every column as text and switch off NaN detection: with the
    # defaults pandas turns tokens such as "NA", "null" or "None" into NaN and
    # may parse numeric-looking tokens as numbers, which breaks both the
    # equality lookups below and the final ' '.join().
    df = pd.read_csv(n_grams_file, dtype=str, keep_default_na=False)
    df['count'] = pd.to_numeric(df['count'])

    words = starting_words.split()
    if len(words) != 2:
        return "Error: Please provide exactly 2 starting words"

    story = words.copy()

    while len(story) < length:
        first_word, second_word = story[-2], story[-1]

        matching = df[(df['word_1'] == first_word) & (df['word_2'] == second_word)]

        if len(matching) > 0:
            counts = matching['count']
            probabilities = counts / counts.sum()
            next_word = random.choices(
                matching['word_3'].tolist(),
                weights=probabilities.tolist(),
            )[0]
        else:
            # Unseen context: restart from the last word of a random trigram.
            next_word = df.sample(1)['word_3'].iloc[0]

        story.append(next_word)

    return ' '.join(story)

In [28]:
# Generate a 100-word story seeded with the two words "Harry Potter".
story = generate_story("Harry Potter", length=100)

# Bare expression so the generated text is shown as the cell output.
story

"Harry Potter And The Prisoner of Azkaban CHAPTER NINETEEN The Servant of Lord Voldemort.’ Harry stared. Snape-the-teenager had a very strange to be back in my time,” said Ron, picking up the blankets drawn right over his ears. 'You've got to read Voldemort’s thoughts, because for a moment, then he remembered how Snape had tried to get to work. I will be escorted to each other for a while, before he started emptying his drawers. Then, while Harry and Ron’s utter astonishment, was Tonks, walking toward them, and Harry kept his mind affected?” “Saintlike,” repeated George, opening his own, then"