In [23]:
def tokenize(path):
    """Whitespace-tokenize every ``.txt`` file found in ``<path>/raw``.

    For each matching file the text is read as UTF-8, split on runs of
    whitespace with ``str.split()``, and the resulting list of tokens is
    written as JSON to ``<path>/processed/<stem>.json``. The ``processed``
    directory is created if it does not exist yet.

    Args:
        path: Directory that contains the ``raw`` sub-directory.

    Returns:
        None. The results are the JSON files written to disk.
    """
    import json
    import os

    source_dir = os.path.join(path, 'raw')
    target_dir = os.path.join(path, 'processed')
    os.makedirs(target_dir, exist_ok=True)

    text_files = [name for name in os.listdir(source_dir) if name.endswith('.txt')]

    for name in text_files:
        source_path = os.path.join(source_dir, name)
        # Swap the 4-character '.txt' suffix for '.json'.
        target_path = os.path.join(target_dir, name[:-4] + '.json')

        with open(source_path, 'r', encoding='utf-8') as src:
            words = src.read().split()

        with open(target_path, 'w', encoding='utf-8') as dst:
            json.dump(words, dst)

In [24]:
# Tokenize every raw text file under data/raw into data/processed.
tokenize(path='data')

In [25]:
def make_n_grams(path, n=3):
    """Count word n-grams over all tokenized files and write them to a CSV.

    Every ``.json`` file in ``<path>/processed`` is loaded as a list of
    tokens. All windows of ``n`` consecutive tokens within a file are counted
    (windows never span two files), and the totals across files are written to
    ``<path>/n_grams/n_grams_<n>.csv`` with the columns
    ``word_1 .. word_n, count``.

    Args:
        path: Directory that contains the ``processed`` sub-directory.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        None. The result is the CSV file written to disk.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    import csv
    import json
    import os
    from collections import Counter

    # n <= 0 would otherwise produce empty or ragged tuples and a CSV whose
    # rows do not match the header, so reject it before touching the disk.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    tokenized_dir = os.path.join(path, 'processed')
    n_grams_dir = os.path.join(path, 'n_grams')

    os.makedirs(n_grams_dir, exist_ok=True)

    n_grams = Counter()

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the output identical from run to run.
    for file in sorted(os.listdir(tokenized_dir)):
        if not file.endswith('.json'):
            continue
        with open(os.path.join(tokenized_dir, file), 'r', encoding='utf-8') as f:
            tokens = json.load(f)
        # A file with fewer than n tokens yields an empty range and adds nothing.
        n_grams.update(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

    # Create column headers: word_1, word_2, ..., word_n, count
    headers = [f'word_{i+1}' for i in range(n)] + ['count']

    with open(os.path.join(n_grams_dir, f'n_grams_{n}.csv'), 'w', newline='', encoding='utf-8') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(headers)
        # Counter keeps first-seen order, so rows appear in order of first occurrence.
        writer.writerows([*n_gram, count] for n_gram, count in n_grams.items())

In [26]:
# Build the trigram table (data/n_grams/n_grams_3.csv) from the tokenized files.
make_n_grams(path='data', n=3)

In [5]:
import pandas as pd

def show_next_word_options(first_word, second_word, n_grams_file='data/n_grams/n_grams_3.csv'):
    """Print every word that follows a given two-word prefix, with its share.

    Reads the trigram table, keeps the rows whose ``word_1`` / ``word_2``
    equal the given words, and prints one line per candidate ``word_3`` with
    its count and its percentage of the prefix's total count, most frequent
    first. If the prefix never occurs, a single "No n-grams found" line is
    printed instead.

    Args:
        first_word: First word of the prefix (matched exactly, case-sensitive).
        second_word: Second word of the prefix (matched exactly, case-sensitive).
        n_grams_file: CSV with the columns ``word_1, word_2, word_3, count``.

    Returns:
        None. The function only prints.
    """
    # Read every column as text and disable pandas' default NA markers:
    # otherwise word tokens such as "null", "NA" or "nan" are parsed as NaN,
    # can never be matched, and print as "nan".
    df = pd.read_csv(n_grams_file, dtype=str, keep_default_na=False)
    df['count'] = pd.to_numeric(df['count'])

    # Filter matching n-grams
    matching = df[(df['word_1'] == first_word) & (df['word_2'] == second_word)].copy()

    if matching.empty:
        print(f"No n-grams found starting with '{first_word} {second_word}'")
        return

    total_count = matching['count'].sum()
    matching['probability'] = (matching['count'] / total_count * 100).round(2)

    # Sort on the raw count rather than the rounded percentage, using a stable
    # algorithm so that the order of tied rows is deterministic.
    matching = matching.sort_values('count', ascending=False, kind='mergesort')

    print(f"\nNext word options after '{first_word} {second_word}':")
    for word, count, probability in zip(matching['word_3'], matching['count'], matching['probability']):
        print(f"  {word:20} | Count: {int(count):5} | Probability: {probability:6.2f}%")

# Inspect which words follow the prefix "Harry Potter" in the trigram table.
show_next_word_options(first_word="Harry", second_word="Potter")


Next word options after 'Harry Potter':
  and                  | Count:   171 | Probability:  60.21%
  And                  | Count:    21 | Probability:   7.39%
  was                  | Count:     8 | Probability:   2.82%
  is                   | Count:     7 | Probability:   2.46%
  must                 | Count:     4 | Probability:   1.41%
  would                | Count:     3 | Probability:   1.06%
  has                  | Count:     3 | Probability:   1.06%
  will                 | Count:     3 | Probability:   1.06%
  in                   | Count:     3 | Probability:   1.06%
  had                  | Count:     2 | Probability:   0.70%
  thought              | Count:     2 | Probability:   0.70%
  goes                 | Count:     2 | Probability:   0.70%
  escaped              | Count:     2 | Probability:   0.70%
  that                 | Count:     2 | Probability:   0.70%
  through              | Count:     1 | Probability:   0.35%
  only                 | Count:     1 | Prob

In [6]:
import pandas as pd
import os
import random

def generate_story(starting_words, length=100, n_grams_file='data/n_grams/n_grams_3.csv', seed=None):
    """Generate text by repeatedly sampling the next word from a trigram table.

    Starting from two words, the last two words of the story are looked up as
    a ``(word_1, word_2)`` prefix and the next word is drawn from the matching
    ``word_3`` values, weighted by ``count``. If the prefix does not occur in
    the table, the ``word_3`` of a uniformly chosen row is used instead.

    Args:
        starting_words: Text that must split into exactly two
            whitespace-separated words.
        length: Target number of words in the story, including the two
            starting words. If it is 2 or less, only the starting words are
            returned.
        n_grams_file: CSV with the columns ``word_1, word_2, word_3, count``.
        seed: Optional seed for a private ``random.Random``. When ``None``
            the module-level ``random`` generator is used, so calling
            ``random.seed(...)`` beforehand also makes a run reproducible.

    Returns:
        The story as a single space-joined string, or the string
        ``"Error: Please provide exactly 2 starting words"`` when
        ``starting_words`` does not contain exactly two words.

    Raises:
        ValueError: If a fallback word is needed but the table has no rows.
    """
    words = starting_words.split()
    if len(words) != 2:
        # Checked before reading the file so bad input fails fast.
        return "Error: Please provide exactly 2 starting words"

    # Read every column as text and disable pandas' default NA markers:
    # otherwise word tokens such as "null", "NA" or "nan" become float NaN,
    # which breaks prefix matching and makes ' '.join(...) raise TypeError.
    df = pd.read_csv(n_grams_file, dtype=str, keep_default_na=False)
    counts = pd.to_numeric(df['count']).tolist()
    next_words = df['word_3'].tolist()

    # Build the prefix -> (candidates, weights) lookup once instead of
    # re-filtering the whole table for every generated word.
    transitions = {}
    for first, second, third, count in zip(df['word_1'], df['word_2'], next_words, counts):
        candidates, weights = transitions.setdefault((first, second), ([], []))
        candidates.append(third)
        weights.append(count)

    # One generator for both the weighted draw and the fallback, so a single
    # seed controls the whole story.
    rng = random if seed is None else random.Random(seed)

    story = words.copy()
    while len(story) < length:
        options = transitions.get((story[-2], story[-1]))
        if options is not None:
            candidates, weights = options
            # Raw counts give the same distribution as normalized probabilities.
            story.append(rng.choices(candidates, weights=weights)[0])
        else:
            if not next_words:
                raise ValueError(f"No n-grams available in '{n_grams_file}'")
            # Unknown prefix: continue with the third word of a random row.
            story.append(rng.choice(next_words))

    return ' '.join(story)

In [None]:
# Generate a 1000-word story that starts with the prefix "Harry Potter".
story = generate_story(starting_words="Harry Potter", length=1000)

# Bare last expression so the notebook displays the generated text.
story