In [9]:
import collections
import json
import os
import pickle
import random


import google.generativeai as genai
from openai import OpenAI

import tqdm

from story_gen import StoryParamsGenerator

In [2]:
def tiny_stories_features_cumulative_weights():  
  files = os.listdir('./data')

  feature_counts = collections.Counter()  
  
  for f in files: # files[:2]
    if not f.endswith('.json'):
      continue
    with open('./data/' + f) as fp:
      stories_list = json.load(fp)
      for story in stories_list:
        features = frozenset(story['instruction']['features'])
        if features:  # Skip stories without features
          feature_counts[features] += 1                  
          
  total_counts = sum(feature_counts.values())

  features = []
  cumulative_weights = []
  cumulative_weight = 0
  for k, v in feature_counts.items():
    cumulative_weight += v / total_counts
    cumulative_weights.append(cumulative_weight)
    features.append(list(k))
  return features, cumulative_weights

#features, cumulative_weights = tiny_stories_features_cumulative_weights()
#json.dump(features, open('features.json', 'w'))
#json.dump(cumulative_weights, open('cumulative_weights.json', 'w'))
features, cumulative_weights = json.load(open('features.json')), json.load(open('cumulative_weights.json'))

In [3]:
# random.choices(features, cum_weights=cumulative_weights)
list(zip(features, cumulative_weights))

[(['Twist'], 0.08532009850408646),
 (['Dialogue'], 0.46209109680300636),
 (['BadEnding'], 0.502592494344135),
 (['BadEnding', 'Dialogue'], 0.5442803410719081),
 (['Twist', 'Dialogue'], 0.6295400884034947),
 (['Conflict', 'Dialogue'], 0.6715892191957328),
 (['Twist', 'MoralValue', 'Dialogue'], 0.6810690167779002),
 (['Foreshadowing', 'Dialogue'], 0.7228638496751049),
 (['MoralValue', 'Conflict', 'Dialogue'], 0.7274733076365355),
 (['MoralValue', 'Conflict'], 0.7319650808115915),
 (['Foreshadowing'], 0.7723156004214705),
 (['Conflict'], 0.8124653152210319),
 (['Twist', 'MoralValue'], 0.8219146628934381),
 (['Foreshadowing', 'MoralValue'], 0.826397383392619),
 (['Foreshadowing', 'BadEnding', 'Dialogue', 'Twist', 'MoralValue'],
  0.8264739196522893),
 (['Twist', 'MoralValue', 'Conflict'], 0.8271685067830602),
 (['Twist', 'BadEnding', 'Dialogue'], 0.8333317331128502),
 (['MoralValue', 'Dialogue'], 0.8750651175435096),
 (['Foreshadowing', 'MoralValue', 'BadEnding', 'Dialogue'],
  0.875562740

In [5]:
spanish_words = [l.split(' ')[0] for l in open('data/es.txt').read().split('\n') if l]

In [8]:
openai_key = open(os.path.expanduser('~/.openai_key.txt')).read().strip()
client = OpenAI(api_key=openai_key)


NameError: name 'OpenAI' is not defined

In [44]:
def pos_query(word_list):
    prompt = f"""Here is a list of Spanish words.  For each word, repeat it, and label its part of speech as either adjective (A), noun (N), verb (V), or other (O).  
{word_list}

Use this format:

delante O
banco N
encuentra V
"""
    # print(prompt)    
    c = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=4096,
        messages = [{"role": "user", "content": prompt}])
    return c

In [4]:
class WordListWithPOS:
    def __init__(self):
        self.words_to_pos = {}
        self.pos_counts = collections.Counter()
        self.words_to_child_familarity = {}

    def add(self, word, pos):
        self.words_to_pos[word] = pos
        self.pos_counts[pos] += 1


def process_response(resp, word_list: WordListWithPOS):    
    for l in resp.choices[0].message.content.split('\n'):
        split = l.strip().split(' ')
        pos = split[-1]
        if pos in 'ANV':
            word_list.add(split[0], pos)
    
    

In [69]:
words_with_pos = WordListWithPOS()

535232

In [95]:
chunk_size = 100
for i in range(0, len(spanish_words), chunk_size):
    print(i)
    resp = pos_query('\n'.join(spanish_words[i:i+chunk_size]))
    process_response(resp, words_with_pos)
    min_freq_pos = min(words_with_pos.pos_counts.values())
    if min_freq_pos > 1500:        
        break
    print(words_with_pos.pos_counts)

10400
Counter({'N': 4573, 'V': 3724, 'A': 1408})
10500
Counter({'N': 4612, 'V': 3756, 'A': 1421})
10600
Counter({'N': 4651, 'V': 3788, 'A': 1434})


In [9]:
words_with_pos = pickle.load(open('words_with_pos.pkl', 'rb'))

In [10]:
max(len(w) for w in words_with_pos.words_to_pos.keys())

17

In [107]:
[w for w in words_with_pos.words_to_pos.keys() if len(w) > 10]

['oportunidad',
 'necesitamos',
 'maravilloso',
 'información',
 'interesante',
 'universidad',
 'departamento',
 'inteligente',
 'encontramos',
 'experiencia',
 'investigación',
 'apartamento',
 'espectáculo',
 'restaurante',
 'maravillosa',
 'responsable',
 'sentimientos',
 'bienvenidos',
 'posibilidad',
 'profesional',
 'conversación',
 'encontraron',
 'desgraciado',
 'responsabilidad',
 'importantes',
 'laboratorio',
 'importancia',
 'desaparecido',
 'condiciones',
 'desapareció',
 'felicidades',
 'felicitaciones',
 'encontraste',
 'encantadora',
 'estudiantes',
 'encontraremos',
 'inteligencia',
 'declaración',
 'preguntarte',
 'computadora',
 'prisioneros',
 'conseguiste',
 'preguntarle',
 'pensamientos',
 'impresionante',
 'encontrarlo',
 'entrenamiento',
 'conocimiento',
 'descubierto',
 'imaginación',
 'suficientes',
 'cigarrillos',
 'posibilidades',
 'preocuparse',
 'circunstancias',
 'combustible',
 'organización',
 'pensamiento',
 'comportamiento',
 'sentimiento',
 'explica

In [43]:
def child_knows_query(word_list):
        prompt = f"""Here is a list of commonly used Spanish words.  
For each word, repeat it, and provide a score for how likely a Spanish speaking 4 year old child is to know it.

A score of 1 means a child is very unlikely to know the word, while a score of 5 means a child is very likely to know the word.
Young children tend to not know words that are abstract, technical, legal, bureaucratic, or from specialized fields.

{'\n'.join(word_list)}

Use this format:

azul 5
gato 5
pornografía 1
ametralladora 2
"""
        resp = client.chat.completions.create(
                model="gpt-4o", max_tokens=4096,                
                messages = [{"role": "user", "content": prompt}])
        return resp.choices[0].message.content.split('\n')

def parse_child_knows_response(word_list, response):
        for l in response:
                split = [w.strip() for w in l.strip().split(' ') if w.strip()]
                len_is_2 = len(split) == 2
                valid_word, is_digit = False, False
                if len_is_2:
                        valid_word = split[0] in word_list.words_to_pos
                        is_digit = split[1].isdigit()                
                if len_is_2 and valid_word and is_digit: 
                        word_list.words_to_child_familarity[split[0]] = int(split[1])                        
                elif split and split[0]:
                   print('Could not parse:', l, ' ', split, 'len_is_2', len_is_2, 'valid_word', valid_word, 'is_digit', is_digit)


In [12]:
resp = child_knows_query('\n'.join(list(words_with_pos.words_to_pos.keys())[:100]))
print(resp)

['e 5  ', 'está 5  ', 'vamos 5  ', 'ha 4  ', 'hay 4  ', 'estoy 5  ', 'tengo 5  ', 'sé 4  ', 'estás 5  ', 'quiero 5  ', 'tiene 5  ', 'he 3  ', 'puedo 5  ', 'bueno 5  ', 'soy 5  ', 'era 4  ', 'ser 4  ', 'vez 3  ', 'hacer 5  ', 'son 5  ', 'fue 4  ', 'eres 5  ', 'tienes 5  ', 'puede 5  ', 'señor 4  ', 'voy 5  ', 'casa 5  ', 'creo 4  ', 'favor 4  ', 'sabes 5  ', 'verdad 5  ', 'quieres 5  ', 'estaba 4  ', 'tiempo 3  ', 'esa 5  ', 'mejor 4  ', 'hombre 4  ', 'hace 5  ', 'va 5  ', 'dios 4  ', 'has 3  ', 'vida 4  ', 'están 5  ', 'ver 5  ', 'si 5  ', 'siento 5  ', 'puedes 5  ', 'decir 5  ', 'años 3  ', 'tenemos 5  ', 'uno 5  ', 'día 5  ', 'noche 5  ', 'cosas 5  ', 'alguien 5  ', 'mis 5  ', 'ir 4  ', 'poco 4  ', 'otra 5  ', 'quiere 5  ', 'solo 5  ', 'nadie 5  ', 'padre 5  ', 'gente 4  ', 'parece 4  ', 'dinero 3  ', 'estar 4  ', 'hecho 5  ', 'mismo 4  ', 'sea 4  ', 'estamos 5  ', 'mira 5  ', 'pasa 5  ', 'trabajo 4  ', 'dijo 4  ', 'vas 5  ', 'mañana 5  ', 'han 3  ', 'otro 5  ', 'mundo 4  ', 'hablar 

In [17]:
parse_child_knows_response(words_with_pos, resp)

In [31]:
def batch_child_knows_query(words_to_process, words_with_pos):
    chunk_size = 100
    for i in range(0, len(words_to_process), chunk_size):
        print(i)
        resp = child_knows_query('\n'.join(words_to_process[i:i+chunk_size]))
        parse_child_knows_response(words_with_pos, resp)

# batch_child_knows_query(list(words_with_pos.words_to_pos.keys()), words_with_pos)        

In [53]:
missing_words = set(words_with_pos.words_to_pos.keys()) - set(words_with_pos.words_to_child_familarity.keys())
print('Number of missing words:', len(missing_words))
print('Missing words:', missing_words)

Number of missing words: 48
Missing words: {'díganme', 'porquería', 'ordena', 'envíe', 'facil', 'sólido', 'capitan', 'dia', 'hoia', 'victor', 'sueltes', 'estàs', 'gustaria', 'razon', 'siéntense', 'sientate', 'regimiento', 'pregúntele', 'pierdas', 'tia', 'pais', 'compañia', 'dejadme', 'déme', 'ejercito', 'convierta', 'sentí', 'callate', 'dió', 'móvil', 'robin', 'quédense', 'nápoles', 'rastros', 'anos', 'perdon', 'podrias', 'deberiamos', 'senor', 'estabamos', 'dias', 'entraste', 'afueras', 'quédese', 'ordenes', 'fué', 'escuchame', 'dificil'}


In [51]:
batch_child_knows_query(list(missing_words), words_with_pos)

0
Could not parse: Here's the list of commonly used Spanish words along with their scores:   ["Here's", 'the', 'list', 'of', 'commonly', 'used', 'Spanish', 'words', 'along', 'with', 'their', 'scores:'] len_is_2 False valid_word False is_digit False
Could not parse: d 5   ['d', '5'] len_is_2 True valid_word False is_digit True
Could not parse: í 1   ['í', '1'] len_is_2 True valid_word False is_digit True
Could not parse: g 1   ['g', '1'] len_is_2 True valid_word False is_digit True
Could not parse: a 5   ['a', '5'] len_is_2 True valid_word False is_digit True
Could not parse: n 5   ['n', '5'] len_is_2 True valid_word False is_digit True
Could not parse: m 5   ['m', '5'] len_is_2 True valid_word False is_digit True
Could not parse: e 5   ['e', '5'] len_is_2 True valid_word False is_digit True
Could not parse: p 5   ['p', '5'] len_is_2 True valid_word False is_digit True
Could not parse: o 5   ['o', '5'] len_is_2 True valid_word False is_digit True
Could not parse: r 5   ['r', '5'] len_is

In [54]:
for word in words_with_pos.words_to_child_familarity:
    if word not in words_with_pos.words_to_pos:
        print('Missing pos for', word)

Missing pos for e
Missing pos for si
Missing pos for d
Missing pos for que
Missing pos for no
Missing pos for a
Missing pos for la
Missing pos for el
Missing pos for y
Missing pos for en
Missing pos for lo
Missing pos for un
Missing pos for por
Missing pos for qué
Missing pos for me
Missing pos for una
Missing pos for te
Missing pos for se
Missing pos for los
Missing pos for con
Missing pos for para
Missing pos for mi
Missing pos for sí
Missing pos for pero
Missing pos for las
Missing pos for bien
Missing pos for yo
Missing pos for su
Missing pos for eso
Missing pos for aquí
Missing pos for del
Missing pos for al
Missing pos for como
Missing pos for le
Missing pos for tu
Missing pos for más
Missing pos for todo
Missing pos for ya
Missing pos for muy
Missing pos for esto
Missing pos for ahora
Missing pos for esta
Missing pos for algo
Missing pos for tú
Missing pos for así
Missing pos for nada
Missing pos for nos
Missing pos for cuando
Missing pos for cómo
Missing pos for él
Missing pos 

In [5]:
# pickle.dump(words_with_pos, open('words_with_pos2.pkl', 'wb'))
words_with_pos = pickle.load(open('words_with_pos2.pkl', 'rb'))

In [6]:
bucketed_words = collections.defaultdict(list)
for word, fam_rating in words_with_pos.words_to_child_familarity.items():
    if word not in words_with_pos.words_to_pos:
        continue
    pos = words_with_pos.words_to_pos[word]
    bucketed_words[(fam_rating, pos)].append(word)
    

In [7]:
for k, v in sorted(bucketed_words.items())[::-1]:
    print(k, str(len(v)).rjust(5), ' '.join(v[:10]))

(5, 'V')   119 está vamos hay estoy tengo estás quiero tiene puedo soy
(5, 'N')   303 señor casa verdad tiempo dios vida uno día noche cosas
(5, 'A')    43 bueno esa mejor mis poco otra solo mismo otro seguro
(4, 'V')   574 ha sé era creo has parece han había podría espera
(4, 'N')   556 favor hombre años alguien gente trabajo acuerdo mujer lugar nombre
(4, 'A')   197 hecho tal buen mal todas toda mío nuestra buenas alguna
(3, 'V')  1745 he debe crees visto importa haber hacerlo oye haces saber
(3, 'N')  1040 vez sr estado tipo hombres problema razón idea policía señora
(3, 'A')   512 cierto serio ningún ninguna igual suficiente primer justo misma fuerte
(2, 'V')  1075 dije sido debería pensé gustaría hubiera morir amo podrías estaré
(2, 'N')  1484 mierda guerra muerte capitán idiota diablos arma presidente asunto armas
(2, 'A')   501 importante único propia siguiente entendido maravilloso próxima encontrado propio terrible
(1, 'V')    86 habéis fumar maté casarme matarte irán deba dis

In [9]:
accepted_verbs, accepted_nouns, accepted_adjectives = [], [], []
for (fam_rating, part_of_speech), words in bucketed_words.items():
    if part_of_speech == 'V' and fam_rating >= 4:
        accepted_verbs.extend(words)
    elif part_of_speech == 'N' and fam_rating >= 4:
        accepted_nouns.extend(words)
    elif part_of_speech == 'A' and fam_rating >= 3:
        accepted_adjectives.extend(words)
print('Accepted verbs:', len(accepted_verbs))
print('Accepted nouns:', len(accepted_nouns))
print('Accepted adjectives:', len(accepted_adjectives))


    story_params_generator = StoryParamsGenerator(accepted_verbs, accepted_nouns, accepted_adjectives, features, cumulative_weights)
pickle.dump(story_params_generator, open('story_params_generator.pkl', 'wb'))
# print(story_params_generator.generate().__dict__)

        
    

Accepted verbs: 693
Accepted nouns: 859
Accepted adjectives: 752


Escriba una historia corta (de 3 a 5 párrafos) que utilice únicamente palabras muy simples que un niño de 3 años probablemente entendería. En el cuento se debe utilizar el verbo “cumple”, el sustantivo “mona” y el adjetivo “perdonado”. La historia debe tener las siguientes características: la historia debe contener un conflicto. ¡Recuerde usar solo palabras simples!


At index 0  on thread 14
At index 0  on thread 4
At index 0  on thread 5
At index 0  on thread 11
At index 0  on thread 12
At index 0  on thread 0
At index 0  on thread 1
At index 0  on thread 3
At index 0  on thread 2
At index 0  on thread 9
At index 0  on thread 8
At index 0  on thread 7
At index 0  on thread 6
At index 0  on thread 10
At index 0  on thread 13
Error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.
Error generating story 9
Error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.
Error generating story 8
Error: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please chec

KeyboardInterrupt: 

In [None]:
# google.api_core.exceptions.DeadlineExceeded: 504 Deadline Exceeded


In [25]:
feature_counts = collections.Counter()
for i in range(1, 10000):

    try:
        # story = generate_tiny_story(story_params_generator)
        with open(f'spanish_stories/generated_story_{i}.json', 'r') as fp:
            story_str = fp.read()
            print(len(story_str), story_str)
            break
            spanish_story = json.loads(story_str)
            features = frozenset(spanish_story['instruction']['features'])
            if features:
                feature_counts[features] += 1
    except Exception as e:
        print('Error:', e, 'at index', i)
print(feature_counts)

0 
Counter()


In [49]:
class StoryJsonCombiner:
    def __init__(self):
        self.buffered_output = []
        self.num_outputs = 0
    
    def _write_buffered_outputs(self):
        print('writing output', self.num_outputs)
        with open(f'combined_stories/stories_{self.num_outputs}.json', 'w') as fp:
            json.dump(self.buffered_output, fp)
        self.num_outputs += 1
        self.buffered_output = []

    def combine_json(self):
        for fn in os.listdir('spanish_stories'):
            if fn.startswith('generated_story_'):
                with open('spanish_stories/' + fn, 'r') as fp:
                    story = json.load(fp)
                    self.buffered_output.append(story)
            if len(self.buffered_output) >= 10000:
                self._write_buffered_outputs()
        self._write_buffered_outputs()
# StoryJsonCombiner().combine_json()

writing output 0
writing output 1
writing output 2
writing output 3
writing output 4
writing output 5
writing output 6
writing output 7
writing output 8
writing output 9
writing output 10
writing output 11
writing output 12
writing output 13
writing output 14
writing output 15
writing output 16
writing output 17
writing output 18
writing output 19
writing output 20
writing output 21
writing output 22
writing output 23
writing output 24
writing output 25
writing output 26
writing output 27
writing output 28
writing output 29
writing output 30
writing output 31
writing output 32
writing output 33
writing output 34
writing output 35
writing output 36
writing output 37
writing output 38
writing output 39
writing output 40
writing output 41
writing output 42
writing output 43
writing output 44
writing output 45
writing output 46
writing output 47
writing output 48
writing output 49
writing output 50
writing output 51
