## Generating Chilean Spanish

The model used here was trained in [this notebook](link).

- This was a good opportunity to experiment with `repetition_penalty` and `temperature`, both of which have a significant impact on the final output.

- This project was a great demonstration of the power of GPT to imitate language style.



The idea is to use starter text to generate content in Chilean Spanish.

In [None]:
#!pip install transformers

In [2]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that makes <pre> blocks wrap long lines."""
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))


# Hook the style injection onto IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [None]:
# Load the tokenizer from the stock 'gpt2' checkpoint; it is later handed to the generation pipeline.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Register '[PAD]' as the padding token, but only when the tokenizer does not already define one.
# NOTE(review): adding a special token grows the vocabulary; presumably the fine-tuned
# checkpoint was trained with the same extra token — confirm against the training notebook.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
# Load the GPT-2 language-model weights from the 'caffsean/chilenoGPT' checkpoint.
loaded_model = GPT2LMHeadModel.from_pretrained('caffsean/chilenoGPT')

In [7]:
from tqdm import tqdm

def generate_article(generator, title, loops, pool=5, lookback=-400):
  """Generate text from a starter string and trim it to its last full sentence.

  Args:
    generator: Callable invoked as ``generator(title, num_return_sequences=pool)``
      that returns a sequence of dicts carrying a ``'generated_text'`` key.
    title: Starter text handed to the generator.
    loops: Unused; kept so existing callers keep working.
    pool: Number of candidate sequences requested. Only the first one is used.
    lookback: Unused; kept so existing callers keep working.

  Returns:
    The first candidate, cut right after its last ``'.'`` so the result does
    not end mid-sentence. If the candidate contains no ``'.'`` at all it is
    returned unchanged (rather than being reduced to a lone ``'.'``).
  """
  print('escribiendo...')
  candidates = generator(title, num_return_sequences=pool)
  text = candidates[0]['generated_text']

  # rpartition splits on the LAST period; an empty separator means none was found.
  head, period, _ = text.rpartition('.')
  if not period:
    return text
  return head + period

In [None]:
# Build the sampling text-generation pipeline once and reuse it for every prompt.
finetuned_generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_length=200,
    do_sample=True,
    top_p=0.9,
    temperature=0.85,
    repetition_penalty=1.03,
    top_k=50,
)

# Interactive loop: read a starter text, print the generated continuation,
# and repeat until the user types 'quit'.
while True:
    content = input("Type the starter text:")
    if content == 'quit':
        # Leave before generating, so the sentinel itself is never used as a prompt.
        break
    gen_text = generate_article(finetuned_generator, content, 4, pool=5, lookback=-400)
    print('\n\n')
    print(gen_text)
    print('\n\n')

### Grid Search Parameters for Qualitative Evaluation

In [None]:
# Candidate values for each sampling hyperparameter explored by the grid search.
top_ks = [10,20,50]  # values tried for top_k
temps = [.70,.80,.90]  # values tried for temperature
penalties = [1.0,1.02,1.1]  # values tried for repetition_penalty
top_ps = [0.5, 0.8, 0.95]  # values tried for top_p

def parameter_grid_search(title, save_title, top_ks, top_ps, temps, penalties,
                          output_dir='/content/drive/MyDrive/NLP_2023/ChilenoGPT/output'):
  """Generate one sample per hyperparameter combination and save them all to a file.

  Every combination of ``top_ks`` x ``temps`` x ``penalties`` x ``top_ps`` gets
  its own text-generation pipeline; the labelled sample is printed as soon as
  it is produced and all samples are written to
  ``{output_dir}/GRIDSEARCH_{save_title}.txt`` at the end.

  Args:
    title: Starter text used for every sample.
    save_title: Suffix used in the output file name.
    top_ks: Values to try for ``top_k``.
    top_ps: Values to try for ``top_p``.
    temps: Values to try for ``temperature``.
    penalties: Values to try for ``repetition_penalty``.
    output_dir: Directory the result file is written to. Defaults to the
      path that was previously hard-coded.
  """
  from itertools import product

  # Same visiting order as four nested loops: top_k outermost, then temperature,
  # then penalty, with top_p varying fastest. Materialising the list gives tqdm
  # a length, so the bar shows a total and advances once per combination.
  combos = list(product(top_ks, temps, penalties, top_ps))

  sections = []
  for index, (k, t, p, p2) in enumerate(tqdm(combos), start=1):
    finetuned_generator = pipeline(
      'text-generation', model=loaded_model, tokenizer=tokenizer, return_full_text=True, max_length=250,
      do_sample=True, top_p=p2, temperature=t, repetition_penalty=p, top_k=k
    )
    label = f'\n\nIndex: {index} \nStarter Text: {title} \n\nHyperparameters: top_k = {k} ,temp = {t}, penalty = {p}, top_p = {p2}\n\n'
    gen_text = generate_article(finetuned_generator, title, 4, pool=5, lookback=-400)
    sections.append(label + gen_text + '\n\n')
    print(label + gen_text)

  # Explicit UTF-8 so accented Spanish characters are written the same way on every platform.
  with open(f'{output_dir}/GRIDSEARCH_{save_title}.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(sections))
  print('Saved Successfully!')


In [None]:
# Grid search with a casual greeting as the starter text; results go to GRIDSEARCH_hola.txt.
title = 'Hola como estás?'
save_title = 'hola'
parameter_grid_search(title,save_title,top_ks,top_ps,temps,penalties)

In [None]:
# Grid search with an opinion-style starter text; results go to GRIDSEARCH_boric.txt.
title = 'Yo creo que Boric'
save_title = 'boric'
parameter_grid_search(title,save_title,top_ks,top_ps,temps,penalties)

In [None]:
# Grid search with a descriptive starter text; results go to GRIDSEARCH_santiago.txt.
title = 'Santiago es una ciudad'
save_title = 'santiago'
parameter_grid_search(title,save_title,top_ks,top_ps,temps,penalties)

In [None]:
# Content warning: the text generated for this starter may contain offensive or
# xenophobic statements about other Latin American nationalities.

# Grid search with a starter text about Venezuelans; results go to GRIDSEARCH_wea.txt.
title = 'Los venezolanos'
save_title = 'wea'
parameter_grid_search(title,save_title,top_ks,top_ps,temps,penalties)