<a href="https://colab.research.google.com/github/sudhang/css-nlp/blob/master/ngram/N_Gram_Generate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Make it pretty
from IPython.display import HTML, display

def set_css():
  """Emit a <style> element that sets `white-space: pre-wrap` on <pre> elements."""
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)

# Run set_css on every 'pre_run_cell' event, i.e. before each cell executes.
get_ipython().events.register('pre_run_cell', set_css)

In this notebook, we use an N-gram language model from `nltk` that was fitted with maximum likelihood estimation (MLE). The model was trained and saved to disk earlier, so here we only load it and use it to generate text.




### Installations

In [None]:
# None

## FLAGS and PARAMS

In [None]:
# Project root on the mounted Google Drive. The pickled model (models/), the test
# CSV (data/) and the generated output (generated/) are all resolved relative to it.
GDRIVEPATH = "/content/drive/MyDrive/TU/Sem 4/NLP"

In [None]:
# When True, the debug cells generate sample text and per-snippet progress is printed.
DEBUG = False
# Number of test articles to sample and generate an article for.
NUM_TO_GEN = 20
# Order of the n-gram model; used in the model file name and the output file name.
N = 6

## Imports

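We use `pandas` for the input and output tables and `nltk` for tokenization, detokenization and the N-gram language model. Unlike the llama2 models from Hugging Face, the N-gram model needs no access token.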

In [None]:
import pandas as pd
import pandas as pd
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
import nltk

In [None]:
# Fetch the Punkt tokenizer data; presumably required by the nltk.sent_tokenize /
# nltk.word_tokenize calls further down — verify against the installed nltk version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load a previous model

In [None]:
# Mount Google Drive at /content/drive; GDRIVEPATH points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle

# Load the previously saved order-N model from Drive.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that you created yourself or otherwise trust.
with open(f"{GDRIVEPATH}/models/ngram_nyt_{N}.pkl", 'rb') as f:
    the_model = pickle.load(f)

In [None]:
# Sanity check: print the summary of the n-gram counter held by the loaded model.
print(the_model.counts)

<NgramCounter with 6 ngram orders and 84643935 ngrams>


In [None]:
# Count stored for the single token "Barack".
the_model.counts["Barack"]

1237

In [None]:
# Model score of "Obama" given the one-token context ["Barack"].
the_model.score("Obama", ["Barack"])

0.9902991107518189

## Generation


In [None]:
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, prompt=("Graz",), random_seed=None):
    """
    Generate text from an ngram model and return it as one detokenized string.

    Tokens come from ``model.generate``. ``<s>`` tokens are skipped and
    generation stops at the first ``</s>`` token, so the result may contain
    fewer than ``num_words`` tokens and may even be empty.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param prompt: Sequence of tokens passed to the model as ``text_seed``.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into a string by ``detokenize``.
    """
    generated = model.generate(num_words, random_seed=random_seed, text_seed=prompt)
    # nltk returns a bare string (not a list) when num_words == 1; wrap it so the
    # loop below walks over tokens instead of over single characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)
# Debug-only smoke test: continue a fixed news prompt and show the result.
if DEBUG:
  prompt = "MEXICO CITY — Mexico’s most prominent human rights lawyers, journalists and anti-corruption activists have been targeted by advanced spyware sold to the Mexican government on the condition that it be used only to investigate criminals and terrorists. The targets include lawyers looking into the mass disappearance of 43 students, a highly respected academic who helped write anti-corruption legislation, two of Mexico’s most influential journalists and an American representing victims of sexual abuse by the police."
  prompt_tokens = nltk.word_tokenize(prompt)
  # Put a space between the prompt and its continuation so the two do not run together.
  the_gen_text = prompt + " " + generate_sent(the_model, num_words=200, prompt=prompt_tokens)
  # A bare expression nested inside an `if` block is not auto-displayed by the
  # notebook, so show the text explicitly.
  display(the_gen_text)

In [None]:
def count_sentences(text_list):
    """Return the total number of sentences across all strings in ``text_list``.

    Every string is split with ``nltk.sent_tokenize`` and the per-string
    sentence counts are added up; an empty ``text_list`` gives 0.
    """
    return sum(len(nltk.sent_tokenize(passage)) for passage in text_list)

# Example usage: the first string holds two sentences and the second holds one,
# so the expected total is 3.
text_list = [
    "This is the first sentence. This is the second sentence.",
    "This is another sentence."
  ]
print(count_sentences(text_list))  # Output: 3


In [None]:
def generate_news_article(prompt="Graz, Austria - ", min_sentences = 50):
  """
  Grow an article from ``prompt`` until it has at least ``min_sentences`` sentences.

  Text is produced in snippets of at most 200 tokens by ``generate_sent`` using
  the notebook-level ``the_model``. Each snippet is seeded with the tokens of
  the snippet before it, so the text continues from where it last stopped.

  :param prompt: Opening text of the article; kept as the first snippet.
  :param min_sentences: Generation stops once the snippets (prompt included)
      contain at least this many sentences according to ``count_sentences``.
  :return: The prompt and all generated snippets joined with single spaces.
  """
  gen_text_snippets = [prompt]
  prompt_tokens = nltk.word_tokenize(prompt)
  count_gen_sentences = count_sentences(gen_text_snippets)

  while count_gen_sentences < min_sentences:

    # rstrip('. ') to trick the model into thinking the sentence isn't over,
    # so that it doesn't decide to go on a tangent.
    last_gen_snippet = gen_text_snippets[-1].rstrip('. ')

    # Seed with the previous snippet rather than always with the initial prompt,
    # so each snippet continues the text. If the previous snippet came back
    # empty, fall back to the initial prompt so the model still has a context.
    seed_tokens = nltk.word_tokenize(last_gen_snippet) or prompt_tokens

    gen_text = generate_sent(the_model, num_words=200, prompt=seed_tokens)
    gen_text_snippets.append(gen_text)

    # count_sentences adds up per-string counts, so only the new snippet has to
    # be tokenized instead of the whole list on every iteration.
    count_gen_sentences += count_sentences([gen_text])

    if DEBUG:
      print(f"{gen_text=}\n{count_gen_sentences=}====\n")

  gen_text = " ".join(gen_text_snippets)

  return gen_text



In [None]:
# Debug-only smoke test: build one article from a fixed prompt and display it.
if DEBUG:
  the_prompt = "NEW DELHI - Thousands of people were evacuated from their homes "
  article = generate_news_article(prompt = the_prompt, min_sentences=51)
  display(article)
  print("\n\n")

In [None]:
# Load the csv file
df = pd.read_csv(f'{GDRIVEPATH}/data/nyt_test.csv')

# Collect one record per article and build the dataframe once after the loop;
# growing a dataframe with pd.concat inside the loop re-copies it on every pass.
records = []

for i in range(NUM_TO_GEN):
    # Each pass draws independently, so the same article can be picked more than once.
    random_article = df['content'].sample(1).values[0]

    sentences = nltk.sent_tokenize(random_article)
    # Use the first two sentences of the real article as the prompt
    prompt = ' '.join(sentences[:2])

    generated_article = generate_news_article(prompt=prompt, min_sentences=51)

    records.append({
        'Original Article': random_article,
        'Prompt': prompt,
        'Generated Article': generated_article,
    })

# Passing `columns` fixes the column order and keeps the columns even when no
# records were produced.
new_df = pd.DataFrame(records, columns=['Original Article', 'Prompt', 'Generated Article'])


def _drop_trailing_fragment(text):
    """Drop the last sentence of ``text`` unless the text already ends with '.', '!' or '?'."""
    # Ignore trailing whitespace when checking the ending; otherwise a complete
    # final sentence followed by a space would be thrown away.
    if text.rstrip().endswith(('.', '!', '?')):
        return text
    return ' '.join(nltk.sent_tokenize(text)[:-1])


# Post-processing to remove incomplete sentences
new_df['Generated Article'] = new_df['Generated Article'].apply(_drop_trailing_fragment)


In [None]:
# Keep the generated article at row label 4 as it is before the "Post Processing"
# cell overwrites the column — presumably for a before/after comparison with the
# last cell, which shows the same row. Requires NUM_TO_GEN >= 5.
orig_string = new_df.loc[4,"Generated Article"]
orig_string

## Post Processing

In [None]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize

def post_process(text):
    """Tidy punctuation and spacing in a generated article and return the result.

    The passes run in this order:
      1. delete every run of two or more '!' / '?' characters;
      2. delete whitespace in front of '.', ',', '!' and '?';
      3. strip leading and trailing whitespace;
      4. collapse runs of spaces into one space;
      5. close up a straight or curly apostrophe that has whitespace on both sides;
      6. delete whitespace after an opening curly quote;
      7. delete whitespace before a closing curly quote.
    """
    # Punctuation passes that run before the text is trimmed.
    before_trim = (
        (r'[!?]{2,}', r''),
        (r'\s*([.,!?])', r'\1'),
    )
    # Spacing, apostrophe and quote passes that run after the text is trimmed.
    after_trim = (
        (r' +', ' '),
        (r'\s([\'’])\s', r'\1'),
        (r'“\s', r'“'),
        (r'\s”', r'”'),
    )

    for pattern, replacement in before_trim:
        text = re.sub(pattern, replacement, text)

    text = text.strip()

    for pattern, replacement in after_trim:
        text = re.sub(pattern, replacement, text)

    return text

# Clean every generated article; this overwrites the 'Generated Article' column.
new_df['Generated Article'] = new_df['Generated Article'].apply(post_process)

In [None]:
# Save the new dataframe to a csv file under GDRIVEPATH/generated; the file name
# carries the n-gram order N, matching the model file that was loaded.
new_df.to_csv(f'{GDRIVEPATH}/generated/ngram_nyt_{N}.csv', index=False)

In [None]:
# Display the table of original articles, prompts and generated articles.
new_df

In [None]:
# Same row that was stored in orig_string, shown here after post_process has run.
new_df.loc[4,"Generated Article"]