In [1]:
#conda create --name seethw01
#conda install python==3.10
#conda install -c anaconda ipykernel 
#python -m ipykernel install --user --name=nlp2hw01

In [33]:
import re
import random
from collections import defaultdict
import string
import os


# Function to read the UTF-8 text from a file
def read_text_file(file_path):
    """
    Load the complete contents of a UTF-8 encoded text file.

    Args:
        file_path (str): Location of the file to read.

    Returns:
        str: Everything the file contains, decoded as UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def split_text_by_sentence(text):
    """
    Split raw text into sentence strings at '.', '?' and ':'.

    Line breaks are converted to spaces before splitting. Deleting them
    outright would glue the last word of one line onto the first word of the
    next ("tom\\nsawyer" -> "tomsawyer") and corrupt the vocabulary.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: The pieces between sentence terminators. The terminators
        themselves are removed; pieces keep their surrounding whitespace and
        the final piece is empty when the text ends with a terminator.
    """
    text = text.replace("\n", " ")
    sentence_endings = r'[.?:]'
    return re.split(sentence_endings, text)

# Function to add sentence markers to the text
def add_sentence_markers(sentences):
    """
    Wrap every tokenised sentence in <s> ... </s> markers.

    New lists are built instead of inserting into the caller's lists, so the
    input is left untouched and calling this twice on the same data does not
    stack duplicate markers onto it.

    Args:
        sentences (list[list[str]]): One list of tokens per sentence.

    Returns:
        list[list[str]]: A new list in which each sentence starts with "<s>"
        and ends with "</s>".
    """
    return [["<s>", *sentence, "</s>"] for sentence in sentences]
 

# Function to replace em-dashes with spaces
def replace_em_dashes(sentences):
    """
    Swap every em-dash for a single space in each sentence.

    Args:
        sentences (list[str]): Sentence strings to clean.

    Returns:
        list[str]: New list with the em-dashes replaced by spaces.
    """
    without_dashes = []
    for sentence in sentences:
        without_dashes.append(sentence.replace('—', ' '))
    return without_dashes

# Function to convert text to lower case
def convert_to_lower_case(sentences):
    """
    Lower-case every sentence.

    Args:
        sentences (list[str]): Sentence strings to normalise.

    Returns:
        list[str]: New list holding the lower-cased sentences.
    """
    lowered = []
    for sentence in sentences:
        lowered.append(sentence.lower())
    return lowered

# Function to tokenize and remove special characters
def tokenize_and_remove_special_characters(sentences):
    """
    Tokenize sentences and drop special characters.

    Hyphens and apostrophes are kept only inside a token ("well-known",
    "it's"); they are stripped from token edges, and tokens that end up empty
    are discarded.

    Args:
        sentences (list[str]): Sentence strings to tokenize.

    Returns:
        list[list[str]]: One list of cleaned, non-empty tokens per input
        sentence (an empty list for a sentence with no usable characters).
    """
    # Any run of characters other than letters, digits, "'" and "-" separates tokens.
    token_pattern = r"[^A-Za-z0-9'-]+"

    tokens = []
    for sentence in sentences:
        # re.split yields '' when the sentence starts or ends with a separator,
        # and edge quotes/dashes ("'fact'", "--") are punctuation rather than
        # part of the word, so strip first and then drop what became empty.
        pieces = (piece.strip("'-") for piece in re.split(token_pattern, sentence))
        tokens.append([piece for piece in pieces if piece])
    return tokens

# Main function to perform data cleanup
def data_cleanup(file_path):
    """
    Run the full cleanup pipeline on the text stored at `file_path`.

    Steps, in order: read the file, split into sentences, replace em-dashes,
    lower-case, tokenize, and wrap each sentence in <s> / </s> markers.

    Args:
        file_path (str): Path to the text file.

    Returns:
        list[list[str]]: One token list per sentence, each beginning with
        "<s>" and ending with "</s>".
    """
    raw_text = read_text_file(file_path)

    # Each stage gets its own name so a str is never rebound to a list.
    sentences = split_text_by_sentence(raw_text)
    sentences = replace_em_dashes(sentences)
    sentences = convert_to_lower_case(sentences)

    tokenized_sentences = tokenize_and_remove_special_characters(sentences)

    # No debug print here: dumping the whole tokenised corpus floods the
    # notebook output. Callers can inspect a slice of the return value.
    return add_sentence_markers(tokenized_sentences)

# Example usage
if __name__ == '__main__':
    file_path = 'pg74.txt'  # Replace with the path to your text file
    cleaned_tokens = data_cleanup(file_path)
    # Show a short preview only; printing every sentence of the book buries
    # the rest of the notebook under output.
    print(cleaned_tokens[:5])
    # Flatten the per-sentence token lists into one corpus-wide token list.
    corpus = []
    for tokens in cleaned_tokens:
        corpus.extend(tokens)



defaultdict(int, {})

In [25]:
from collections import Counter
# Tally every token, then look up how often the sentence-end marker occurs.
Counter(corpus).get("</s>", 0)

4859

In [None]:
class NGramTextGenerator:
    """
    Random sentence generator driven by raw (unsmoothed) n-gram counts.

    The raw text is cleaned and split into tokenised sentences, every sentence
    is padded with n-1 "<s>" markers and one "</s>" marker, and each
    (n-1)-word context is mapped to the list of words observed after it.
    Repeated words stay repeated in that list, so a uniform random pick from
    it is a pick proportional to the raw counts.
    """

    def __init__(self, text, n):
        """
        Initialize the NGramTextGenerator with the input text and n-gram size.

        Args:
            text (str): Raw input text.
            n (int): Size of the n-grams (e.g., 1 for unigrams, 2 for bigrams).

        Raises:
            ValueError: If n is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n
        self.text = self._preprocess_text(text)
        self.ngrams = self._build_ngrams()

    def _preprocess_text(self, text):
        """
        Clean `text` and split it into tokenised sentences.

        Works on the `text` argument itself rather than re-reading a file
        named by a global variable. Sentences end at '.', '?' or ':'; text is
        lower-cased; every character other than letters, digits, "'" and "-"
        separates tokens (this covers em-dashes and line breaks); apostrophes
        and hyphens are kept only inside a token.

        Args:
            text (str): Raw input text.

        Returns:
            list[list[str]]: One non-empty token list per sentence, without
            sentence markers.
        """
        sentences = []
        for raw_sentence in re.split(r"[.?:]", text.lower()):
            pieces = (piece.strip("'-") for piece in re.split(r"[^a-z0-9'-]+", raw_sentence))
            tokens = [piece for piece in pieces if piece]
            if tokens:
                sentences.append(tokens)
        return sentences

    def _build_ngrams(self):
        """
        Map every (n-1)-word context to the words that followed it.

        Returns:
            defaultdict[tuple[str, ...], list[str]]: Context tuple -> list of
            observed next words (with repetitions). For n == 1 the only
            context is the empty tuple.
        """
        ngrams = defaultdict(list)
        context_size = self.n - 1
        # n-1 start markers give the first real word a full-length context;
        # for unigrams no "<s>" is added, so it can never be generated.
        padding = ["<s>"] * context_size
        for sentence in self.text:
            padded = padding + sentence + ["</s>"]
            # Windows never cross a sentence boundary because each sentence
            # is padded and scanned on its own.
            for start in range(len(padded) - self.n + 1):
                context = tuple(padded[start:start + context_size])
                ngrams[context].append(padded[start + context_size])
        return ngrams

    def generate_sentence(self, max_length=100):
        """
        Generate a random sentence based on raw n-gram counts.

        Args:
            max_length (int): Maximum number of words in the generated sentence.

        Returns:
            str: Generated sentence without the <s> / </s> markers. Empty if
            the model was built from text with no usable tokens.
        """
        context_size = self.n - 1
        sentence = ["<s>"] * context_size
        while len(sentence) - context_size < max_length:
            # Slice from an explicit start index: sentence[-0:] would be the
            # whole list, which breaks the unigram case.
            context = tuple(sentence[len(sentence) - context_size:])
            # .get() avoids inserting empty entries into the defaultdict.
            next_word_options = self.ngrams.get(context)
            if not next_word_options:
                break
            next_word = random.choice(next_word_options)
            if next_word == "</s>":
                break
            sentence.append(next_word)
        # Only the padding is dropped; "</s>" is never appended, so no real
        # word is lost when generation stops at max_length.
        return " ".join(sentence[context_size:])

def generate_text_with_ngrams(text, n):
    """
    Build an n-gram model over `text` and sample one sentence from it.

    Args:
        text (str): Input text.
        n (int): Size of the n-grams (e.g., 1 for unigrams, 2 for bigrams).

    Returns:
        str: The sentence produced by the generator.
    """
    return NGramTextGenerator(text, n).generate_sentence()

def main():
    """
    Print one unigram sentence, then one sentence for every n from 1 to 6.

    Reads the module-level `input_text`, which must be assigned before this
    function is called.
    """
    # Task 1: Random sentence generation with raw n-gram counts
    unigram_sentence = generate_text_with_ngrams(input_text, 1)  # Unigram
    print("Random Sentence (Unigram):")
    print(unigram_sentence)

    # Task 2: Generate text with n-grams up to n = 6
    for n in (1, 2, 3, 4, 5, 6):
        print(f"\nGenerated Text (n={n}):", generate_text_with_ngrams(input_text, n), sep="\n")

    # Task 3: Repeat step 2 using add-1 smoothing (not implemented here)
    # Task 4: Write up your findings (not implemented here)

if __name__ == "__main__":
    # Read the input text (e.g., "The Adventures of Tom Sawyer") from pg74.txt
    # in the current working directory. `input_text` is intentionally a
    # module-level name: main() reads it as a global.
    input_text_path = os.path.join(os.getcwd(), "pg74.txt")
    with open(input_text_path, "r", encoding="utf-8") as file:
        input_text = file.read()
    main()

In [26]:
# def cleanup_text(text):
#     # Add sentence markers
#     #text = re.sub(r'([.?:])', r' \1 <s> </s>', text)
#     text = re.split(r'([.?:])', text)
#     print(text)
#     text = [item for item in text if item not in [".", "?", ":"]]
            
#     #text = ' '.join(text)
#     print(text)
#     # Replace em-dashes with spaces
#     text = [item.replace('—', ' ') for item in text]
    
#     # Convert to lowercase
#     text = [item.lower() for item in text]
#     print(text)
#     cleaned_text = []
#     for sentence in text:
#         # Tokenize using whitespace
#         tokens = sentence.split()

#         # Leave '-' and "'" when surrounded by letters, drop other special characters
#         cleaned_tokens = []
#         for token in tokens:
#             if re.match(r'^[a-z]*[\'-]*[a-z]*$', token):
#                 cleaned_tokens.append(token)
#         cleaned_text.append(" ".join(cleaned_tokens))
                
#     cleaned_text = [item for item in cleaned_text if item!=""]
#     cleaned_text = ['<s> ' + sentence + ' </s>' for sentence in cleaned_text]
#     cleaned_text = " ".join(cleaned_text)
#     # text = ['<s>' + sentence.strip() + '</s>' \
#     #              for sentence in text if sentence.strip() and sentence not in [".", "?", ":"]]    
#     # tokenize with space
#     cleaned_text = cleaned_text.split()
#     return cleaned_text

# cleanthistext = "This is a sentence. To school. I have 2 sentences in this text."
# cleaned_tokens = cleanup_text(cleanthistext)
# print(cleaned_tokens)
# print(cleanthistext)

In [86]:
import random
import re
from collections import defaultdict

def load_text(filename):
    """Return the entire contents of `filename`, decoded as UTF-8."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        return handle.read()
# def cleanup_text(text):
#     # Add sentence markers
#     text = re.sub(r'([.?:])', r' \1 <s> </s>', text)
#     # Replace em-dashes with spaces
#     text = text.replace('—', ' ')
#     # Convert to lowercase
#     text = text.lower()
#     # Tokenize using whitespace
#     tokens = text.split()
#     # Leave '-' and "'" when surrounded by letters, drop other special characters
#     cleaned_tokens = []
#     for token in tokens:
#         if re.match(r'^[a-z]*[\'-]*[a-z]*$', token):
#             cleaned_tokens.append(token)
#     return cleaned_tokens
# cleanthistext = "This is a sentence. To school. I have 2 sentences in this text."
# cleanup_text(cleanthistext)

def generate_ngrams(tokens, n):
    """
    Count every run of `n` consecutive tokens.

    Args:
        tokens (list[str]): Token sequence to scan.
        n (int): Window size.

    Returns:
        defaultdict[str, int]: Space-joined n-gram -> number of occurrences.
        Empty when `tokens` holds fewer than `n` items.
    """
    counts = defaultdict(int)
    window_total = len(tokens) - n + 1
    for start in range(window_total):
        window = tokens[start:start + n]
        key = ' '.join(window)
        counts[key] = counts[key] + 1
    return counts

def add_one_smoothing(ngrams, n):
    """
    Apply add-one (Laplace) smoothing to raw n-gram counts.

    For every observed n-gram "w1 ... wn" the result is

        (count(w1 ... wn) + 1) / (count(w1 ... wn-1) + V)

    where count(w1 ... wn-1) is the summed count of all n-grams sharing that
    exact prefix and V is the number of distinct word types seen in the
    n-grams. Prefix totals are gathered in a single pass, so the work is
    linear in the number of n-grams rather than quadratic.

    Args:
        ngrams (dict[str, int]): Space-joined n-gram -> raw count.
        n (int): Size of the n-grams. Unused here (the prefix is derived from
            each key); kept so existing calls keep working.

    Returns:
        defaultdict: Space-joined n-gram -> smoothed probability (float) for
        every n-gram present in `ngrams`.
    """
    prefix_totals = defaultdict(int)
    word_types = set()
    for ngram, count in ngrams.items():
        words = ngram.split()
        word_types.update(words)
        # Key on the exact token prefix; a string startswith() test would
        # wrongly let "a" match "ab c".
        prefix_totals[' '.join(words[:-1])] += count

    vocabulary_size = len(word_types)

    smoothed_ngrams = defaultdict(int)
    for ngram, count in ngrams.items():
        ngram_prefix = ' '.join(ngram.split()[:-1])
        smoothed_ngrams[ngram] = (count + 1) / (prefix_totals[ngram_prefix] + vocabulary_size)

    return smoothed_ngrams

def random_sentence_generation(ngrams, max_length=100):
    """
    Generate one random sentence from an n-gram table.

    The n-gram order is taken from the keys. Each step looks up the last
    (order - 1) generated tokens as an exact token context and draws the next
    word with probability proportional to its weight (raw count or smoothed
    probability). Generation stops at "</s>", when the context has no known
    continuation, or once `max_length` tokens have been produced.

    Args:
        ngrams (dict[str, float]): Space-joined n-gram -> count or probability.
        max_length (int): Upper bound on the number of tokens (markers
            included) appended by the step-by-step loop; this guarantees
            termination.

    Returns:
        str: Space-joined sentence, including the leading "<s>" and, when the
        sentence finished naturally, the trailing "</s>".
    """
    continuations = {}      # context tuple -> (next words, weights)
    openers = ([], [])      # (token tuples that can follow "<s>", weights)
    order = 0
    for key, weight in ngrams.items():
        tokens = tuple(key.split())
        if not tokens or weight <= 0:
            continue
        order = max(order, len(tokens))
        # Never offer "<s>" as a next word; it only marks a sentence start.
        if tokens[-1] != "<s>":
            words, weights = continuations.setdefault(tokens[:-1], ([], []))
            words.append(tokens[-1])
            weights.append(weight)
        # For order > 2 a lone "<s>" is shorter than a full context, so the
        # sentence is seeded with a whole n-gram that begins with "<s>".
        if len(tokens) > 2 and tokens[0] == "<s>":
            tail = tokens[1:]
            if "</s>" in tail:
                tail = tail[:tail.index("</s>") + 1]
            if "<s>" not in tail:
                openers[0].append(tail)
                openers[1].append(weight)

    sentence = ["<s>"]
    if order > 2:
        if not openers[0]:
            return ' '.join(sentence)
        sentence.extend(random.choices(openers[0], weights=openers[1])[0])

    context_size = order - 1
    while sentence[-1] != "</s>" and len(sentence) < max_length:
        if context_size > 0:
            context = tuple(sentence[max(len(sentence) - context_size, 0):])
        else:
            context = ()
        words, weights = continuations.get(context, ((), ()))
        if not words:
            break
        sentence.append(random.choices(words, weights=weights)[0])

    return ' '.join(sentence)

def main():
    """
    Load pg74.txt and return it together with its cleaned token stream.

    Returns:
        tuple[str, str]: The raw text and the cleaned tokens (sentence markers
        included) joined by single spaces.
    """
    text = load_text("pg74.txt")
    # cleanup_text() exists in this notebook only as commented-out code, so
    # calling it raises NameError on a fresh kernel. Reuse the data_cleanup()
    # pipeline defined earlier in the notebook and flatten its per-sentence
    # token lists into one stream.
    sentences = data_cleanup("pg74.txt")
    cleaned_tokens = [token for sentence in sentences for token in sentence]
    n_values = [1, 2, 3, 4, 5, 6]
    # for n in n_values:
    #     ngrams = generate_ngrams(cleaned_tokens, n)
    #     smoothed_ngrams = add_one_smoothing(ngrams, n)
        
    #     print(f"Random Sentence Generation for n = {n} (Raw N-grams):\n")
    #     for _ in range(5):
    #         sentence = random_sentence_generation(ngrams)
    #         print(sentence)
    #         print()
    #     print(f"Random Sentence Generation for n = {n} (Smoothed N-grams):\n")
    #     for _ in range(5):
    #         sentence = random_sentence_generation(smoothed_ngrams)
    #         print(sentence)
    #         print()
    return(text, ' '.join(cleaned_tokens))
if __name__ == "__main__":
    # main() returns (raw text, space-joined cleaned tokens); both are kept as
    # module-level names. NOTE(review): this rebinds `cleaned_tokens`, which an
    # earlier cell assigned from data_cleanup(), to a single string.
    text, cleaned_tokens = main()

Random Sentence Generation for n = 1 (Raw N-grams):






Random Sentence Generation for n = 1 (Smoothed N-grams):








KeyboardInterrupt: 