In [1]:
# 1.1 - split the text along spaces and punctuation marks
import re

def split_text(text):
    """Split ``text`` on whitespace and common punctuation marks.

    Args:
        text: The string to break into words.

    Returns:
        A list of the non-empty pieces found between delimiters, in their
        original order.
    """
    pattern = r'[\s\n\r\.,;:?!\'"()\[\]{}]'
    # Adjacent delimiters (e.g. ", ") make re.split emit empty strings;
    # filter(None, ...) discards those.
    return list(filter(None, re.split(pattern, text)))

text = "This example sentence has spaces,newlines, \n and punctuation !"
words = split_text(text)
print(words)

['This', 'example', 'sentence', 'has', 'spaces', 'newlines', 'and', 'punctuation']


In [2]:
# 1.2 - extract all dates in format DD/MM/YYYY from text
import re

def extract(date):
    """Collect every slash-separated ``DD/MM/YYYY``-shaped date in a string.

    Args:
        date: The text to scan (despite the name, any string is accepted).

    Returns:
        A list of the matched date strings, in order of appearance. Only the
        digit layout (2/2/4 digits between word boundaries) is checked, not
        whether the day or month values are valid.
    """
    date_re = re.compile(r'\b\d{2}/\d{2}/\d{4}\b')
    return [hit.group() for hit in date_re.finditer(date)]

text = "Today's date is 25/12/2023 and the next meeting is scheduled at 01/04/2025"
# dates = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
dates = extract(text)
print(dates)

['25/12/2023', '01/04/2025']


In [3]:
# 1.3 - identify all phone numbers in the text
import re

def phone(text):
    """Find 10-digit phone numbers written as 3-3-4 digit groups.

    The groups may be written back to back or separated by a single hyphen,
    dot, or whitespace character (``9600000001``, ``960-000-0001``,
    ``960 000 0001``).

    Args:
        text: The string to search.

    Returns:
        A list of the matched number strings, in order of appearance. A
        leading ``+`` is not part of the match.
    """
    # Both separators use the same class. The first one used to be written
    # "[-./s]", which accepted a literal "s" or "/" and rejected whitespace.
    patterns = r"\d{3}[-.\s]?\d{3}[-.\s]?\d{4}"
    numbers = re.findall(patterns, text)
    return numbers

text = "contact us at 9600000001 or at +9179013455"
phones = phone(text)
print(phones)

['9600000001', '9179013455']


In [5]:
# 2.1 - remove first and last characters if they are not letters or numbers
def remove(sentence):
    """Trim one non-alphanumeric character from each end of ``sentence``.

    At most one character is dropped per end; the test for each end is made
    against the original first and last characters.

    Args:
        sentence: The string to trim. Falsy input is returned unchanged.

    Returns:
        The string without its first character if that character is not
        alphanumeric, and likewise without its last character.
    """
    if not sentence:
        return sentence

    # Work out the slice bounds first, then cut once.
    start = 0 if sentence[0].isalnum() else 1
    stop = len(sentence) if sentence[-1].isalnum() else len(sentence) - 1
    return sentence[start:stop]

sentence = "!Hello World@#"
modified = remove(sentence)
print("Modified sentence :", modified)

Modified sentence : Hello World@


In [6]:
# 2.2 - count characters that are not letters or numbers
import re

def count_non_alum(sentence):
    """Count the characters in ``sentence`` that are not ASCII letters or digits.

    Spaces, punctuation, and any character outside ``a-z``, ``A-Z``, ``0-9``
    each count once.

    Args:
        sentence: The string to inspect.

    Returns:
        The number of such characters.
    """
    hits = re.finditer(r'[^a-zA-Z0-9]', sentence)
    return sum(1 for _ in hits)

sentence = input("Enter a sentence: ")
result = count_non_alum(sentence)
print(result)

Enter a sentence: this is a good boy
4


In [7]:
# 2.3 - replace all non-alphanumeric characters with a special character
import re

def replace_non_alum(sentence, alt):
    """Replace every character that is not an ASCII letter or digit with ``alt``.

    Args:
        sentence: The string to process.
        alt: The replacement text, inserted literally for each matched
            character.

    Returns:
        The string with each non-alphanumeric character replaced.
    """
    # A string replacement is treated by re.sub as a template, so a backslash
    # in ``alt`` would raise re.error or be read as a group reference.
    # A callable replacement is inserted verbatim.
    return re.sub(r'[^a-zA-Z0-9]', lambda _match: alt, sentence)

sentence = input("Enter a sentence: ")
replacement = input("Enter a replacement: ")

# Require exactly one character. An empty answer used to pass the check and
# silently delete every non-alphanumeric character instead of replacing it.
if len(replacement) != 1:
    print("Replacement should be a single character")
else:
    result = replace_non_alum(sentence, replacement)
    print(result)

Enter a sentence: this is a boy# with @3 emails
Enter a replacement: $
this$is$a$boy$$with$$3$emails


In [8]:
# 3.1 - split a word into pairs at all possible positions
def split_words(word):
    """List every way to cut ``word`` into two non-empty parts.

    Args:
        word: The string to split.

    Returns:
        A list of ``(left, right)`` tuples, one per cut position from left to
        right. Words shorter than two characters yield an empty list.
    """
    return [(word[:cut], word[cut:]) for cut in range(1, len(word))]

word = "carried"
pairs = split_words(word)
print(pairs)

[('c', 'arried'), ('ca', 'rried'), ('car', 'ried'), ('carr', 'ied'), ('carri', 'ed'), ('carrie', 'd')]


In [9]:
# 3.2 - generate all possible prefixes and suffixes of a given word

def generate(word):
    """Build the prefixes of ``word`` and the remainder left after each one.

    For each cut position ``i`` from 1 to ``len(word)``, the prefix is
    ``word[:i]`` and the paired suffix is ``word[i:]``. The last prefix is
    therefore the whole word and the last suffix is the empty string.

    Args:
        word: The string to decompose.

    Returns:
        A ``(prefixes, suffixes)`` tuple of two new lists of equal length.
    """
    # The lists are created per call. Appending to module-level lists made
    # every extra call (or re-run of the cell) pile results onto earlier ones.
    prefixes = []
    suffixes = []
    for i in range(1, len(word) + 1):
        prefixes.append(word[:i])
        suffixes.append(word[i:])
    return prefixes, suffixes

word = "carried"
pre, suf = generate(word)
# Keep the module-level names the cell has always exposed.
prefixes, suffixes = pre, suf
print('Prefixes: ', prefixes)
print('Suffixes: ', suffixes)

Prefixes:  ['c', 'ca', 'car', 'carr', 'carri', 'carrie', 'carried']
Suffixes:  ['arried', 'rried', 'ried', 'ied', 'ed', 'd', '']


In [11]:
# 3.3 - split a word into two parts at random positions and print them
import random
def splitword(word):
    """Cut ``word`` in two at a randomly chosen interior position.

    Args:
        word: The string to split; it needs at least two characters, since
            the random draw has an empty range otherwise.

    Returns:
        A ``(head, tail)`` tuple of two non-empty strings whose
        concatenation is ``word``.
    """
    # randrange excludes its upper bound, so this draws from 1 .. len(word)-1.
    cut = random.randrange(1, len(word))
    return word[:cut], word[cut:]

word = "carried"
all_splits=[]
for _ in range(10):
    split1, split2 = splitword(word)
    all_splits.append((split1, split2))
print(all_splits)

[('carri', 'ed'), ('ca', 'rried'), ('carri', 'ed'), ('car', 'ried'), ('carri', 'ed'), ('car', 'ried'), ('ca', 'rried'), ('carri', 'ed'), ('carrie', 'd'), ('c', 'arried')]


In [21]:
# 4.1 - find the frequencies of distinct words using n-grams
from collections import Counter
from nltk.util import ngrams

def get_ngram_frequencies(sentence, n):
    """Count how often each n-gram of whitespace-separated words occurs.

    Args:
        sentence: The text to analyse; it is split on whitespace.
        n: The number of consecutive words per n-gram.

    Returns:
        A ``Counter`` keyed by n-gram tuple, in order of first appearance.
    """
    tokens = sentence.split()
    return Counter(ngrams(tokens, n))

sentence = "the cat chased the cat but the mouse ran away from the cat"
n = 2

frequencies = get_ngram_frequencies(sentence, n)

for ngram, freq in frequencies.items():
    print(f"{ngram}: {freq}")

('the', 'cat'): 3
('cat', 'chased'): 1
('chased', 'the'): 1
('cat', 'but'): 1
('but', 'the'): 1
('the', 'mouse'): 1
('mouse', 'ran'): 1
('ran', 'away'): 1
('away', 'from'): 1
('from', 'the'): 1


In [23]:
# 4.2 - calculate the probabilities of each n-gram in sentence
from collections import Counter
from nltk.util import ngrams

def get_ngram_probabilities(sentence, n):
    """Compute the relative frequency of each n-gram in ``sentence``.

    Args:
        sentence: The text to analyse; it is split on whitespace.
        n: The number of consecutive words per n-gram.

    Returns:
        A dict mapping each n-gram tuple to its count divided by the total
        number of n-grams in the sentence. It is empty when the sentence
        yields no n-grams.
    """
    counts = Counter(ngrams(sentence.split(), n))
    total = sum(counts.values())

    shares = {}
    for gram, count in counts.items():
        shares[gram] = count / total
    return shares

sentence = "the cat chased the cat but the mouse ran away from the cat"
n = 2

probabilities = get_ngram_probabilities(sentence, n)

for ngram, prob in probabilities.items():
    print(f"{ngram}: {prob:.4f}")

('the', 'cat'): 0.2500
('cat', 'chased'): 0.0833
('chased', 'the'): 0.0833
('cat', 'but'): 0.0833
('but', 'the'): 0.0833
('the', 'mouse'): 0.0833
('mouse', 'ran'): 0.0833
('ran', 'away'): 0.0833
('away', 'from'): 0.0833
('from', 'the'): 0.0833


In [24]:
# 4.3 - generate n-grams in reverse order
from collections import Counter
from nltk.util import ngrams

def get_reverse_ngrams(sentence, n):
    """Build the n-grams of ``sentence`` read from its last word to its first.

    Args:
        sentence: The text to analyse; it is split on whitespace.
        n: The number of consecutive words per n-gram.

    Returns:
        A list of n-gram tuples taken over the reversed word sequence.
    """
    # Reverse the words first, so each tuple also reads back to front.
    backwards = list(reversed(sentence.split()))
    return list(ngrams(backwards, n))

sentence = "this is a test sentence and this is another test sentence"
n = 2

reverse_ngrams = get_reverse_ngrams(sentence, n)

for ngram in reverse_ngrams:
    print(ngram)

('sentence', 'test')
('test', 'another')
('another', 'is')
('is', 'this')
('this', 'and')
('and', 'sentence')
('sentence', 'test')
('test', 'a')
('a', 'is')
('is', 'this')


In [25]:
# 5.1 - remove digits from a sentence using greedy tokenizer
import re

def remove_digits(sentence):
    """Delete every run of digits from ``sentence``.

    Surrounding whitespace is left as it is, so removing a number that stood
    between two spaces leaves both spaces behind.

    Args:
        sentence: The string to clean.

    Returns:
        The string with all digit runs removed.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', sentence)

sentence = "This is a test sentence with numbers 123 and 4567 in it."
cleaned_sentence = remove_digits(sentence)

print("Original Sentence:", sentence)
print("Cleaned Sentence:", cleaned_sentence)

Original Sentence: This is a test sentence with numbers 123 and 4567 in it.
Cleaned Sentence: This is a test sentence with numbers  and  in it.


In [26]:
# 5.1 (2) - remove extra spaces in cleaned sentence
import re

def remove_digits(sentence):
    """Delete every run of digits, then tidy up the whitespace left behind.

    Args:
        sentence: The string to clean.

    Returns:
        The string with digit runs removed, each run of whitespace collapsed
        to a single space, and leading/trailing whitespace stripped.
    """
    without_digits = re.sub(r'\d+', '', sentence)
    # Removing a number between two spaces leaves a double space; collapse it.
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    return single_spaced.strip()

sentence = "This is a test sentence with numbers 123 and 4567 in it."
cleaned_sentence = remove_digits(sentence)

print("Original Sentence:", sentence)
print("Cleaned Sentence:", cleaned_sentence)

Original Sentence: This is a test sentence with numbers 123 and 4567 in it.
Cleaned Sentence: This is a test sentence with numbers and in it.


In [27]:
# 5.2 - count number of digits in a given sentence
import re

def count_digits(sentence):
    """Count the individual digit characters in ``sentence``.

    Args:
        sentence: The string to inspect.

    Returns:
        The number of digit characters; each digit of a multi-digit number
        counts separately.
    """
    return sum(1 for _ in re.finditer(r'\d', sentence))

sentence = "This is a test sentence with numbers 123 and 4567 in it."
digit_count = count_digits(sentence)

print("Original Sentence:", sentence)
print("Number of digits in the sentence:", digit_count)

Original Sentence: This is a test sentence with numbers 123 and 4567 in it.
Number of digits in the sentence: 7


In [28]:
# 5.3 - extract and print all digits from a sentence using greedy tokenizer
import re

def extract_digits(sentence):
    """Pull out every maximal run of digits from ``sentence``.

    Args:
        sentence: The string to scan.

    Returns:
        A list of digit strings in order of appearance; consecutive digits
        are kept together as one item.
    """
    return [run.group() for run in re.finditer(r'\d+', sentence)]

sentence = "This is a test sentence with numbers 1 123 and 4567 in it."
digits = extract_digits(sentence)

print("Original Sentence:", sentence)
print("Extracted Digits:", digits)

Original Sentence: This is a test sentence with numbers 1 123 and 4567 in it.
Extracted Digits: ['1', '123', '4567']


In [29]:
# 5.4 - program that greedily tokenizes a sentence but prioritizes specific patterns
import re

def greedy_tokenizer(sentence):
    """Split ``sentence`` so that dates and e-mail addresses stay whole.

    Slash-separated dates (``D/M/YY`` up to ``DD/MM/YYYY``) and e-mail
    addresses are emitted as tokens of their own. The text between them is
    emitted unchanged, one token per gap, including any spaces.

    Args:
        sentence: The string to tokenize.

    Returns:
        A list of tokens which, joined together, reproduce ``sentence``.
    """
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4})|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    tokens = []
    cursor = 0  # end of the text already turned into tokens

    for hit in re.finditer(pattern, sentence):
        # Emit the plain text skipped since the previous special token.
        if hit.start() > cursor:
            tokens.append(sentence[cursor:hit.start()])
        tokens.append(hit.group())
        cursor = hit.end()

    # Whatever follows the last special token is one final plain token.
    if cursor < len(sentence):
        tokens.append(sentence[cursor:])

    return tokens

sentence = "Please call me at abc@xyz.com before 12/31/2025."
tokens = greedy_tokenizer(sentence)

print("Tokenized Sentence:", tokens)

Tokenized Sentence: ['Please call me at ', 'abc@xyz.com', ' before ', '12/31/2025', '.']
