1.Correct the Search Query


In [None]:
import re
import pickle
import zlib
from collections import Counter


# Build corpus from a sample dictionary (you can enhance it with more words)
words = """going to china who was the first president of india winner of the match food in america"""


def words_list(text):
    return re.findall(r'\w+', text.lower())

# Create a frequency dictionary from the words
WORDS = Counter(words_list(words))

# Compress and save the dictionary to a file
with open('compressed_dict.pkl', 'wb') as f:
    compressed = zlib.compress(pickle.dumps(WORDS))
    f.write(compressed)

# Load the dictionary from the file
def load_dictionary():
    with open('compressed_dict.pkl', 'rb') as f:
        return pickle.loads(zlib.decompress(f.read()))

# Generate all words with an edit distance of 1
def edit_distance_one(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'

    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]

    return set(deletes + transposes + replaces + inserts)

# Filter words that are in the dictionary
def known(words, dictionary):
    return set(w for w in words if w in dictionary)

# Generate candidates for the correction
def candidates(word, dictionary):
    return (
        known([word], dictionary) or
        known(edit_distance_one(word), dictionary) or
        [word]
    )

# Find the best correction for a word
def correct_word(word, dictionary):
    return max(candidates(word, dictionary), key=dictionary.get)

# Correct all words in a query
def correct_query(query, dictionary):
    return ' '.join(correct_word(word, dictionary) for word in query.split())

# Main correction function
if __name__ == "__main__":
    dictionary = load_dictionary()
    n = int(input("Enter the number of queries: "))
    queries = [input("Enter query: ").strip() for _ in range(n)]

    for query in queries:
        print(correct_query(query, dictionary))


Enter the number of queries: 2
Enter query: i am ruchi
Enter query: from beta
in am ruchi
from beta


2.Deterministic URL and Hash Tag Segmentation.


In [None]:
import re

# Example dictionary
dictionary = set(["home", "automation", "system", "simplifies", "daily", "routine", "device", "control"])

def is_number(s):
    """Check if the string is a number."""
    try:
        float(s)
        return True
    except ValueError:
        return False

def tokenize(input_string, dictionary):
    """
    Tokenize the input string using the longest match first approach.

    Args:
        input_string: The string to be tokenized.
        dictionary: A set of valid words.

    Returns:
        A list of tokens from the input string.
    """
    length = len(input_string)
    if length == 0:
        return []

    # dp[i] stores the tokens for the substring starting from index i
    dp = [None] * (length + 1)
    dp[0] = []  # Base case: empty string has no tokens

    for i in range(1, length + 1):
        # Consider all possible ending positions for the current substring
        for j in range(i):
            left_part = input_string[j:i]

            # Check if left part is a valid word or number
            if (left_part in dictionary or is_number(left_part)) and (dp[j] is not None):
                # If left part is valid and remaining part has a valid tokenization
                right_part_tokens = dp[j] + [left_part]

                # Choose the longest valid tokenization
                if dp[i] is None or len(right_part_tokens) > len(dp[i]):
                    dp[i] = right_part_tokens

    # Return the tokenization for the entire string if it exists
    return dp[length] if dp[length] is not None else [input_string]

def main():
    """Read input strings, tokenize them, and print the results."""
    num_test_cases = int(input("Enter the number of test cases: "))
    for _ in range(num_test_cases):
        input_string = input("Enter input string: ").strip().lower()

        # Remove www and extensions for domain names, # for hashtags
        if input_string.startswith("www."):
            input_string = input_string[4:].rsplit

3.Disambiguation:Mouse vs Mouse.


In [None]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Training data (sample corpus)
training_sentences = [
    "The complete mouse reference genome was sequenced in 2002.",
    "Tail length varies according to the environmental temperature of the mouse during postnatal development.",
    "A mouse is an input device.",
    "Many mice have a pink tail.",
    "The mouse pointer on the screen helps in navigation.",
    "A rodent like a mouse has sharp teeth.",
    "The mouse was connected to the computer using a USB port.",
    "The house was infested with mice.",
    "Computer users often prefer a wireless mouse."
]

# Labels corresponding to the training sentences
labels = [
    "animal",
    "animal",
    "computer-mouse",
    "animal",
    "computer-mouse",
    "animal",
    "computer-mouse",
    "animal",
    "computer-mouse"
]

# Vectorize the training sentences
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(training_sentences)

# Create and train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, labels)

# Function to predict the type of "mouse"
def predict_mouse_type(sentence):
    """
    Predicts whether the 'mouse' in the sentence refers to an animal or a computer mouse.

    Args:
        sentence: The input sentence.

    Returns:
        "animal" or "computer-mouse"
    """
    vectorized_sentence = vectorizer.transform([sentence])
    prediction = classifier.predict(vectorized_sentence)[0]
    return prediction

# Get number of test cases
num_test_cases = int(input("Enter the number of test cases: "))

# Process each test case
for _ in range(num_test_cases):
    sentence = input("Enter a sentence: ")
    prediction = predict_mouse_type(sentence)
    print(prediction)

# Optionally, save the trained model for later use
with open('mouse_classifier.pkl', 'wb') as f:
    pickle.dump((vectorizer, classifier), f)


Enter the number of test cases: 3
Enter a sentence: the mouse has a long tail.
animal
Enter a sentence: i connected my mouse to laptop.
computer-mouse
Enter a sentence: my friend looks like a mouse.
animal


4.Language Detection.

In [None]:
import pickle
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


def normalize_to_ascii(text):
    """Remove non-ASCII characters and normalize text."""
    return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")


# Step 1: Training Data
training_texts = {
    "English": [
        "The quick brown fox jumps over the lazy dog.",
        "Rip Van Winkle is a story set in the years before the American Revolutionary War.",
        "Hello, how are you today?",
        "It is a wonderful day to learn something new."
    ],
    "French": [
        "Le renard brun rapide saute par-dessus le chien paresseux.",
        "La revolution francaise a marque une periode importante de l'histoire.",
        "Bonjour, comment ça va?",
        "Il est temps de découvrir de nouvelles choses."
    ],
    "German": [
        "Der schnelle braune Fuchs springt über den faulen Hund.",
        "Die deutsche Wiedervereinigung war ein historisches Ereignis.",
        "Hallo, wie geht es dir?",
        "Es ist ein wunderbarer Tag, um etwas Neues zu lernen."
    ],
    "Spanish": [
        "El rapido zorro marron salta sobre el perro perezoso.",
        "La Revolucion Espanola fue un momento clave en la historia.",
        "Hola, ¿cómo estás?",
        "Es un gran día para aprender algo nuevo."
    ],
}


# Normalize training data to ASCII
labels = []
texts = []

for language, samples in training_texts.items():
    labels.extend([language] * len(samples))
    texts.extend([normalize_to_ascii(sample) for sample in samples])


# Step 2: Preprocessing and Feature Extraction
vectorizer = TfidfVectorizer(ngram_range=(2, 4), analyzer="char")
X_train = vectorizer.fit_transform(texts)


# Step 3: Train the Model
classifier = MultinomialNB()
classifier.fit(X_train, labels)


# Step 4: Serialize the Model
with open("language_model.pkl", "wb") as model_file:
    pickle.dump((vectorizer, classifier), model_file)


# Step 5: Language Detection Function
def detect_language(snippet):
    with open("language_model.pkl", "rb") as model_file:
        vectorizer, classifier = pickle.load(model_file)

    # Normalize snippet to ASCII
    snippet = normalize_to_ascii(snippet)
    X_test = vectorizer.transform([snippet])
    prediction = classifier.predict(X_test)
    return prediction[0]


# Input Processing (Single-line input)
if __name__ == "__main__":
    # Get user input
    snippet = input("Enter a text snippet to detect the language: ")

    # Predict and Output
    detected_language = detect_language(snippet.strip())
    print(f"Detected language: {detected_language}")


Enter a text snippet to detect the language:  El rápido zorro marrón salta sobre el perro perezoso
Detected language: Spanish


5.The missing Apostrophes.


In [None]:
import re

# Function to handle apostrophes for contractions and possessives
def restore_apostrophes(text):
    restored_text = []
    words = text.split()

    for word in words:
        lower_word = word.lower()

        # Handle contractions
        if lower_word == "dont":
            restored_text.append("don't")
        elif lower_word == "wont":
            restored_text.append("won't")
        elif lower_word == "cant":
            restored_text.append("can't")
        elif lower_word == "isnt":
            restored_text.append("isn't")
        elif lower_word == "arent":
            restored_text.append("aren't")
        elif lower_word == "wasnt":
            restored_text.append("wasn't")
        elif lower_word == "werent":
            restored_text.append("weren't")
        elif lower_word == "hasnt":
            restored_text.append("hasn't")
        elif lower_word == "havent":
            restored_text.append("haven't")
        elif lower_word == "hadnt":
            restored_text.append("hadn't")
        elif lower_word == "didnt":
            restored_text.append("didn't")
        elif lower_word == "ive":
            restored_text.append("I've")
        elif lower_word == "were":
            restored_text.append("we're")
        elif lower_word == "i":
            restored_text.append("I")
        elif lower_word == "id":
            restored_text.append("I'd")
        elif lower_word == "youve":
            restored_text.append("you've")
        elif lower_word == "hes":
            restored_text.append("he's")
        elif lower_word == "shes":
            restored_text.append("she's")
        elif lower_word == "its":
            restored_text.append("it's")
        elif lower_word == "were":
            restored_text.append("we're")

        # Handle possessives (only add 's when it makes sense)
        elif re.match(r'\w+s$', word) and lower_word not in ["its", "hers", "ours", "yours", "theirs"]:
            restored_text.append(re.sub(r"s$", "'s", word))

        # For normal words that don't need apostrophes, keep them as is
        else:
            restored_text.append(word)

    return " ".join(restored_text)


# Input
input_text = """At a news conference Thursday at the Russian manned-space facility in Baikonur, Kazakhstan, Kornienko said "we will be missing nature, we will be missing landscapes, woods." He admitted that on his previous trip into space in 2010 "I even asked our psychological support folks to send me a calendar with photographs of nature, of rivers, of woods, of lakes."
Kelly was asked if hed miss his twin brother Mark, who also was an astronaut.

"Were used to this kind of thing," he said. "Ive gone longer without seeing him and it was great."
The mission wont be the longest time that a human has spent in space - four Russians spent a year or more aboard the Soviet-built Mir space station in the 1990s.
SCI Astronaut Twins
Scott Kelly (left) was asked Thursday if hed miss his twin brother, Mark, who also was an astronaut. Were used to this kind of thing, he said. Ive gone longer without seeing him and it was great. (NASA/Associated Press)
"The last time we had such a long duration flight was almost 20 years and of course al{-truncated-}"""

# Restore apostrophes
output_text = restore_apostrophes(input_text)
print(output_text)


At a new's conference Thursday at the Russian manned-space facility in Baikonur, Kazakhstan, Kornienko said "we will be missing nature, we will be missing landscapes, woods." He admitted that on hi's previou's trip into space in 2010 "I even asked our psychological support folk's to send me a calendar with photograph's of nature, of rivers, of woods, of lakes." Kelly wa's asked if hed mis's hi's twin brother Mark, who also wa's an astronaut. "Were used to thi's kind of thing," he said. "Ive gone longer without seeing him and it wa's great." The mission won't be the longest time that a human ha's spent in space - four Russian's spent a year or more aboard the Soviet-built Mir space station in the 1990s. SCI Astronaut Twin's Scott Kelly (left) wa's asked Thursday if hed mis's hi's twin brother, Mark, who also wa's an astronaut. we're used to thi's kind of thing, he said. I've gone longer without seeing him and it wa's great. (NASA/Associated Press) "The last time we had such a long durat

6.Segment the Twitter Hashtags.

In [None]:
# Define a function that segments a single hashtag into words
def segment_hashtag(hashtag, word_dict):
    n = len(hashtag)
    dp = [None] * (n + 1)

    dp[0] = []  # Base case: empty string can be segmented as an empty list

    # Iterate over the hashtag string
    for i in range(1, n + 1):
        for j in range(max(0, i - 20), i):  # Limit the length of words checked
            word = hashtag[j:i]
            if word in word_dict and dp[j] is not None:
                dp[i] = dp[j] + [word]
                break

    return " ".join(dp[n]) if dp[n] is not None else hashtag


# Main function to process input and output results
def process_hashtags(num_hashtags, hashtags, word_dict):
    result = []
    for hashtag in hashtags:
        segmented = segment_hashtag(hashtag, word_dict)
        result.append(segmented)
    return result


# Sample dictionary of common words (expand this as needed)
word_dict = {
    "we", "are", "the", "people", "mention", "your", "faves",
    "now", "playing", "walking", "dead", "follow", "me"
}


# Sample input
num_hashtags = int(input("Enter the number of hashtags: "))
hashtags = [input(f"Enter hashtag {i + 1}: ").strip() for i in range(num_hashtags)]


# Process the hashtags and print the result
segmented_hashtags = process_hashtags(num_hashtags, hashtags, word_dict)
for segmented in segmented_hashtags:
    print(segmented)


Enter the number of hashtags: 2
Enter hashtag 1: wearethepeople
Enter hashtag 2:  playingwalkingdead
we are the people
playing walking dead


7.Expand the Acronyms.


In [None]:
import re

def extract_acronyms_and_expansions(snippets):
    """
    Extract acronyms and their expansions from the provided snippets.
    """
    acronym_dict = {}

    for snippet in snippets:
        print(f"Processing snippet: {snippet}")

        # 1. Find all potential acronyms (uppercase words typically enclosed in parentheses)
        matches = re.findall(r'\(([^)]+)\)', snippet)  # Capture everything inside parentheses
        print(f"Found acronyms in parentheses: {matches}")

        for match in matches:
            # Split the match by spaces to capture the acronym and the expansion
            acronym_expansion = match.split(' ', 1)
            if len(acronym_expansion) == 2:  # If we have both acronym and expansion
                acronym = acronym_expansion[0].strip()
                expansion = acronym_expansion[1].strip()
                acronym_dict[acronym] = expansion
                print(f"Captured acronym-expansion pair: {acronym} -> {expansion}")

        # 2. Handle acronyms not in parentheses but defined explicitly (case-sensitive)
        words = snippet.split()
        for i, word in enumerate(words):
            if word.isupper() and len(word) > 1:  # Likely an acronym
                if word not in acronym_dict:
                    # Try to extract its expansion from the surrounding context
                    preceding_context = " ".join(words[max(0, i-5):i])
                    if preceding_context:
                        acronym_dict[word] = preceding_context.strip()
                        print(f"Captured explicit expansion for {word}: {preceding_context}")

    return acronym_dict


def process_tests(acronym_dict, tests):
    """
    Process test acronyms and return their expansions.
    """
    results = []

    for test in tests:
        # Normalize the test acronym (case insensitive)
        expansion = acronym_dict.get(test.upper(), "Not Found")
        print(f"Processing test acronym: {test}, found expansion: {expansion}")
        results.append(expansion)

    return results


def main():
    # Read input
    n = int(input("Enter number of snippets: ").strip())

    snippets = [input(f"Enter snippet {i + 1}: ").strip() for i in range(n)]
    tests = [input(f"Enter test acronym {i + 1}: ").strip() for i in range(n)]

    # Extract acronyms and expansions
    acronym_dict = extract_acronyms_and_expansions(snippets)

    # Process test queries
    results = process_tests(acronym_dict, tests)

    # Output results
    print("\nResults:")
    for result in results:
        print(result)


if __name__ == "__main__":
    main()


Enter number of snippets: 1
Enter snippet 1: The system of Local Area Network (LAN) allows communication.
Enter test acronym 1:  LAN
Processing snippet: The system of Local Area Network (LAN) allows communication.
Found acronyms in parentheses: ['LAN']
Captured explicit expansion for (LAN): system of Local Area Network
Processing test acronym: LAN, found expansion: Not Found

Results:
Not Found


8.Correct the Search Query.

In [None]:
import re
import pickle
import zlib
from collections import Counter


# Build corpus from a sample dictionary (you can enhance it with more words)
words = """going to china who was the first president of india winner of the match food in america"""

# Function to return a list of words from the text
def words_list(text):
    return re.findall(r'\w+', text.lower())

# Count words in the corpus
WORDS = Counter(words_list(words))

# Compression for large wordlist
with open('compressed_dict.pkl', 'wb') as f:
    compressed = zlib.compress(pickle.dumps(WORDS))
    f.write(compressed)

# Load dictionary in memory
def load_dictionary():
    with open('compressed_dict.pkl', 'rb') as f:
        return pickle.loads(zlib.decompress(f.read()))

# Generate a set of possible words with a single edit distance
def edit_distance_one(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'

    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]

    return set(deletes + transposes + replaces + inserts)

# Known words in the dictionary
def known(words, dictionary):
    return set(w for w in words if w in dictionary)

# Candidates for correction based on edit distance
def candidates(word, dictionary):
    return (known([word], dictionary) or known(edit_distance_one(word), dictionary) or [word])

# Correct a single word
def correct_word(word, dictionary):
    return max(candidates(word, dictionary), key=dictionary.get)

# Correct a whole query
def correct_query(query, dictionary):
    return ' '.join(correct_word(word, dictionary) for word in query.split())

# Main correction function
if __name__ == "__main__":
    # Load the dictionary
    dictionary = load_dictionary()

    # Input number of queries
    n = int(input("Enter the number of queries: ").strip())

    # Input queries
    queries = [input(f"Enter query {i+1}: ").strip() for i in range(n)]

    # Process each query and correct it
    for query in queries:
        print(correct_query(query, dictionary))


Enter the number of queries: 2
Enter query 1: goin to china
Enter query 2: winer of the match
going to china
winner of the match


9.AText-Processing Wramup.

In [None]:
import re

def count_articles_and_dates(fragment):
    """
    Count occurrences of 'a', 'an', 'the', and valid dates in a given text fragment.
    """
    # Normalize text for article counting
    lower_fragment = fragment.lower()

    # Count articles (simplified to handle punctuation better)
    a_count = len(re.findall(r'\ba\b', lower_fragment))
    an_count = len(re.findall(r'\ban\b', lower_fragment))
    the_count = len(re.findall(r'\bthe\b', lower_fragment))

    # Debugging: Print counts of articles
    print(f"Fragment: {fragment}")
    print(f"a_count: {a_count}, an_count: {an_count}, the_count: {the_count}")

    # Identify valid dates
    date_patterns = [
        # Day Month Year (e.g., 5th January 2023)
        r'\b\d{1,2}(?:st|nd|rd|th)?(?:\s+of)?\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{2,4}\b',

        # Month Day Year (e.g., January 15, 2021)
        r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd)?(?:,?)?\s+\d{2,4}\b',

        # Day/Month/Year (e.g., 12/02/2021)
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # Day/Month/Year

        # ISO format: Year-Month-Day (e.g., 2021-12-15)
        r'\b\d{4}-\d{2}-\d{2}\b'  # ISO format: Year-Month-Day
    ]

    # Combine all date patterns
    date_regex = '|'.join(date_patterns)
    dates = re.findall(date_regex, fragment, re.IGNORECASE)

    # Debugging: Print found dates
    print(f"Found dates: {dates}")

    date_count = len(dates)

    return a_count, an_count, the_count, date_count

def main():
    # Directly simulate input for testing (use your own test cases here)
    data = """2
I went to the park on 5th January 2023.
She arrived on 12/02/2021 and met John on January 15, 2021."""

    data = data.strip().split("\n")

    # Proceed if data is not empty
    if not data:
        print("Error: No input data provided")
        return

    try:
        t = int(data[0].strip())  # Number of test cases
    except ValueError:
        print("Error: Invalid number of test cases")
        return

    fragments = data[1:]  # Remaining lines contain the fragments

    if len(fragments) != t:
        print(f"Error: Expected {t} fragments, but got {len(fragments)}")
        return

    results = []
    for i in range(t):
        fragment = fragments[i].strip()  # Count articles and dates
        a_count, an_count, the_count, date_count = count_articles_and_dates(fragment)
        results.append(f"{a_count}\n{an_count}\n{the_count}\n{date_count}")

    # Output results
    print("\n".join(results))

if __name__ == "__main__":
    main()


Fragment: I went to the park on 5th January 2023.
a_count: 0, an_count: 0, the_count: 1
Found dates: [('January', '')]
Fragment: She arrived on 12/02/2021 and met John on January 15, 2021.
a_count: 0, an_count: 0, the_count: 0
Found dates: [('', ''), ('', 'January')]
0
0
1
1
0
0
0
2


10.Who is it?

In [None]:
import re

def resolve_pronouns(text, entities):
    """
    Resolves pronouns in the text based on provided entities.
    """
    # Extract all pronouns and their positions
    pronoun_pattern = r'\b(\w+)\b'  # Updated pattern to match any word
    pronouns = [(match.group(1), match.start()) for match in re.finditer(pronoun_pattern, text)]

    # Clean the text by removing ** markers (if any)
    clean_text = re.sub(r'\*\*(\w+)\*\*', r'\1', text)

    # Initialize a list to store the resolved entities
    resolved = []

    # For each pronoun, find the corresponding entity
    for pronoun, pos in pronouns:
        closest_entity = None
        closest_distance = float('inf')

        # Iterate through all entities to find the best match for the pronoun
        for entity in entities:
            entity_pos = clean_text.rfind(entity, 0, pos)  # Find the last occurrence of the entity before the pronoun
            if entity_pos != -1:
                distance = pos - (entity_pos + len(entity))
                if distance < closest_distance:
                    closest_distance = distance
                    closest_entity = entity

        # Append the resolved entity to the list
        if closest_entity:
            resolved.append(closest_entity)

    return resolved

def main():
    # Read input interactively
    print("Enter number of text snippets:")
    n = int(input().strip())  # Read number of snippets

    print(f"Enter {n} lines of text:")
    text_snippet = " ".join(input().strip() for _ in range(n))  # Read text snippet lines

    print("Enter entities (separated by ';'):")
    entities = [e.strip() for e in input().strip().split(';')]  # Read list of entities

    # Resolve pronouns
    result = resolve_pronouns(text_snippet, entities)

    # Output the resolved entities
    for entity in result:
        print(entity)

if __name__ == "__main__":
    main()


Enter number of text snippets:
2
Enter 2 lines of text:
John and Mary went to the park. She gave him a gift.
She gave him a gift
Enter entities (separated by ';'):
John; Mary
John
John
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
