# Worksheet 2

In [1]:
import re

In [2]:
# German ZIP codes

txt = """Max Mustermann
Musterstraße 12
10115 Berlin
GERMANY
"""

x = re.findall(r"[0-9]{5}", txt)

x

['10115']

In [3]:
# Academic titles (Prof., Prof, Dr., Dr, , B.Sc., , BSc, , M.Sc., , MSc, , B.A., ...)

txt = """
Prof., Prof, Dr., Dr, , B.Sc., , BSc, , M.Sc., , MSc, , B.A., ...
"""

x = re.findall(r"[A-Z]+\.?[A-Z]?[a-z]{0,3}[.,]", txt)

x

['Prof.', 'Prof,', 'Dr.', 'Dr,', 'B.Sc.', 'BSc,', 'M.Sc.', 'MSc,', 'B.A.']

In [4]:
# Test whether a binary string (i.e. a string that just contains 0s and 1s) 
# contains the substring “01” and “10” an equal number of times

txt = """
01100110011001100110011010110
"""

x = re.findall(r"01", txt)
y = re.findall(r"10", txt)

len(x) == len(y)

True

## 3. Minimum Distance Algorithm

In [5]:
def sub_cost(val1, val2):
    if val1 == val2:
        return 0
    else:
        return 1

In [6]:
def min_edit_distance(source, target, del_cost = 1, ins_cost = 1):
    n = len(source)
    m = len(target)
    d = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
        
    # Filling up the first column and row of the matrix
    for i in range(1, n + 1):
        d[i][0] = d[i-1][0] + del_cost
    for j in range(1, m + 1):
        d[0][j] = d[0][j-1] + ins_cost
        
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d[i][j] = min(
                d[i-1][j] + del_cost,  # Deletion
                d[i][j-1] + ins_cost,  # Insertion
                d[i-1][j-1] + sub_cost(source[i-1], target[j-1])  # Substitution
            )
            
    return d[n][m]

In [7]:
min_edit_distance("kitten", "sitting")

3

In [8]:
min_edit_distance("kitten", "kit")

3

In [9]:
min_edit_distance("bag", "raggedy")

5

In [10]:
min_edit_distance("color", "colour")

1

In [11]:
min_edit_distance("color", "colour", del_cost=2, ins_cost=2)

2

## 5. Stemming

In [12]:
sentences = [
    "I own multiple cats.",
    "Marburg is a nice city.",
    "It's fairly cold in here."
]

In [13]:
from nltk.tokenize import word_tokenize

tokenized_sentences = [word_tokenize(sentence, language='english', preserve_line=False) for sentence in sentences]

In [14]:
tokenized_sentences

[['I', 'own', 'multiple', 'cats', '.'],
 ['Marburg', 'is', 'a', 'nice', 'city', '.'],
 ['It', "'s", 'fairly', 'cold', 'in', 'here', '.']]

In [15]:
from nltk.stem import *

In [16]:
from nltk.stem.porter import *
porter_stemmer = PorterStemmer()

In [17]:
porter_stemmed = [(" ").join(porter_stemmer.stem(word) for word in sentence) for sentence in tokenized_sentences]

In [18]:
porter_stemmed

['i own multipl cat .',
 'marburg is a nice citi .',
 "it 's fairli cold in here ."]

In [19]:
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")

In [20]:
snowball_stemmed = [(" ").join(snowball_stemmer.stem(word) for word in sentence) for sentence in tokenized_sentences]

In [21]:
snowball_stemmed

['i own multipl cat .',
 'marburg is a nice citi .',
 "it 's fair cold in here ."]

In [22]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [23]:
lemmatized_tokens = [(" ").join(wnl.lemmatize(word) for word in sentence) for sentence in tokenized_sentences]

In [24]:
lemmatized_tokens

['I own multiple cat .',
 'Marburg is a nice city .',
 "It 's fairly cold in here ."]

In [25]:
from nltk.tag import pos_tag

pos_tagged = pos_tag(word_tokenize(sentences[0]))
pos_tagged

[('I', 'PRP'), ('own', 'VBP'), ('multiple', 'JJ'), ('cats', 'NNS'), ('.', '.')]

In [26]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [27]:
lemmatized_tokens_with_tags = [[wnl.lemmatize(word, pos=get_wordnet_pos(word)) for word in sentence] for sentence in tokenized_sentences]
lemmatized_tokens_with_tags

[['I', 'own', 'multiple', 'cat', '.'],
 ['Marburg', 'is', 'a', 'nice', 'city', '.'],
 ['It', "'s", 'fairly', 'cold', 'in', 'here', '.']]

## 6. Sentiment Analysis

In [35]:
from textblob import TextBlob

text = """
We waited one hour in line to check in for our flight. Juuust great, exactly what you need to start of your vacation!
"""

blob = TextBlob(text)

In [36]:
blob.tags

[('We', 'PRP'),
 ('waited', 'VBD'),
 ('one', 'CD'),
 ('hour', 'NN'),
 ('in', 'IN'),
 ('line', 'NN'),
 ('to', 'TO'),
 ('check', 'VB'),
 ('in', 'IN'),
 ('for', 'IN'),
 ('our', 'PRP$'),
 ('flight', 'NN'),
 ('Juuust', 'NNP'),
 ('great', 'JJ'),
 ('exactly', 'RB'),
 ('what', 'WP'),
 ('you', 'PRP'),
 ('need', 'VBP'),
 ('to', 'TO'),
 ('start', 'VB'),
 ('of', 'IN'),
 ('your', 'PRP$'),
 ('vacation', 'NN')]

In [37]:
blob.noun_phrases

WordList(['juuust'])

In [38]:
for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

0.0
0.55625


## LLMs

### Prompt

Analyze the sentiment of this sentence : We waited one hour in line to check in for our flight. Juuust great, exactly what you need to start of your vacation!

#### Meta Llama 3.1 8B Instruct

The sentiment of this sentence is overwhelmingly negative. The speaker is expressing frustration and annoyance at having to wait in line for an hour to check in for their flight, which is supposed to be a relaxing and enjoyable experience, especially at the beginning of a vacation.

The use of the phrase "Juuust great" is a sarcastic expression of disappointment and sarcasm, implying that waiting in line for an hour is not a great way to start a vacation. The tone is also somewhat ironic, as the speaker is highlighting the opposite of what one would expect to experience at the beginning of a vacation, which is typically meant to be a time of relaxation and enjoyment.

#### OpenAI GPT-4.1 (External)

The sentiment of the sentence is negative, with a strong tone of sarcasm. The literal description ("We waited one hour in line to check in for our flight") reports an inconvenient circumstance, and the follow-up ("Juuust great, exactly what you need to start off your vacation!") uses exaggerated language and sarcasm to highlight the speaker's dissatisfaction and frustration. The speaker is clearly unhappy with the situation and is using sarcasm to express their annoyance.