# Fuzzy Search, Prefix Edit Dist, Q-Gram Index

## About the data

In [318]:
import pandas as pd

movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,title,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


## Levenshtein distance

In [319]:
import numpy as np

def levenshtein_dist(x: str, y: str) -> int:
    x, y = '$' + x, '$' + y
    mat = np.zeros((len(x), len(y)), dtype=int)
    for i in range(0, len(x)): mat[i][0] = i
    for j in range(0, len(y)): mat[0][j] = j
    for i in range(1, len(x)):
        for j in range(1, len(y)):
            if x[i] != y[j]:
                mat[i][j] = min(mat[i-1][j], mat[i-1][j-1], mat[i][j-1]) + 1
            else: 
                mat[i][j] = min(mat[i-1][j], mat[i-1][j-1], mat[i][j-1])
    return mat[len(x)-1][len(y)-1]

In [320]:
print(f"levenshtein_dist(hello, hallo) = {levenshtein_dist('hello', 'hallo')}")
print(f"levenshtein_dist(kid, kind) = {levenshtein_dist('kid', 'kind')}")
print(f"levenshtein_dist(cat, wildcat) = {levenshtein_dist('cat', 'wildcat')}")

levenshtein_dist(hello, hallo) = 1
levenshtein_dist(kid, kind) = 1
levenshtein_dist(cat, wildcat) = 4


## Q-gramms

Definition: Let x, y be two strings.



In [321]:
def compute_qgram(x: str, q: int, padding: int = 3) -> list[str]:
    # Add padding to x. 
    x_padded = padding * '$' + x + padding * '$'
    qgram = [x_padded[i:i+q] for i in range(0, len(x_padded)) if len(x_padded[i:i+q]) == q] 
    return qgram

In [322]:
print(compute_qgram('freiburg', q=3, padding=0))
print(compute_qgram('freiburg', q=3, padding=3))

['fre', 'rei', 'eib', 'ibu', 'bur', 'urg']
['$$$', '$$f', '$fr', 'fre', 'rei', 'eib', 'ibu', 'bur', 'urg', 'rg$', 'g$$', '$$$']


In [323]:
import pandas as pd
import re
from collections import Counter, defaultdict

class QGramIndex:
    def __init__(self, q: int = 3, padding: int = 2) -> None:
        # word: -> [(doc_id, frequency)] 
        self.inverted_lists: dict[str: list[tuple[int, int]]] = {}
        
        self.names: list[str]= [] 
        self.q = q
        self.padding = padding 

    def get_words(self, text: str) -> list[str]:
        """
        >>> text = 'You are... awesome!'
        >>> words = get_words(text)
        >>> words
        ['you', 'are', 'awesome']
        """
        WORD_PATTERN = '[a-zA-Z]+'
        return re.findall(WORD_PATTERN, str(text).lower())

    def build_from_file(self, file_name: str) -> None:
        # Interprete each line as a document.
        df = pd.read_csv(file_name)
        doc_id: int = 1
        for _, row in df.iterrows():
            row_data = {'title': row['title'],
                        'date_x': row['date_x'],
                        'score': row['score'], 
                        'genre': row['genre'], 
                        'overview': row['overview'],
                        'crew': row['crew']
                        }
            # title, date, score, genre ,overview, crew = self.get_words(row_data['title']), row_data['date_x'], row_data['score'], self.get_words(row_data['genre']), self.get_words(row_data['overview']), self.get_words(row_data['crew'])
            title = self.get_words(row_data['title'])
            # for word in title + overview + genre + crew:
            for word in title:
                for qgram in compute_qgram(word, self.q, self.padding):
                    if qgram not in self.inverted_lists:
                        self.inverted_lists[qgram] = [(doc_id, 1)]
                    else: 
                        if self.inverted_lists[qgram][-1][0] == doc_id: 
                            new_score = self.inverted_lists[qgram][-1][1] + 1
                            self.inverted_lists[qgram][-1] = (doc_id, new_score)
                        else:
                            self.inverted_lists[qgram].append((doc_id, 1))
            self.names.append(row['title'])
            doc_id += 1

    def intersect(self, query: str, threshold: int = 10) -> list:
        query_qgrams = Counter(compute_qgram(query, self.q, self.padding))
        inter = defaultdict(int)

        for qgram, count in query_qgrams.items():
            if qgram in self.inverted_lists:
                for doc_id, freq in self.inverted_lists[qgram]:
                    inter[doc_id] += min(count, freq)  # Zählt die Häufigkeit der Übereinstimmung
        possible_matches = sorted(inter.items(), key=lambda x: x[1], reverse=True)
        
        matches:list = []
        for doc_id, freq in possible_matches:
            levenshtein = levenshtein_dist(query, self.names[doc_id-1])
            if levenshtein < threshold:
                matches.append((doc_id, freq))
        return matches

    def generate_output(self, query: str, output_threshold: int = 15) -> None:
        query_res = self.intersect(query)
        print(f'Query: {query}')
        for i, (doc_id, freq) in enumerate(query_res):
            print(f"Title: {self.names[doc_id-1]}\t Match: {freq}")
            # print(self.names[doc_id-1])
            if i > output_threshold:
                break


### Query: ion man
#### Wanted: iron man

In [324]:
SearchEngine = QGramIndex(q=3, padding=2)
SearchEngine.build_from_file('movies.csv')
SearchEngine.generate_output('ion man', output_threshold=10)

Query: ion man
Title: Irrational Man	 Match: 5
Title: Iron Man 2	 Match: 4
Title: Iron Man 3	 Match: 4
Title: Iron Man	 Match: 4
Title: The Irishman	 Match: 4
Title: Indecent Woman	 Match: 4
Title: Ip Man 3	 Match: 4
Title: Inside Man	 Match: 4
Title: Superman II	 Match: 4
Title: The Iceman	 Match: 4
Title: Superman III	 Match: 4
Title: Inhuman Kiss	 Match: 4


### Query: arangers
#### Wanted: powerrangers

In [325]:
SearchEngine.generate_output('arangers')

Query: stsar was
Title: The Strays	 Match: 3
Title: San Andreas	 Match: 3
Title: Star Wars	 Match: 3
Title: Strays	 Match: 3
Title: Storks	 Match: 3
Title: Sarah's Key	 Match: 3
Title: Shottas	 Match: 3
Title: Street Kings	 Match: 3
Title: Stasis	 Match: 3
Title: Status Update	 Match: 3
Title: 100 Streets	 Match: 3
Title: Straw Dogs	 Match: 3
Title: Strange Days	 Match: 3
Title: Stripes	 Match: 3
Title: St. Trinian's	 Match: 3
Title: Stitches	 Match: 3
Title: Stand Up Guys	 Match: 3
