# Anonymize restaurant names

This notebook was used to anonymize restaurant names in the Deliverando project.

I followed "A Practical Guide to Anonymizing Datasets with Python & Faker" at https://medium.com/district-data-labs/a-practical-guide-to-anonymizing-datasets-with-python-faker-ecf15114c9be and updated, broadened and automated it with the help of ChatGPT.

In [1]:
import pandas as pd   # work with data frames
from faker import Factory   # to create instance generating random restaurant names
import csv   # to read and write CSV files
from tqdm import tqdm   # to check progress

Import data and save everythng to CSV files with comma searated variables. This is needed for automating the process.

In [2]:
df_deliverando = pd.read_csv('../data/SalesAnalyst_deliverando.csv', sep=';')
df_deliverando.to_csv('../data/deliverando.csv', index=False)

df_competition_month1 = pd.read_excel('../data/SalesAnalyst_Competition.xlsx', sheet_name = 'Month 1')
df_competition_month1.to_csv('../data/comp_month_1.csv', index=False)

df_competition_month2 = pd.read_excel('../data/SalesAnalyst_Competition.xlsx', sheet_name = 'Month 2')
df_competition_month2.to_csv('../data/comp_month_2.csv', index=False)

AI generated lists with words often used in Austrian restaurants. Used later for generation of random names for authenticity.

In [3]:
# Generate a fake Austrian-sounding restaurant name
adjectives = [
    "Alpen", "Wiener", "Kaiser", "Berg", "Mozart", "Hof",
    "Grüne", "Goldene", "Herzog", "Roter", "Blauer", "Gasthof", "Schloss",
    "Wald", "Gemütliche", "Kleine", "Große", "Romantische", "Traditionelle",
    "Grüne", "Bunte", "Himmel", "Holz", "Alte", "Neue", "Wirtshaus",
    "Mühle", "Heimat", "Bergdorf", "Edle", "Stille", "Sonnen", "Wein",
    "Dörfler", "Gebackene", "Bauer", "Stern", "Schöne", "Festliche",
    "Königliche", "Sonnige", "Bäuerliche", "Weinstube", "Fischer",
    "Blumen", "Fröhliche", "Hohe", "Gute", "Ruhige", "Zirben",
    "Edelweiß", "Elegante", "Märchenhafte", "Rustikale", "Rote", "Blaue", "Grüne",
    "Bunte", "Glänzende", "Stilvolle", "Prachtvolle", "Magische", "Ruhige", "Moderne",
    "Traditionsreiche", "Köstliche", "Herzliche", "Exotische", "Rustikale", "Verträumte",
    "Gesellige", "Idyllische", "Romantische", "Ehrwürdige", "Wolkenkratzer", "Maritime",
    "Alpine", "Ländliche", "Winterliche", "Scharfe", "Sanfte", "Nostalgische", "Erfahrene",
    "Herbstliche", "Verträumte", "Einsame", "Gastfreundliche", "Kultivierte", "Goldene",
    "Malerische", "Blaue", "Glanzvolle", "Rote", "Verzauberte", "Geheimnisvolle", "Frische",
    "Rasante", "Silberne", "Rauchige", "Adventurous", "Pittoreske", "Quaint", "Countrified",
    "Legendary", "Opulent", "Bustling"
]

nouns = [
    "Stube", "Hofbräu", "Küche", "Gourmet", "Palast",
    "Wirtshaus", "Brauerei", "Schmankerl", "Dorf", "Alm",
    "Garten", "Stubn", "Hütte", "Ecke", "Platz",
    "Taverne", "Winkel", "Tal", "See", "Hof",
    "Weinstube", "Stadl", "Mühle", "Bauernhof", "Hofladen",
    "Gaststube", "Biergarten", "Keller", "Inn", "Blick",
    "Krug", "Waldhaus", "Burg", "Küche", "Stuben",
    "Haus", "Hofladen", "Tisch", "Scheune", "Platzl",
    "Gasthof", "Eck", "Schmiede", "Kneipe", "Dorfschenke",
    "Stüberl", "Kaminzimmer", "Ort", "Zirbelstube",
    "Bräustüberl", "Felsenkeller", "Almhütte", "Bierstube", "Herrenhaus",
    "Heuriger", "Jausenstation", "Kellerstüberl", "Klause", "Kneipenlokal",
    "Lokal", "Pinte", "Ratskeller", "Rebgarten", "Schanke", "Schenke",
    "Schlösschen", "Speiselokal", "Tanzlokal", "Tanzschuppen", "Weinkeller",
    "Bodega", "Braustube", "Esszimmer", "Festscheune", "Feuerstelle", "Gastgewölbe",
    "Gastzimmer", "Gesellschaftsraum", "Grillplatz", "Herberge", "Ratsstube",
    "Rittersaal", "Salons", "Schänke", "Schankstube", "Schanze", "Schlemmerparadies",
    "Speisesaal", "Südseeparadies", "Tafelspitz", "Tenne", "Verköstigungszimmer",
    "Wanderhütte", "Weinbar", "Weinbistro", "Weinkneipe", "Weinlokal", "Wirt",
    "Zimmer", "Bierbrunnen", "Brauhaus", "Café", "Club", "Dive", "Drinkery",
    "Eatery", "Ginmill", "Grillroom", "Hideaway", "Hangout", "Juke joint", "Pizzeria",
    "Rathskeller", "Soda fountain", "Speakeasy", "Taverna", "Watering hole",
    "Winery"
]

Definition of functions used to generate random Autrian-sounding names and keep them consistent across all files (mapping).

In [4]:
def anonymize_rows(rows, name_mappings):
    """
    Rows is an iterable of dictionaries that contain name and
    email fields that need to be anonymized.
    """
    for row in rows:
        original_name = row['name']
        anonymized_name = name_mappings.get(original_name, original_name)
        # Replace the name field with the generated or mapped restaurant name
        row['name'] = anonymized_name
        yield row

def anonymize(source_files, target_files, adjectives, nouns):
    """
    Anonymize multiple source CSV files and save the anonymized data to multiple target CSV files.
    """
    # Create an overall dictionary to store consistent mappings across all files
    overall_name_mappings = {}
    faker = Factory.create()
    
    for source, target in zip(source_files, target_files):
        with open(source, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            
            # Use tqdm to create a progress bar for the rows in the CSV file
            for row in tqdm(reader, desc=f"Processing {source}"):
                original_name = row['name']
                
                # Check if the original name has been mapped already
                if original_name in overall_name_mappings:
                    anonymized_name = overall_name_mappings[original_name]
                else:
                    # Generate a unique restaurant name
                    restaurant_name = generate_unique_name(adjectives, nouns, overall_name_mappings, faker)
                    overall_name_mappings[original_name] = restaurant_name
                    anonymized_name = restaurant_name
                
                # Replace the name field with the generated or mapped restaurant name
                row['name'] = anonymized_name
                
            # Rewind the CSV file to the beginning
            f.seek(0)
            
            with open(target, 'w', encoding='utf-8', newline='') as o:
                fieldnames = reader.fieldnames
                
                # Create a writer for the target file
                writer = csv.DictWriter(o, fieldnames=fieldnames)
                writer.writeheader()  # Write the header row
                
                # Process and anonymize rows
                anonymized_rows = anonymize_rows(reader, overall_name_mappings)
                
                for row in tqdm(anonymized_rows, desc=f"Writing {target}"):
                    # Write the anonymized row to the target file
                    writer.writerow(row)
    
    # After processing all files, the overall_name_mappings will contain the mapping across all files
    return overall_name_mappings

def generate_unique_name(adjectives, nouns, name_mappings, faker):
    """
    Generate a unique restaurant name by checking against existing mappings.
    """
    while True:
        restaurant_name = faker.random_element(adjectives) + " " + faker.random_element(nouns)
        
        # Check if the generated name is unique within the current mappings
        if restaurant_name not in name_mappings.values():
            return restaurant_name

Executing the renaming, creating new CSV files with anonymized names.

In [11]:
source_files = ['../data/deliverando.csv',
                '../data/comp_month_1.csv',
                '../data/comp_month_2.csv']
target_files = ['../data/deliverando_anonymized.csv',
                '../data/comp_month_1_anonymized.csv',
                '../data/comp_month_2_anonymized.csv']

anonymize(source_files, target_files, adjectives, nouns)

Processing ../data/deliverando.csv: 1648it [00:00, 150724.23it/s]
Writing ../data/deliverando_anonymized.csv: 1649it [00:00, 98725.43it/s]
Processing ../data/comp_month_1.csv: 3207it [00:00, 22624.78it/s]
Writing ../data/comp_month_1_anonymized.csv: 3208it [00:00, 107840.18it/s]
Processing ../data/comp_month_2.csv: 3242it [00:00, 179536.75it/s]
Writing ../data/comp_month_2_anonymized.csv: 3243it [00:00, 115977.97it/s]
