# Build social distancing data from tweets

Required fields:

1. tweet id
2. user id
3. time stamp
4. municipality
5. stance with respect to social distancing

## 1. Select social distancing tweets

In [1]:
import os
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output

In [2]:
# Name of the column that holds the tweet text in the input files
TEXT = "text"
# Directory that is scanned for tweet files
BASE_DIR = "../data/text/"
# Only files whose names start with 20201113 .. 20201116 are processed
FILE_PATTERN = "^(2020111[3-6])"
# Regular expression (applied case-insensitively) that selects tweets about
# keeping distance: "1.5 m", "afstand ... hou", "hou ... afstand", "anderhalve meter"
DISTANCE_QUERY = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"

In [3]:
def squeal(text=None):
    """Replace the output of the current cell by `text`; only clear it when `text` is None."""
    clear_output(wait=True)
    if text is None:
        return
    print(text)


def cleanup(text):
    """Normalise the white space of a tweet and remove its https links.

    Escaped newlines (a backslash followed by "n") become spaces, https URLs
    are deleted, runs of white space collapse to one space and the result is
    stripped.
    """
    substitutions = (
        (r"\\n", " "),
        (r"https://\S+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def tokenize(text):
    """Split `text` with NLTK's TweetTokenizer and join the tokens with single spaces."""
    tokens = TweetTokenizer().tokenize(text)
    return " ".join(tokens)


def preprocess(text):
    """Return the cleaned, tokenized and lower-cased version of a tweet text."""
    cleaned = cleanup(text)
    tokenized = tokenize(cleaned)
    return tokenized.lower()

In [4]:
# Collect every tweet that matches DISTANCE_QUERY from the files selected by
# FILE_PATTERN, together with its preprocessed text (input for the classifier).
query = DISTANCE_QUERY
files = sorted(os.listdir(BASE_DIR))
matched_frames = []
preprocessed_texts = []
for file_name in files:
    if not re.search(FILE_PATTERN, file_name):
        continue
    squeal(file_name)
    file_data = pd.read_csv(os.path.join(BASE_DIR, file_name))
    # na=False: rows without text count as "no match"; without it a missing
    # text puts NaN in the mask and the boolean selection raises
    matched_text = file_data[file_data[TEXT].str.contains(query, case=False, na=False)]
    preprocessed_texts.extend(matched_text[TEXT].apply(preprocess))
    matched_frames.append(matched_text)

# Concatenate once: growing the frame inside the loop copies all earlier rows
# again for every file. The index is always renumbered, so row i of results_df
# belongs to preprocessed_texts[i].
if matched_frames:
    results_df = pd.concat(matched_frames, ignore_index=True)
else:
    results_df = pd.DataFrame({})

20201116-23.out.gz


In [5]:
# Sanity check: there should be one preprocessed text per selected tweet
len(results_df), len(preprocessed_texts)

(4250, 4250)

In [6]:
# Intermediate CSV file. FILE_PATTERN is inserted verbatim, so the file name
# contains the regex characters of the pattern ("^", "(", "[", ...).
DATA_FILE = f"csv/social_distancing_results_df_{FILE_PATTERN}.csv"

# Save all selected tweets; the index is written as an unnamed first column
results_df.to_csv(DATA_FILE)

## 2. Label tweets

In [7]:
import fasttext

In [8]:
# Topic handled by this notebook; it is part of the model file name
DISTANCE = "distance"
TOPIC = DISTANCE
# Values that identify the model file; presumably the fastText settings
# (dimension, epochs, learning rate) the model was trained with - TODO confirm
BESTDIM = 200
BESTEPOCH = 200
BESTLR = 0.2
MODELFILE = f"model-{TOPIC}-{BESTDIM}-{BESTEPOCH}-{BESTLR}.bin"

In [9]:
# Load the fastText model stored in MODELFILE
model = fasttext.load_model(MODELFILE)



In [10]:
# Classify all preprocessed tweets at once; the next cell reads the predicted
# labels from labels[0], one sequence of labels per tweet
labels = model.predict(preprocessed_texts)

In [11]:
# Store the first predicted label of every tweet without its "__label__" prefix
results_df["label"] = [
    re.sub("^__label__", "", tweet_labels[0])
    for tweet_labels in labels[0]
]

In [12]:
# Drop the tweets labelled IRRELEVANT; the copy makes the result independent of results_df
relevant_results_df = results_df.loc[
    ~results_df["label"].isin(["IRRELEVANT"])
].copy()

In [13]:
# Number of tweets that remain after removing the irrelevant ones
len(relevant_results_df)

2606

In [14]:
# Overwrite DATA_FILE (which held all selected tweets) with the relevant tweets only
relevant_results_df.to_csv(DATA_FILE)

In [15]:
# Read the relevant tweets back from disk. The saved index returns as the
# column "Unnamed: 0", which the export section drops again.
# NOTE(review): id_str is presumably parsed as an integer here, which the
# integer-keyed lookup in section 3 relies on - confirm.
relevant_results_df = pd.read_csv(DATA_FILE)

In [16]:
# Show the first row to inspect the available columns
relevant_results_df.iloc[0]

Unnamed: 0                                                                   1
id_str                                                     1327024788330729472
in_reply_to_status_id_str                                          1.32702e+18
user                                                               Sylly_itsme
verified                                                                   NaN
text                         @SweetLakeB @Erikkokkie @ineke_at @dennisvdber...
location                                                                   NaN
label                                                                 SUPPORTS
Name: 0, dtype: object

## 3. Get user id and time stamp from json files

In [17]:
import gzip
import json
import sys

In [18]:
# Root directory of the raw tweet JSON files, with one sub directory per month.
# The environment variable TWITTER_JSON_DIR overrides the location; without it
# the original path is used. os.path.join(..., "") guarantees the trailing
# separator that the string concatenation BASE_DIR_JSON + month relies on.
BASE_DIR_JSON = os.path.join(
    os.environ.get("TWITTER_JSON_DIR", "/home/erikt/media/20190525/files/cloud/twitter/"),
    "",
)

In [19]:
# Look up the user id and the creation time of every relevant tweet in the raw
# JSON files. tweet_data_keep maps the tweet id (int) to
# {"user_id": ..., "created_at": ...}.
wanted_ids = set(relevant_results_df['id_str'])
tweet_data_keep = {}
for month in "202011".split():
    files = sorted(os.listdir(BASE_DIR_JSON + month))
    for file_name in files:
        if not re.search(FILE_PATTERN, file_name):
            continue
        squeal(file_name)
        # the context manager also closes the file when a line cannot be parsed
        with gzip.open(BASE_DIR_JSON + month + "/" + file_name, "r") as infile:
            for line in infile:
                json_data = json.loads(line)
                tweet_id = int(json_data["id_str"])
                # store only the tweets that are needed instead of keeping
                # every tweet of the month in memory
                if tweet_id in wanted_ids:
                    tweet_data_keep[tweet_id] = { "user_id": json_data["user"]["id_str"],
                                                  "created_at": json_data["created_at"] }
    print(month, len(tweet_data_keep))

20201116-23.out.gz
202011 2606


In [20]:
# Number of relevant tweets that were found in the JSON files
len(tweet_data_keep)

2606

In [21]:
# Creation time and user id per tweet, in the row order of relevant_results_df.
# A tweet id that is missing from tweet_data_keep raises a KeyError.
created_ats = [
    tweet_data_keep[tweet_id]['created_at']
    for tweet_id in relevant_results_df['id_str']
]
user_ids = [
    tweet_data_keep[tweet_id]['user_id']
    for tweet_id in relevant_results_df['id_str']
]

In [22]:
# Append the values collected from the JSON files as two new columns
relevant_results_df = relevant_results_df.assign(
    created_at=created_ats,
    user_id=user_ids,
)

In [23]:
# Save the tweets extended with created_at and user_id.
# NOTE(review): "20211" in the file name may be a typo for "202011" (the final
# export file is called social-distancing-202011.csv) - confirm before renaming.
relevant_results_df.to_csv("csv/social_distancing_relevant_results_df_extended_20211.csv")

In [24]:
# Show the first row to check the two added columns
relevant_results_df.iloc[0]

Unnamed: 0                                                                   1
id_str                                                     1327024788330729472
in_reply_to_status_id_str                                          1.32702e+18
user                                                               Sylly_itsme
verified                                                                   NaN
text                         @SweetLakeB @Erikkokkie @ineke_at @dennisvdber...
location                                                                   NaN
label                                                                 SUPPORTS
created_at                                      Thu Nov 12 23:05:39 +0000 2020
user_id                                                    1240663073276465152
Name: 0, dtype: object

## 4. Extract municipalities

In [25]:
# Municipality tables; the Wikipedia file supplies the "municipality" and
# "province" columns used below, the Nielsen file is only loaded
MUNICIPALITY_FILE_WIKIPEDIA = "csv/municipalities-wikipedia.csv"
MUNICIPALITY_FILE_NIELSEN = "csv/municipalities-nielsen.csv"

In [26]:
# Load both municipality tables; only the Wikipedia table is used in this cell
municipalities_df = pd.read_csv(MUNICIPALITY_FILE_WIKIPEDIA)
municipalities_nielsen_df = pd.read_csv(MUNICIPALITY_FILE_NIELSEN)

# Stripped, lower-cased municipality name -> list of location records; a name
# that occurs several times in the table gets several records
municipalities_dict = {}
for name, province_name in zip(municipalities_df['municipality'], municipalities_df['province']):
    municipalities_dict.setdefault(name.strip().lower(), []).append(
        {'municipality': name, 'province': province_name, 'country': 'Netherlands'}
    )

In [27]:
province_names = "Drenthe Flevoland Friesland Gelderland Groningen Limburg Noord-Brabant Noord-Holland Overijssel Utrecht Zeeland Zuid-Holland".split()
# Lower-cased province name -> one location record without municipality
provinces = {
    name.lower(): [{ "municipality": "", "province": name, "country": "Netherlands" }]
    for name in province_names
}

In [28]:
country_names = "Belgium France Germany Luxembourg Netherlands".split()
# Lower-cased country name -> one location record with only the country filled in
countries = {
    name.lower(): [{ "municipality": "", "province": "", "country": name }]
    for name in country_names
}

In [29]:
# Manual corrections for profile locations that the generic matching misses.
# Keys are looked up with location.lower() (see location_translate), so every
# key must be lower case: a key with a capital letter can never match. Values
# are municipality, province or country names.
# NOTE(review): "khunfang, eindhoven, jomtien" maps to Rotterdam, and
# "gemeente sudwest fryslan" maps to a spelling that differs from the
# "Súdwest-Fryslân" used by the other entries - confirm both.
location_translations = {
    "streefkerk": "Molenlanden",
    "hoogvliet": "Rotterdam",
    "pijnacker noord": "Pijnacker-Nootdorp",
    "scheveningen": "Den Haag",
    "dinther": "Bernheze",
    "near rotterdam the netherlands": "Rotterdam",
    "omgeving rotterdam": "Rotterdam",
    "zaandijk": "Zaanstad",
    "westerhoven": "Bergeijk",
    "grunn": "Groningen",
    "district arnhem, reinwich gld.": "Arnhem",
    "hollandscheveld": "Hoogeveen",
    "jordaan": "Amsterdam",
    "dokkum": "Noardeast-Fryslân",
    "mokum": "Amsterdam",
    "driebergen": "Utrechtse Heuvelrug",
    "volendam": "Edam-Volendam",
    "ter apel": "Westerwolde",
    "nederland, steenwijkerland": "Steenwijkerland",
    "nederland, maassluis": "Maassluis",
    "lutjegast": "Westerkwartier",
    "nl - spijkenisse": "Nissewaard",
    "wanneperveen": "Steenwijkerland",
    "holsloot 0613164463": "Coevorden",
    "lemmer": "De Friese Meren",
    "❌❌❌": "Amsterdam",
    "buitengebied brunssumerheide (": "Brunssum",
    "sellingen": "Westerwolde",
    "ochten": "Neder-Betuwe",
    "020": "Amsterdam",
    "amsterdam-noord": "Amsterdam",
    "kaatsheuvel": "Loon op Zand",
    "geboren rotturdammurt": "Rotterdam",
    "khunfang, eindhoven, jomtien": "Rotterdam",
    "rotterdam-ijsselmonde": "Rotterdam",
    "010": "Rotterdam",
    "the hague | austin, tx": "Den Haag",
    "sleen": "Coevorden",
    "de rijp": "Alkmaar",
    "monnickendam": "Waterland",
    "ginneken": "Breda",
    "@🏠, leeuwarden , friesland, 🇪🇺": "Leeuwarden",
    "oude noorden, rotterdam": "Rotterdam",
    "stiens": "Leeuwarden",
    "baarlo (lb)": "Peel en Maas",
    "zwaag": "Hoorn",
    "efteling": "Loon op Zand",
    "menaldum": "Waadhoeke",
    "glanerbrug": "Enschede",
    "040": "Eindhoven",
    "loon": "Assen",
    "mijdrecht": "De Ronde Venen",
    "duivendrecht": "Ouder-Amstel",
    "wezep": "Oldebroek",
    "metropool amsterdam": "Amsterdam",
    "regio alkmaar": "Alkmaar",
    "malden": "Heumen",
    "nieuwolda": "Oldambt",
    "074": "Hengelo",
    "netherlands, groningen": "Groningen",
    "init, amsterdam": "Amsterdam",
    "noordwolde, bedum": "Het Hogeland",
    "siegerswoude": "Opsterland",
    "soestdijk": "Soest",
    "rotterdam-capelle ad ijssel nl": "Rotterdam",
    "berkel en rodenrijs": "Lansingerland",
    "breskens, sluis & terneuzen": "Sluis",
    "lent, nijmegen": "Nijmegen",
    "hofstad": "Den Haag",
    "nijverdal": "Hellendoorn",
    "biddinghuizen": "Dronten",
    "noordwolde": "Weststellingwerf",
    "52.326147,4.855987": "Amsterdam",
    "schoonhoven": "Krimpenerwaard",
    "schoonhoven-oude stad": "Krimpenerwaard",
    "beilen": "Midden-Drenthe",
    "vleuten": "Utrecht",
    "nl-overbetuwe": "Overbetuwe",
    "made": "Drimmelen",
    "nederland, amsterdam": "Amsterdam",
    "roelofarendsveen": "Kaag en Braassem",
    "den-haag": "Den Haag",
    "iphone: 52.221008,6.895315": "Enschede",
    "the haque": "Den Haag",
    "amsterdam.z.o.": "Amsterdam",
    "eys": "Gulpen-Wittem",
    "valkenburg netherland": "Valkenburg aan de Geul",
    "annen, aa en hunze": "Aa en Hunze",
    "amerongen": "Utrechtse Heuvelrug",
    "kralingen-oost": "Rotterdam",
    "072": "Alkmaar",
    "vinkeveen": "De Ronde Venen",
    "overstegen-oost, doetinchem": "Doetinchem",
    "the netherlands rotterdam": "Rotterdam",
    "winschoten": "Oldambt",
    "twello": "Voorst",
    "bunschoten-spakenburg": "Bunschoten",
    "ÜT: 51.629194,5.311012".lower(): "Boxtel",
    "overijssel, deventer": "Deventer",
    "schaarsbergen, arnhem": "Arnhem",
    "marum": "Westerkwartier",
    "alphen aan den rijn. netherlands": "Alphen aan den Rijn",
    "drentse in groningen ☘️🌷🌳": "Groningen",
    "rijnsburg": "Katwijk",
    "anna paulowna": "Hollands Kroon",
    "onderbanken": "Beekdaelen",
    "nuth": "Beekdaelen",
    "oirsbeek, schinnen": "Beekdaelen",
    "wijdenes": "Drechterland",
    "centrum hoogkarspel": "Drechterland",
    "goose meren": "Gooise Meren",
    "aarle-rixtel": "Laarbeek",
    "lieshout": "Laarbeek",
    "hooge mierde": "Reusel-De Mierde",
    "mook": "Mook en Middelaar",
    "berg 22c nuenen t: 040 2831675": "Nuenen, Gerwen en Nederwetten",
    "nuenen-eeneind": "Nuenen, Gerwen en Nederwetten",
    "de tienden 48 5674 tb nuenen": "Nuenen, Gerwen en Nederwetten",
    "nuenen-noord": "Nuenen, Gerwen en Nederwetten",
    "sint odiliënberg": "Roerdalen",
    "gemeente sudwest fryslan": "Sudwest Fryslan",
    "hurdegarijp": "Tietjerksteradeel",
    "hurdegaryp": "Tietjerksteradeel",
    "appingedam": "Eemsdelta",
    "stad appingedam": "Eemsdelta",
    "appingedam (gr)": "Eemsdelta",
    "delfzijl": "Eemsdelta",
    "netherland, delfzijl": "Eemsdelta",
    "loppersum": "Eemsdelta",
    "the hague": "Den Haag",
    "hoofddorp": "Haarlemmermeer",
    "voorburg": "Leidschendam-Voorburg",
    "spijkenisse": "Nissewaard",
    "zaandam": "Zaanstad",
    "bussum": "Gooise Meren",
    "'s-gravenhage": "Den Haag",
    "drachten": "Smallingerland",
    "bodegraven": "Bodegraven-Reeuwijk",
    "leyden": "Leiden",
    "bilthoven": "De Bilt",
    "rosmalen": "'s-Hertogenbosch",
    "ijmuiden": "Velsen",
    "maarssen": "Stichtse Vecht",
    "den bosch": "'s-Hertogenbosch",
    "emmeloord": "Noordoostpolder",
    "leidschendam": "Leidschendam-Voorburg",
    "sneek": "Súdwest-Fryslân",
    "steenwijk": "Steenwijkerland",
    "súdwest fryslân": "Súdwest-Fryslân",
    "naarden": "Gooise Meren",
    "nieuw-lekkerland": "Molenlanden",
    "leerdam": "Vijfheerenlanden",
    "oud-beijerland": "Hoeksche Waard",
    "pijnacker": "Pijnacker-Nootdorp",
    "vianen": "Vijfheerenlanden",
    "voorburg oud, leidschendam-voo": "Leidschendam-Voorburg",
    "hoofddorp of elders": "Haarlemmermeer",
    "bedum": "Het Hogeland",
    "haren": "Groningen",
    "the hague netherlands": "Den Haag",
    "nuenen, gerwen en nederwetten, nederland": "Nuenen, Gerwen en Nederwetten",
    "noord holland": "Noord-Holland",
    "north holland": "Noord-Holland",
    "fryslân": "Friesland",
    "fryslan": "Friesland",
    "veluwe, gelderland, nederland": "Gelderland",
    "ijmond": "Noord-Holland",
    "thuis in west friesland": "Noord-Holland",
    "netherlands, noord-brabant": "Noord-Brabant",
    "north brabant": "Noord-Brabant",
    "south holland": "Zuid-Holland",
    "west gelderland": "Gelderland",
    "zuid holland": "Zuid-Holland",
    "nederland (nb) en spanje (gc)": "Noord-Brabant",
    "eu nederland utrecht n'gein": "Nieuwegein",
    "north-brabant": "Noord-Brabant",
    "haaglanden": "Zuid-Holland",
    "noord brabant": "Noord-Brabant",
    "zeeuws-vlaanderen": "Zeeland",
    "rijnmond. binnenkort buiten eu": "Zuid-Holland",
    "nederland, overijssel": "Overijssel",
    "nederland, den haag": "Den Haag",
    "twente": "Overijssel",
    "achterhoek": "Gelderland",
    "8erhoek": "Gelderland",
    "veluwe , gld": "Gelderland",
    "nederland": "Netherlands",
    "the netherlands": "Netherlands",
    "nl": "Netherlands",
    "holland": "Netherlands",
    "🇳🇱": "Netherlands",
    "🇱🇺": "Luxembourg",
    "belgie": "Belgium",
    "belgië": "Belgium",
    "be": "Belgium",
    "vlaanderen": "Belgium",
    "brussels": "Belgium",
    "brussel": "Belgium",
    "antwerpen": "Belgium",
    "gent": "Belgium",
    "ghent": "Belgium",
    "antwerp": "Belgium",
    "leuven": "Belgium",
    "brugge": "Belgium",
    "kortrijk": "Belgium",
    "mechelen": "Belgium",
    "broekzele": "Belgium",
    "berlin": "Germany",
    "berghem-noord, oss": "Oss",
    "belgique": "Belgium",
}

## Anchor 1

In [30]:
def location_translate(location):
    """Return the manual translation of `location`, or `location` itself when there is none.

    The lookup in location_translations ignores case.
    """
    return location_translations.get(location.lower(), location)

In [31]:
def identify_location(location):
    """Map a free-text profile location to known location records.

    The stripped text is passed through location_translate(). Several
    simplified variants of it are then looked up, first in municipalities_dict,
    then in provinces and finally in countries; the first hit wins. When
    nothing matches, a text that mentions the Netherlands or Belgium is mapped
    to that country.

    Args:
        location: the location text (a string, not NaN).

    Returns:
        A list of dicts with the keys "municipality", "province" and
        "country", or an empty list when the location is not recognised.
    """
    location = location_translate(location.strip())
    # Texts that mention Belgium must not match through the coarse variants
    mentions_belgium = re.search("belg", location, flags=re.IGNORECASE) is not None

    # All lookup tables are keyed by lower-cased names, so the exact match has
    # to be lower-cased as well. Comparing the text as typed only worked for
    # input that was already lower case, and names containing a separator
    # (such as "Nuenen, Gerwen en Nederwetten") were cut off by every variant.
    exact = location.lower()
    before_separator = re.sub("[,/|].*$", "", location).strip().lower()
    first_word = re.sub(" .*$", "", location).strip().lower()
    # Variants without a trailing country name
    without_country = [
        re.sub(pattern, "", location, flags=re.IGNORECASE).strip().lower()
        for pattern in (",* Ne[dt][eh].*$", ",* The Ne[dt][eh].*$", ",* NL.*$", ",* Holland.*$")
    ]
    translated_before_separator = location_translate(before_separator).lower()
    translated_first_word = location_translate(first_word).lower()

    # Second element: are the translated variants skipped for Belgian texts?
    # They are not skipped for countries, so that a variant translated to
    # "Belgium" can still be found there.
    lookups = ((municipalities_dict, True), (provinces, True), (countries, False))
    for table, guard_translated in lookups:
        # (key, skip this key when the text mentions Belgium), in priority order
        candidates = [(exact, False), (before_separator, True)]
        candidates.extend((variant, False) for variant in without_country)
        candidates.extend([
            (translated_before_separator, guard_translated),
            (first_word, True),
            (translated_first_word, guard_translated),
        ])
        for key, skip_if_belgian in candidates:
            if skip_if_belgian and mentions_belgium:
                continue
            if key in table:
                return table[key]

    # Last resort: the country is mentioned somewhere in the text
    if re.search("(nederland|netherlands)", location, flags=re.IGNORECASE):
        return countries["netherlands"]
    if re.search("(belgie|belgië|belgium)", location, flags=re.IGNORECASE):
        return countries["belgium"]
    return []

In [32]:
# Resolve the profile location of every tweet. location_data gets one entry
# per row: the records returned by identify_location, or [{}] when the
# location is missing or not recognised.
municipalities_found = {}   # location as stored in the data -> records
municipalities_missed = {}  # white-space normalised location -> number of rows
location_data = []
for i, raw_location in enumerate(relevant_results_df['location']):
    if i % 1000 == 0:
        squeal(i)
    if pd.isna(raw_location):
        location_data.append([{}])
        continue
    location = re.sub(r'\s+', ' ', raw_location.strip())
    if raw_location in municipalities_found:
        location_data.append(municipalities_found[raw_location])
    elif location in municipalities_missed:
        municipalities_missed[location] += 1
        location_data.append([{}])
    else:
        identify_location_result = identify_location(location)
        if len(identify_location_result) > 0:
            municipalities_found[raw_location] = identify_location_result
            location_data.append(identify_location_result)
        else:
            municipalities_missed[location] = 1
            location_data.append([{}])
squeal(i)

2605


In [33]:
# Should equal the number of rows of relevant_results_df
len(location_data)

2606

In [34]:
# Count the tweets without a municipality, without a province, and with a
# country other than the Netherlands (a missing country counts as well).
# Only the first record of every entry is inspected.
no_municipality = sum(1 for data in location_data if data[0].get("municipality", "") == "")
no_province = sum(1 for data in location_data if data[0].get("province", "") == "")
no_country = sum(1 for data in location_data if data[0].get("country") != "Netherlands")
print(no_municipality, no_province, no_country)

1884 1816 1502


In [35]:
# NOTE(review): these two numbers do not occur in the outputs shown in this
# notebook; presumably counts copied from another run - confirm what this
# ratio is meant to show or compute it from the variables above.
139541/318858

0.43762740781162773

## 5. Export

In [36]:
# One value per tweet, taken from the first location record; a missing field
# is exported as an empty string
export_municipality = [data[0].get("municipality", "") for data in location_data]
export_province = [data[0].get("province", "") for data in location_data]
export_country = [data[0].get("country", "") for data in location_data]
        

In [37]:
# New frame with the three location columns appended; relevant_results_df stays unchanged
export_results_df = relevant_results_df.assign(
    municipality=export_municipality,
    province=export_province,
    country=export_country,
)

In [38]:
# Remove the columns that are not exported and put the others in their final
# order. Running this cell twice fails: the dropped columns are gone by then.
export_results_df = (
    export_results_df
    .drop(columns=["Unnamed: 0", "text", "in_reply_to_status_id_str", "user", "verified"])
    .reindex(columns=["label", "id_str", "user_id", "created_at", "location", "municipality", "province", "country"])
)
export_results_df.head(20)

Unnamed: 0,label,id_str,user_id,created_at,location,municipality,province,country
0,SUPPORTS,1327024788330729472,1240663073276465152,Thu Nov 12 23:05:39 +0000 2020,,,,
1,REJECTS,1327025585575571456,1176818704455274496,Thu Nov 12 23:08:49 +0000 2020,Je moeders slaapkamer,,,
2,SUPPORTS,1327025864467386369,1244794721807085569,Thu Nov 12 23:09:55 +0000 2020,Earth,,,
3,SUPPORTS,1327026911248343041,20710302,Thu Nov 12 23:14:05 +0000 2020,,,,
4,SUPPORTS,1327028489430716416,45311611,Thu Nov 12 23:20:21 +0000 2020,,,,
5,REJECTS,1327031746190258179,104621048,Thu Nov 12 23:33:18 +0000 2020,,,,
6,SUPPORTS,1327032944909807617,1218312682786447360,Thu Nov 12 23:38:03 +0000 2020,,,,
7,SUPPORTS,1327038195335393288,121416229,Thu Nov 12 23:58:55 +0000 2020,"Warsaw, Poland",,,
8,REJECTS,1327038735813402624,377882157,Fri Nov 13 00:01:04 +0000 2020,Nederland,,,Netherlands
9,SUPPORTS,1327042341086105606,14220791,Fri Nov 13 00:15:24 +0000 2020,Don’t speak! Hide and seek!,,,


In [39]:
# Write the final data set without the index column
export_results_df.to_csv("social-distancing-202011.csv", index=False)

## Old code

In [289]:
# Locations that were inspected by hand; presumably location -> number of
# tweets at the time of checking (TODO confirm). The dict is only referenced by
# the commented-out filter in the cell that lists municipalities_missed.
checked = {'Nederland': 16217,'The Netherlands': 6874, 'Netherlands': 5744, 'Belgium': 1504, 'België': 1317, 'NL': 1062, 'Zuid-Holland, Nederland': 935, 'Holland': 818, 
           'Noord-Brabant, Nederland': 738, 'Antwerpen, België': 693, 'the Netherlands': 651, 'Noord-Holland, Nederland': 600, 'Friesland, Nederland': 520, 
           'Gelderland, Nederland': 492, 'Europe': 491, 'Naast t Huis voor #Klokkenluiders in #BeerputNederland': 470, 'nederland': 426, 'InACastleDarkOrAFortressStrong': 426,
           'Friesland': 390, 'netherlands': 378, 'Overijssel, Nederland': 359, 'Brussel, België': 340, 'Gent, België': 326, 'Twente': 309, 'Worldwide': 244, 'Earth': 222, 
           'Kaal/Altis/Tanoa': 219, 'Mechelen, België': 218, 'she/her': 217, 'Antwerp, Belgium': 212, 'Windroos': 208, 'Brussels, Belgium': 206, 'Noord-Brabant': 202, 'Curaçao': 201,
           'Brabant': 183, 'Ghent, Belgium': 173, 'Vlaanderen': 172, 'Achterhoek': 167, 'Leuven, België': 165, 'Zeeland': 160, 'Gelderland': 157, 'Zuid-Holland': 155, 
           'Nederland, Den Haag': 341, '🅢🅣🅞🅟 🅛🅔🅔🅡🅟🅛🅘🅒🅗🅣': 155, 'Fryslân': 154, 'holland': 154, '🇳🇱': 150, 'Nl': 150, 'Brugge, België': 149, 'Dutchland': 144, 
           'Sint-Petersburg, Rusland': 137, 'Zeeland, Nederland': 134, 'Everywhere': 133, 'Alpha Quadrant': 132, '10km from amsterdam': 126, 'Noord-Holland': 123, 
           'The Netherlands - Holland': 122, 'Overal en nergens': 122, 'Ausland.': 118, 'Planet Earth': 116, 'Noordeling': 116, 'Thuis': 112, 'the netherlands': 111, 
           'https://gab.ai/Ducktape': 111, 'Germany': 108, 'Holland, The Netherlands': 102, 'Flevoland, Nederland': 101, 'Overijssel': 100, 'World': 99, 'NETHERLANDS': 98,
           'doeslief': 98, 'Europa': 97, 'Europe, the World': 96, 'belgium': 94, 'Noord Holland': 93, 'Underground': 91, 'Ergens op de wereld': 88, 'Home': 88, 'Fryslan': 86, 
           'ENTER': 85, 'North Holland, The Netherlands': 84, 'Under the sea': 84, 'Clock Zero': 83, 'Leuven, Belgium': 80, 'France': 79,'https://gab.com/Master_Lars': 77, 
           'Dewolken': 77, 'Universe': 75, 'Kortrijk, België': 74, 'Spain': 74, '#verNederland': 73, 'World citizen': 72, 'NVT': 71, 'Bananenrepubliek Nederland': 71, 
           'Somewhere': 68, 'Gent, Belgium': 68, 'Oostende, België': 68, 'overijssel': 68, 'Veluwe, Gelderland, Nederland': 67, '21 k from Amsterdam': 67,
           'Nederlands Neanderthalië': 66, 'earth': 66, 'Gonnersdorf': 66, 'Mother Earth': 66, 'Frankrijk': 66, 'Zuid-Holland Zuid': 66, 'Mechelen, Belgium': 66, 'Op Aarde': 65, 
           'Riding in a rocket to a planet of sound': 65, 'broekzele': 64, 'Somewhere over the rainbow': 64, 'Berlin': 64, '': 63, 'WorldWide Take No Vaccin!!': 63, 
           'Earth,The One and Only Truth': 63, 'IJmond': 62, 'London, England': 62, 'Thuis in West Friesland': 62, 'redactie@looopings.nl': 61, 'Cyberspace': 61, 
           'Andalusia,Cartama, Spain': 61, 'Netherlands, Noord-Brabant': 61, 'www.BNMO.nl': 60, 'North Brabant, The Netherlands': 60, 'Panic Room': 60, 'Bananenrepubliek NL': 59, 
           'Belgie, Vlaanderen': 59, 'Usan, Scotland': 59, 'de aarde': 58, 'Nederland / USA': 58, 'Belgique': 57, 'Out there': 56, 'I vakreste byen Oslo i Norge': 55, 'The World': 55, 
           'Hilbert space': 55, 'Lier, België': 55, '🤡🌍': 55, 'Gezond verstand! ❤': 54, 'woutveldhoen@hotmail.com': 54, 'Home is where my ❤ is..': 54, 'Holland.': 54, 'Nederland/Spanje': 54, 
           'Planet Trump': 53, 'Mook': 53, 'thuis': 53, 'world': 53, 'Netherands': 52, 'New York, USA': 52, 'Home: where macbook meets wifi': 51, 'Platteland Zweden': 51, 
           'he/him': 51, 'Hasselt, België': 51, 'South Holland, The Netherlands': 50, 'Koksijde, België': 50, 'Roeselare, België': 49, 'Griekenland/Nederland': 49, 'Jahannam': 49, 
           'Nowhere': 48, 'Vlaming in hart en nieren': 48, 'gelderland': 48, 'Hier en daar': 48, 'https://youtu.be/k6xE7gyXjvE': 47, 'Flevoland': 47, 'overal en nergens': 47, 
           'Aarde': 47, 'Berghem-Noord, Oss': 46, 'West Gelderland, Nederland': 46, 'Bollenstreek': 46, 'Veluwe , Gld': 46, 'Hier': 45, 'EU': 45, 'Online': 45, 'Aalst, België': 45, 
           'Vandaag hier, morgen elders.': 43, 'Snuisterdrecht': 43, "Aan 't roer, Grote Roerganger": 43, '8erhoek': 43, 'Hamaland, land v/ heide & duin': 43, 
           'Somewhere in The Netherlands': 43, 'Everywere': 42, 'Zuid Holland': 42, 'Wonderland': 42, 'Internet': 42, 'Hollandistan, EUSSR vassal': 41, 'West Coast, Netherlands': 41, 
           'All around': 41, 'de mooiste stad van Nederland.': 41, 'Platteland': 41, 'Here, there and everywhere': 40, 'Kingdom of The Netherlands': 40, 'NL - Mainly Dutch tweets': 40, 
           'Open inrichting Nederland': 40, 'Brasschaat, België': 39, 'Westerlo, België': 39, '8000m2': 39, 'Nederland (NB) en Spanje (GC)': 39, 'de luis in je bruine pels': 39, 
           'Zeeland, The Netherlands': 39, '(Tweets onder Pers.titel)': 39, 'Leuven Belgium': 39, 'Genk, België': 39, 'bestuurssecretariaat@': 39, 'Hic sunt dragones.': 39, 
           'Siegerswoude': 39, 'Gaza': 39, 'Check out this link ⤵️': 39, "Eu Nederland Utrecht N'gein": 38, 'Gebrook': 38, 'on the planet': 38, '127.0.0.1': 38, 'Kalmthout, België': 38, 
           'transhoek': 38, '24 landen': 38, 'Tribal inland territories': 38, 'The world.': 37, 'Gyhum, Deutschland': 37, 'Brazilië': 37, 'Halle, België': 37, 'Istanbul': 37, 
           'North-Brabant, Netherlands': 37, 'Los Santos': 37, 'Oost Nederland': 37, 'North Africa': 37, 'Randstad': 37, 'ergens en nergens': 37, 'Rainforrest': 37, 'Wuhan': 37, 
           'Netherland': 36, 'Oud Turnhout': 36, 'planet earth': 36, 'THE Netherlands': 36, 'Anti EU': 36, 'Knokke-Heist, België': 36, 'In quarantaine of in Efteling': 36, 
           'Holanda (Países Baixos)': 36, 'Spanje': 35, 'boeskoolcity': 35, 'Deinze, België': 35, 'wonderland': 35, 'Op de Zeebodem': 35, 'Boven groningen': 35, 'Gates of Hell': 35, 
           "'t Gooi": 34, 'N': 34, 'Haaglanden': 34, 'Cuba': 34, 'Op Jacht': 34, 'The Free World': 34, 'aarde': 34, 'Nederland, Overijssel': 34, 'Ergens in Nederland': 34, 
           '🏠Most of the time at home': 34, 'https://mastodon.social/users/': 34, 'de wereld': 34, 'Sint-Niklaas, België': 34, 'In de tuin': 34, 'The Netherlands.': 33, 
           'dierenvriend,heb🦌🐕🐓🦆🦙🦜🦚': 33, '🌍': 33, 'Bruges, Belgium': 33, 'Belgie': 33, 'Italië': 33, 'Blog:': 33, 'Waar ze friet zeggen': 33, 'the four corners of the Earth.': 32, 
           'Aruba': 32, 'Joods/Christelijk Westen': 32, 'Oost nederland': 32, 'Wat ooit Nederland was..': 32, 'Tienen, België': 32, 'Noord Nederland': 32, 'prov... groningen n': 32, 
           'Ook op Gab Geen ongevraagd dm': 32, 'Rijnmond. Binnenkort buiten EU': 32, '🇱🇺': 31, 'Noord Brabant': 31, 'De provincie.': 31, 'earthling,Twente,🇪🇺,da W🕸️B': 31, 
           'Berlijn, Duitsland': 31, 'Beveren, België': 31, 'Multiversum': 31, 'North Yorkshire': 31, 'wonend in buitenland': 30, 'México': 30, 'Overijssel Nederland': 30, 
           'Pjongyang': 30, 'Suriname Zuid Amerika': 30, 'Mihi vita Spica Virginis': 30, 'Zeeuws-Vlaanderen': 30, 'Zee / NL ,FI, FR ,RUS': 30, 'Ergens op deze Aardbol': 29, 
           'somewhere': 29, 'Katrijn B.V.': 29, 'Antwerpen': 268, 'Drenthe': 244, 'Limburg, Nederland': 210, 'Gent': 207, 'Brussels': 196, 'Brussel': 176, 'Brugge': 130, 'Limburg': 121,
           'Leuven': 90, 'Antwerp': 80, 'Mechelen': 66, 'drenthe': 64, 'Ghent': 46, 'Hasselt': 46, 'Brasschaat': 35, 'Halle, Hinterecke': 35, 'GENT': 33,
          }

In [304]:
# Print every Dutch municipality that was not assigned to any tweet location.
# Only the first record per name is considered.
municipalities_found_names = list({
    found[0]["municipality"]
    for found in municipalities_found.values()
    if found[0]['country'] == "Netherlands"
})
for municipality in municipalities_dict:
    first_record = municipalities_dict[municipality][0]
    if first_record['country'] != "Netherlands":
        continue
    if first_record['municipality'] not in municipalities_found_names:
        print(first_record)

KeyError: 0

In [291]:
# Ad-hoc check: print the unrecognised locations that contain "loppers" (any case)
for location in municipalities_missed:
    if not re.search("loppers", location, flags=re.IGNORECASE):
        continue
    print(location)

## Anchor 2

In [321]:
# Unrecognised locations, most frequent first (ties keep their insertion order)
dict(sorted(municipalities_missed.items(), key=lambda item: item[1], reverse=True))

{'Antwerpen, België': 693,
 'Europe': 491,
 'Naast t Huis voor #Klokkenluiders in #BeerputNederland': 470,
 'InACastleDarkOrAFortressStrong': 426,
 'Brussel, België': 340,
 'Gent, België': 326,
 'Antwerpen': 268,
 'Worldwide': 244,
 'Earth': 222,
 'Kaal/Altis/Tanoa': 219,
 'Mechelen, België': 218,
 'she/her': 217,
 'Antwerp, Belgium': 212,
 'Windroos': 208,
 'Gent': 207,
 'Brussels, Belgium': 206,
 'Curaçao': 201,
 'Brussels': 196,
 'Brabant': 183,
 'Brussel': 176,
 'Ghent, Belgium': 173,
 'Leuven, België': 165,
 '🅢🅣🅞🅟 🅛🅔🅔🅡🅟🅛🅘🅒🅗🅣': 155,
 'Brugge, België': 149,
 'Dutchland': 144,
 'Sint-Petersburg, Rusland': 137,
 'Everywhere': 133,
 'Alpha Quadrant': 132,
 'Brugge': 130,
 '10km from amsterdam': 126,
 'The Netherlands - Holland': 122,
 'Overal en nergens': 122,
 'Ausland.': 118,
 'Planet Earth': 116,
 'Noordeling': 116,
 'Thuis': 112,
 'https://gab.ai/Ducktape': 111,
 'World': 99,
 'doeslief': 98,
 'Europa': 97,
 'Europe, the World': 96,
 'Underground': 91,
 'Leuven': 90,
 'Ergens op de

In [92]:
# Number of distinct location texts that were resolved
len(municipalities_found)

665