# Maps

Generate maps from tweet data

In [1]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import os
import pandas as pd
import re

from IPython.display import clear_output

In [2]:
# Shared string constants: file locations, data-frame column names and
# special cell values used by the cells below.
BE = "BE"  # not referenced in the visible cells; presumably the Belgian counterpart of NL
COORDINATESFILE = "coordinates.csv"  # file in DATADIRLOCATIONS read by read_coordinates()
COUNTRY = "country"  # column used in both the coordinates and the user-location frames
DATADIRLOCATIONS = "../data/locations/"  # directory holding the coordinates and user-location CSVs
DATADIRTEXT = "../data/text/"  # directory listed by read_tweets() for tweet files
IDSTR = "id_str"  # index column of the tweet files
LATITUDE = "latitude"  # coordinates column
LOCATION = "location"  # coordinates column with the location name
LOCATIONLOWER = "location_lower"  # derived lower-case location column added to both frames
LONGITUDE = "longitude"  # coordinates column
NL = "NL"  # COUNTRY value kept by read_coordinates()
SCREENNAME = "screenname"  # index column of the user-location file
TEXT = "text"  # tweet column searched with QUERYTOPIC
UNKNOWN = "-"  # COUNTRY value of user-location rows that are filtered out
USER = "user"  # tweet column matched against the user-location index

In [3]:
# Minimum length the first whitespace-separated word of a location must have
# for add_clean_locations() to keep only that word.
MINFIRSTLOCATIONPARTLEN = 8

def squeal(text=None):
    """Clear the current cell output and, unless `text` is None, print `text`.

    Used as a progress indicator: every call replaces the previous message.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

def read_coordinates():
    """Read the coordinates CSV and return only the rows whose country is NL."""
    all_coordinates = pd.read_csv(DATADIRLOCATIONS + COORDINATESFILE)
    is_dutch = all_coordinates[COUNTRY] == NL
    return all_coordinates[is_dutch]

def read_user_locations(user_location_file):
    """Read a user-location CSV from DATADIRLOCATIONS, indexed by screen name.

    Args:
        user_location_file: file name relative to DATADIRLOCATIONS.

    Returns:
        The file contents as a DataFrame with SCREENNAME as index.
    """
    path = DATADIRLOCATIONS + user_location_file
    locations_by_user = pd.read_csv(path, index_col=SCREENNAME)
    return locations_by_user

def add_clean_locations(user_locations):
    """Add a normalised, lower-case location column to `user_locations`.

    Each value of the COUNTRY column is reduced as follows:
      1. keep the text before the first "/", then before the first " - ",
         then before the first ",";
      2. if the first whitespace-separated word has at least
         MINFIRSTLOCATIONPARTLEN characters, keep only that word;
      3. strip a trailing country name or code (Netherlands/Belgium variants);
      4. strip trailing punctuation and lower-case the result.

    Args:
        user_locations: DataFrame with a COUNTRY column. It is modified in
            place: the LOCATIONLOWER column is added or overwritten.

    Returns:
        The same DataFrame, for call chaining.
    """
    # Raw strings: "\(" is an invalid escape sequence in a normal string literal.
    country_suffix = re.compile(
        r" *(nederland|the netherlands|netherlands|\(nl\)|belgië|belgie|belgium|\(be\))$",
        flags=re.IGNORECASE)
    trailing_punctuation = re.compile(r"[.?!:;]*$")

    locations = []
    for raw_location in user_locations[COUNTRY]:
        location = str(raw_location).split("/")[0].strip()
        location = location.split(" - ")[0].strip()
        location = location.split(",")[0].strip()
        # An empty location has no words; the explicit check replaces a bare except.
        words = location.split()
        if words and len(words[0]) >= MINFIRSTLOCATIONPARTLEN:
            location = words[0]
        location = country_suffix.sub("", location)
        location = trailing_punctuation.sub("", location)
        locations.append(location.lower())
    user_locations[LOCATIONLOWER] = locations
    return user_locations

def read_tweets(file_pattern):
    """Load and combine all gzipped tweet CSV files whose name matches a pattern.

    Files in DATADIRTEXT are visited in sorted name order; the name of each
    file being read is shown via squeal().

    Args:
        file_pattern: regular expression searched for in each file name.

    Returns:
        One DataFrame indexed by IDSTR with duplicate rows removed.
        NOTE(review): drop_duplicates() compares column values only, not the
        index, so distinct tweets with identical column values are merged too
        -- confirm this is intended.
    """
    frames = []
    for file_name in sorted(os.listdir(DATADIRTEXT)):
        if not re.search(file_pattern, file_name):
            continue
        squeal(file_name)
        frame = pd.read_csv(DATADIRTEXT + file_name, index_col=IDSTR, compression="gzip")
        frames.append(frame)
    combined = pd.concat(frames)
    return combined.drop_duplicates()

In [4]:
# Month to process (presumably in YYYYMM form). It is used as the file-name
# pattern for the tweet files and in the names of the locations file and the plots.
MONTH = "202007"

# Load every tweet file in DATADIRTEXT whose name matches MONTH.
tweets = read_tweets(MONTH)

20200731-23.out.gz


In [5]:
# Per-month file with one location row per user (indexed by screen name).
LOCATIONSFILE = f"locations-{MONTH}.csv"

user_locations = read_user_locations(LOCATIONSFILE)
# Drop users whose COUNTRY value is the UNKNOWN marker. The explicit copy makes
# the filtered frame independent, so the column added by add_clean_locations()
# does not trigger pandas' SettingWithCopyWarning.
user_locations = user_locations[user_locations[COUNTRY] != UNKNOWN].copy()
user_locations = add_clean_locations(user_locations)

In [6]:
# read_coordinates() returns a boolean-filtered frame; copy it before adding a
# column so the assignment below does not trigger SettingWithCopyWarning.
coordinates = read_coordinates().copy()
# Index the coordinates by lower-case location name so they can be matched
# against the cleaned user locations.
coordinates[LOCATIONLOWER] = coordinates[LOCATION].str.lower()
coordinates = coordinates.set_index(LOCATIONLOWER)
# Keep only the users whose cleaned location occurs in the coordinates index.
user_locations_with_coordinates = user_locations[user_locations[LOCATIONLOWER].isin(coordinates.index)]

In [None]:
#QUERYTOPIC = "corona|covid|mondkapje|rivm|blijfthuis|houvol|huisarts|flattenthecurve"
#QUERYTOPIC = "mondkapje"
QUERYTOPIC = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"
#QUERYTOPIC = ""

# Count tweets per cleaned user location. With an empty QUERYTOPIC every tweet
# counts; otherwise only tweets whose text matches the (case-insensitive) regex.
location_counts = {}
counter = 0
# Only tweets by users whose location has known coordinates are considered.
selected_tweets = tweets[tweets[USER].isin(user_locations_with_coordinates.index)]
# Plain dict for the user -> location lookup: much cheaper than one .loc[] call
# per tweet. If a screen name occurred more than once, the last row would win.
location_of_user = user_locations_with_coordinates[LOCATIONLOWER].to_dict()
if QUERYTOPIC == "":
    # Grouping by a scalar (not a one-element list) keeps the group keys scalars.
    for user, tweet_ids in selected_tweets.groupby(USER).groups.items():
        location = location_of_user[user]
        location_counts[location] = location_counts.get(location, 0) + len(tweet_ids)
        counter += 1
        if counter % 1000 == 0: squeal(counter)
else:
    regex = re.compile(QUERYTOPIC, flags=re.IGNORECASE)
    # Iterate the two needed columns directly instead of building a row with
    # .iloc[i] twice per tweet.
    for user, text in zip(selected_tweets[USER], selected_tweets[TEXT]):
        if regex.search(str(text)):
            location = location_of_user[user]
            location_counts[location] = location_counts.get(location, 0) + 1
            counter += 1
            if counter % 100 == 0: squeal(counter)
# counter is the number of users (empty query) or of matching tweets (otherwise).
squeal(counter)

# Re-order by descending count; the sort is stable, so ties keep insertion order.
location_counts = dict(sorted(location_counts.items(), key=lambda item: item[1], reverse=True))

2000


In [None]:
# Report the number of counted tweets and their share (in percent, one decimal)
# of all loaded tweets.
print(f"found: {sum(location_counts.values())} tweets; coverage: {round(sum(location_counts.values())/len(tweets)*100,1)}%")

In [None]:
def make_plot_data(location_counts,coordinates):
    """Turn per-location counts into parallel lists for plotting.

    Locations that share the same longitude/latitude pair are merged into one
    point: the counts are summed and the label is that of the first such
    location in `location_counts`. Locations whose longitude and latitude are
    both missing are skipped.

    Args:
        location_counts: dict mapping location name to tweet count.
        coordinates: DataFrame indexed by location name with LONGITUDE and
            LATITUDE columns.

    Returns:
        Tuple (x, y, data_values, labels) of equally long lists: longitudes,
        latitudes, summed counts and labels.
    """
    longitudes, latitudes, totals, names = [], [], [], []
    # Maps "longitude latitude" to the position of that point in the lists.
    slot_of_point = {}
    for place, count in location_counts.items():
        row = coordinates.loc[place]
        longitude, latitude = row[LONGITUDE], row[LATITUDE]
        point = str(longitude) + " " + str(latitude)
        if point in slot_of_point:
            totals[slot_of_point[point]] += count
            continue
        if point == "nan nan":
            continue
        slot_of_point[point] = len(longitudes)
        longitudes.append(longitude)
        latitudes.append(latitude)
        totals.append(count)
        names.append(place)
    return longitudes, latitudes, totals, names

In [None]:
# Build the plot lists; the cell displays the number of distinct plot points.
x,y,data_values,labels = make_plot_data(location_counts,coordinates)
len(data_values)

In [None]:
def find_missing_locations(x,y,coordinates):
    """Print every location in `coordinates` whose point is not in the plot data.

    Args:
        x: list of plotted longitudes.
        y: list of plotted latitudes, parallel to `x`.
        coordinates: DataFrame indexed by location name with LONGITUDE and
            LATITUDE columns.
    """
    plotted_points = {str(longitude) + " " + str(latitude) for longitude, latitude in zip(x, y)}
    for place in coordinates.index:
        row = coordinates.loc[place]
        point = str(row[LONGITUDE]) + " " + str(row[LATITUDE])
        if point not in plotted_points:
            print(place)
# List the locations from the coordinates file that have no point in the plot data.
find_missing_locations(x,y,coordinates)

In [None]:
# The ten plot points with the most tweets, as (tweet count, label) pairs.
sorted(zip(data_values,labels),key=lambda pair:pair[0],reverse=True)[:10]

In [None]:
# The twenty plot points with the fewest tweets, as
# (tweet count, label, population size) triples.
[
    (count,label,int(coordinates.loc[label]["population_size"]))
    for count,label in sorted(zip(data_values,labels),key=lambda pair:pair[0],reverse=True)
][-20:]

In [None]:
# Marker-area scale: a point with as many tweets as "amsterdam" gets an area of
# 0.01*778888. NOTE(review): 778888 is presumably Amsterdam's population --
# confirm. Raises KeyError when no tweet was counted for "amsterdam".
BLOBFACTOR = 0.01*778888/location_counts["amsterdam"]
FONTSIZE = 7
PLOTFILENAME = f"map-{MONTH}.png"

# Background map; extent gives the longitude (x) and latitude (y) range it is
# stretched over. NOTE(review): presumably the bounding box of nederland.png.
img_netherlands = mpimg.imread("nederland.png")
plt.figure(figsize=(12.5,15))
plt.imshow(img_netherlands,alpha=0.4,extent=[3.3,7.2,50.75,53.55],aspect="auto")
# One blob per plot point, area proportional to its tweet count.
plt.scatter(x,y,s=[data_value*BLOBFACTOR for data_value in data_values],alpha=0.6)
for label,longitude,latitude in zip(labels,x,y):
    plt.annotate(label,(longitude,latitude),fontsize=FONTSIZE)
plt.savefig(PLOTFILENAME)
plt.show()

In [None]:
import matplotlib

PLOTFILENAME = f"correlation-{MONTH}.png"

font = {"size":14}
matplotlib.rc("font",**font)

# Pair each plot point's tweet count with the population size of its location,
# skipping locations without a known population size.
x_correlation = []
y_correlation = []
labels_correlation = []
for location,nbr_of_tweets in zip(labels,data_values):
    population_size = coordinates.loc[location]["population_size"]
    # Test for a missing value before converting: int(nan) raises ValueError,
    # so a NaN check after the conversion can never be reached.
    if pd.isna(population_size):
        continue
    x_correlation.append(int(population_size))
    y_correlation.append(nbr_of_tweets)
    labels_correlation.append(location)

# Log-log scatter plot of tweet count against population size.
plt.figure(figsize=(6,4))
plt.scatter(x_correlation,y_correlation)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("population size")
plt.ylabel("number of tweets")
plt.tight_layout()
plt.savefig(PLOTFILENAME)
plt.show()

## Locations without coordinates (time-consuming)

In [None]:
def get_locations_without_coordinates():
    """Count tweets per cleaned location for users whose location has no coordinates.

    Reads the notebook globals `user_locations`, `coordinates` and `tweets`.
    Progress (number of users handled) is shown via squeal() every 1000 users.

    Returns:
        dict mapping each cleaned location that is absent from the coordinates
        index to the number of tweets by users with that location.
    """
    has_coordinates = user_locations[LOCATIONLOWER].isin(coordinates.index)
    users_lacking = user_locations[~has_coordinates]
    tweets_of_users = tweets[tweets[USER].isin(users_lacking.index)]
    groups = tweets_of_users.groupby([USER]).groups

    totals = {}
    for processed, user in enumerate(groups, start=1):
        place = users_lacking.loc[user][LOCATIONLOWER]
        totals[place] = totals.get(place, 0) + len(groups[user])
        if processed % 1000 == 0:
            squeal(processed)
    return totals

In [None]:
#missing_location_counts = get_locations_without_coordinates()
#{location:missing_location_counts[location] \
#    for location in sorted(missing_location_counts.keys(),key=lambda l:missing_location_counts[l],reverse=True)}