# Build social distancing data from tweets

Required fields for each tweet in the resulting data set:

1. tweet id
2. user id
3. time stamp
4. municipality
5. stance with respect to social distancing

## 1. Select social distancing tweets

In [2]:
import os
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output

In [3]:
# Directory with the tweet text tables; every selected file in it is read with pd.read_csv.
BASE_DIR = "../data/text/"
# Regular expression selecting tweets about social distancing
# (matches e.g. "1,5 m", "afstand ... hou", "hou ... afstand", "anderhalve meter").
DISTANCE_QUERY = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"
# Regular expression applied to file names with re.search to choose the files to process
# (presumably a date prefix, here 2020-10-01 -- confirm against the file naming scheme).
FILE_PATTERN = "^(20201001)"
# Name of the column that holds the tweet text.
TEXT = "text"

In [4]:
def squeal(text=None):
    """Clear the output of the current cell and print text, unless text is None."""
    clear_output(wait=True)
    if text is not None:
        print(text)


def cleanup(text):
    """Return text with escaped newlines and https links removed and whitespace squeezed."""
    substitutions = (
        (r"\\n", " "),         # literal backslash + "n" sequences become a space
        (r"https://\S+", ""),  # drop https links
        (r"\s+", " "),         # collapse every whitespace run into one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def tokenize(text):
    """Split text with a TweetTokenizer and return the tokens joined by single spaces."""
    tokens = TweetTokenizer().tokenize(text)
    return " ".join(tokens)


def preprocess(text):
    """Clean up, tokenize and lowercase a tweet text, in that order."""
    cleaned = cleanup(text)
    tokenized = tokenize(cleaned)
    return tokenized.lower()

In [5]:
# Select the tweets matching the social distancing query from all files whose
# name matches FILE_PATTERN.
#   results_df          all matching rows, with a fresh 0..n-1 index
#   preprocessed_texts  cleaned, tokenized, lowercased text of each matching
#                       row, in the same order as the rows of results_df
query = DISTANCE_QUERY
files = sorted(os.listdir(BASE_DIR))
matched_frames = []
preprocessed_texts = []
for file_name in files:
    if re.search(FILE_PATTERN, file_name):
        squeal(file_name)
        file_data = pd.read_csv(BASE_DIR + file_name)
        # na=False: a row without text counts as "no match"; without it the
        # mask would contain NaN, which pandas refuses to use for indexing.
        is_match = file_data[TEXT].str.contains(query, case=False, na=False)
        matched_text = file_data[is_match]
        preprocessed_texts.extend(matched_text[TEXT].apply(preprocess))
        matched_frames.append(matched_text)

# Concatenate once after the loop: calling pd.concat inside the loop copies
# all previously collected rows again for every file. ignore_index=True also
# gives the same kind of index whether one or several files were matched.
if matched_frames:
    results_df = pd.concat(matched_frames, ignore_index=True)
else:
    # no file name matched FILE_PATTERN
    results_df = pd.DataFrame({})

20201001-23.out.gz


In [6]:
# Sanity check: both numbers should be equal, because the labels predicted from
# preprocessed_texts are assigned to the rows of results_df by position.
len(results_df), len(preprocessed_texts)

(3489, 3489)

In [7]:
# Output file for the selected tweets. FILE_PATTERN is inserted verbatim, so its
# regular expression characters become part of the file name.
DATA_FILE = f"csv/social_distancing_results_df_{FILE_PATTERN}.csv"

# Store all matched tweets; to_csv also writes the DataFrame index as an unnamed first column.
results_df.to_csv(DATA_FILE)

## 2. Label tweets

In [8]:
import fasttext

In [9]:
# Settings that identify the fastText model file to load; in this notebook they are
# only used to build the file name (presumably embedding dimension, number of
# epochs and learning rate of the best training run -- confirm).
BESTDIM = 200
BESTEPOCH = 200
BESTLR = 0.2
DISTANCE = "distance"
# Topic of the model; part of the model file name.
TOPIC = DISTANCE
MODELFILE = f"model-{TOPIC}-{BESTDIM}-{BESTEPOCH}-{BESTLR}.bin"

In [10]:
# Load the trained fastText model; MODELFILE is resolved relative to the working directory.
model = fasttext.load_model(MODELFILE)



In [11]:
# Predict labels for all preprocessed tweets in one call. The next cell reads the
# label of tweet i as labels[0][i][0] (labels[1] presumably holds the probabilities -- confirm).
labels = model.predict(preprocessed_texts)

In [12]:
# Store the first predicted label of every tweet, without the "__label__" prefix.
results_df["label"] = [
    re.sub("^__label__", "", tweet_labels[0])
    for tweet_labels in labels[0]
]

In [13]:
# Keep every tweet whose label is not IRRELEVANT; copy so that the result is
# independent of results_df.
relevant_results_df = results_df.loc[results_df["label"].ne("IRRELEVANT")].copy()

In [14]:
# Number of tweets left after removing the IRRELEVANT ones.
len(relevant_results_df)

2674

In [15]:
# Overwrite DATA_FILE, which until now contained all matched tweets, with the
# relevant tweets only (the label column is included).
relevant_results_df.to_csv(DATA_FILE)

In [16]:
# Read the stored tweets back from DATA_FILE. The index written by to_csv comes
# back as an extra column named "Unnamed: 0" and the frame gets a fresh 0..n-1 index.
# NOTE(review): pass index_col=0 here if that extra column is not wanted downstream.
relevant_results_df = pd.read_csv(DATA_FILE)

In [17]:
# Show the first stored tweet to inspect the available columns.
relevant_results_df.iloc[0]

Unnamed: 0                                                                   0
id_str                                                     1311425852169744385
in_reply_to_status_id_str                                          1.31135e+18
user                                                                 filakimou
verified                                                                   NaN
text                         @NUnl Waarom niet de dove mens het mondkapje o...
location                                                         Petra, Greece
label                                                                 SUPPORTS
Name: 0, dtype: object

## 3. Get user id and time stamp from json files

In [18]:
import gzip
import json
import sys

In [19]:
# Root directory of the gzipped JSON tweet files; the files are read from one
# subdirectory per month (e.g. 202010).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_DIR_JSON = "/home/erikt/media/20190525/files/cloud/twitter/"

In [20]:
def _extract_tweet_fields(json_data):
    """Return the fields of one parsed tweet that are kept by this notebook.

    quoted_status_id_str and the full_text of an extended tweet are optional
    in the input and are stored as "" when absent; all other fields must be
    present (a missing one raises KeyError).
    """
    user = json_data["user"]
    return {
        "created_at": json_data["created_at"],
        "id_str": json_data["id_str"],
        "entities": json_data["entities"],
        "text": json_data["text"],
        "full_text": json_data.get("extended_tweet", {}).get("full_text", ""),
        "in_reply_to_user_id_str": json_data["in_reply_to_user_id_str"],
        "in_reply_to_status_id_str": json_data["in_reply_to_status_id_str"],
        "quoted_status_id_str": json_data.get("quoted_status_id_str", ""),
        "user": {
            "id_str": user["id_str"],
            "name": user["name"],
            "screen_name": user["screen_name"],
            "verified": user["verified"],
            "followers_count": user["followers_count"],
        },
    }


# Look up the JSON metadata (user id, time stamp, ...) of every relevant tweet.
#   tweet_data       metadata of all tweets read for the current month,
#                    keyed by the integer tweet id
#   tweet_data_keep  metadata of the tweets in relevant_results_df only
tweet_data_keep = {}
for month in "202010".split():
    tweet_data = {}
    month_dir = os.path.join(BASE_DIR_JSON, month)
    files = sorted(os.listdir(month_dir))
    for file_name in files:
        if re.search(FILE_PATTERN, file_name):
            squeal(file_name)
            # "with" closes the file even when a line cannot be parsed
            with gzip.open(os.path.join(month_dir, file_name), "r") as infile:
                for line in infile:
                    json_data = json.loads(line)
                    tweet_data[int(json_data["id_str"])] = _extract_tweet_fields(json_data)
    # One pass over the id column instead of several iloc row lookups per
    # tweet. The ids compare equal to the integer keys of tweet_data as long
    # as the column has an integer dtype.
    for tweet_id in relevant_results_df["id_str"]:
        if tweet_id in tweet_data:
            tweet_data_keep[tweet_id] = tweet_data[tweet_id]
    print(month, len(tweet_data_keep))

20201001-23.out.gz
202010 2674


In [21]:
# Number of relevant tweets for which metadata was found in the JSON files.
len(tweet_data_keep)

2674

In [23]:
# Show the metadata stored for the first kept tweet as an example.
tweet_data_keep[list(tweet_data_keep.keys())[0]]