# Build social distancing data from tweets

See email 15-07-2021 13:03

## 1. Select social distancing tweets

In [1]:
import os
import numpy as np
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output

In [2]:
BASE_DIR = "../data/text/"
DISTANCE_QUERY = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"
FILE_PATTERN = "^(202010)"
TEXT = "text"

In [3]:
def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)


def cleanup(text):
    text = re.sub(r"\\n"," ",text)
    text = re.sub(r"https://\S+","",text)
    text = re.sub(r"\s+"," ",text)
    text = text.strip()
    return(text)


def tokenize(text):
    return(" ".join(TweetTokenizer().tokenize(text)))


def preprocess(text):
    return(tokenize(cleanup(text)).lower())

In [4]:
query = DISTANCE_QUERY
files = sorted(os.listdir(BASE_DIR))
results_df = pd.DataFrame({})
preprocessed_texts = []
for file_name in files:
    if re.search(FILE_PATTERN, file_name):
        squeal(file_name)
        file_data = pd.read_csv(BASE_DIR + file_name)
        matched_text = file_data[file_data[TEXT].str.contains(query, case=False)]
        preprocessed_texts.extend(matched_text[TEXT].apply(lambda x: preprocess(x)))
        if len(results_df) == 0:
            results_df = matched_text.copy()
        else:
            results_df = pd.concat([results_df, matched_text], ignore_index=True)

20201031-23.out.gz


In [5]:
len(results_df), len(preprocessed_texts)

(76183, 76183)

In [6]:
DATA_FILE = f"csv/social_distancing_results_df_{FILE_PATTERN}.csv"

results_df.to_csv(DATA_FILE)

## 2. Label tweets

In [7]:
import fasttext

In [8]:
BESTDIM = 200
BESTEPOCH = 200
BESTLR = 0.2
DISTANCE = "distance"
TOPIC = DISTANCE
MODELFILE = f"model-{TOPIC}-{BESTDIM}-{BESTEPOCH}-{BESTLR}.bin"

In [9]:
model = fasttext.load_model(MODELFILE)



In [10]:
labels = model.predict(preprocessed_texts, k=3)

In [11]:
def shorten_label(label):
    return re.sub("^__label__", "", label)

In [12]:
results_df["label"] = [shorten_label(label[0]) for label in labels[0]]

In [13]:
results_df["probabilities"] = [ { shorten_label(labels[0][i][j]): labels[1][i][j] for j in range(0, 3)}
                                for i in range(0, len(labels[0])) ]

In [14]:
relevant_results_df = results_df[~results_df["label"].isin(["IRRELEVANT"])].copy()

In [15]:
len(relevant_results_df)

56173

In [16]:
relevant_results_df.to_csv(DATA_FILE)

In [17]:
relevant_results_df = pd.read_csv(DATA_FILE)

In [18]:
relevant_results_df.iloc[0]

Unnamed: 0                                                                   0
id_str                                                     1311425852169744385
in_reply_to_status_id_str                                          1.31135e+18
user                                                                 filakimou
verified                                                                   NaN
text                         @NUnl Waarom niet de dove mens het mondkapje o...
location                                                         Petra, Greece
label                                                                 SUPPORTS
probabilities                {'SUPPORTS': 0.98213124, 'IRRELEVANT': 0.01485...
Name: 0, dtype: object

## 3. Get sentiment scores

In [19]:
SENTIMENT_DIR = "../data/sentiment/pattern/"
files = sorted(os.listdir(SENTIMENT_DIR))
for file_name in files:
    if re.search(FILE_PATTERN, file_name):
        squeal(file_name)
        file_data = pd.read_csv(os.path.join(SENTIMENT_DIR, file_name), header=None).astype({0: str}).set_index(0)
        for id_str in relevant_results_df["id_str"]:
            if str(id_str) in file_data.index:
                relevant_results_df["sentiment"] = file_data.loc[str(id_str)][1]

20201031-23.out.gz


In [20]:
for index, row in relevant_results_df.iterrows():
    if "sentiment" not in row or pd.isna(row["sentiment"]):
        print(row)

In [21]:
relevant_results_df.to_csv(DATA_FILE)

In [29]:
relevant_results_df = relevant_results_df.set_index("id_str")
relevant_results_df.index

KeyError: "None of ['id_str'] are in the columns"

## 4. Get user id and time stamp from json files

In [22]:
import gzip
import json
import sys

In [23]:
BASE_DIR_JSON = "/home/erikt/media/20190525/files/cloud/twitter/"

In [37]:
tweet_data_keep = {}
for month in "202010".split():
    tweet_data = {}
    files = sorted(os.listdir(BASE_DIR_JSON + month))
    for file_name in files:
        if re.search(FILE_PATTERN, file_name):
            squeal(f"{file_name} {len(tweet_data_keep)}")
            infile = gzip.open(BASE_DIR_JSON + month + "/" + file_name, "r")
            for line in infile:
                json_data = json.loads(line)
                if int(json_data["id_str"]) in relevant_results_df.index:
                    if "quoted_status_id_str" in json_data:
                        quoted_status_id_str = json_data["quoted_status_id_str"]
                    else:
                        quoted_status_id_str = ""
                    if "extended_tweet" in json_data and "full_text" in json_data["extended_tweet"]:
                        full_text = json_data["extended_tweet"]["full_text"]
                    else:
                        full_text = ""
                    tweet_data_keep[int(json_data["id_str"])] = { 
                                                                 "created_at": json_data["created_at"],
                                                                 "id_str": json_data["id_str"],
                                                                 "entities": json_data["entities"],
                                                                 "text": json_data["text"],
                                                                 "full_text": full_text,
                                                                 "in_reply_to_user_id_str": json_data["in_reply_to_user_id_str"],
                                                                 "in_reply_to_status_id_str": json_data["in_reply_to_status_id_str"],
                                                                 "quoted_status_id_str": quoted_status_id_str,
                                                                 "user": {
                                                                     "id_str": json_data["user"]["id_str"],
                                                                     "name": json_data["user"]["name"],
                                                                     "screen_name": json_data["user"]["screen_name"],
                                                                     "verified": json_data["user"]["verified"],
                                                                     "followers_count": json_data["user"]["followers_count"],
                                                                 },
                                                               }
            infile.close()

20201031-23.out.gz 56119


In [38]:
len(tweet_data_keep)

56173

In [48]:
tweet_data_keep[list(tweet_data_keep.keys())[0]]

In [45]:
for id_str in relevant_results_df.index:
    tweet_data_keep[id_str]["label_social_distancing"] = relevant_results_df.loc[id_str]["label"]
    tweet_data_keep[id_str]["probabilities_social_distancing"] = relevant_results_df.loc[id_str]["probabilities"]
    tweet_data_keep[id_str]["sentiment"] = relevant_results_df.loc[id_str]["sentiment"]

In [47]:
OUT_FILE_NAME = "social-distancing-student.json"
out_file = open(OUT_FILE_NAME, "w")
for id_str in tweet_data_keep:
    print(json.dumps(tweet_data_keep[id_str]), file=out_file)
out_file.close()