In [None]:
from matplotlib import pyplot as plt
from collections import Counter
import seaborn as sns
import pandas as pd
import datetime
import psycopg2
import pickle
import requests
import math
import os
import re

import warnings

# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation and data-quality warnings raised by the libraries
# imported above.
warnings.filterwarnings("ignore")

%load_ext nb_black

In [None]:
def connect():
    """Open a psycopg2 connection to the local ``venmo`` PostgreSQL database.

    The password is read from the ``POSTGRES_PASS`` environment variable and
    falls back to an empty string when that variable is not set.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    settings = {
        "user": "postgres",
        "password": os.environ.get("POSTGRES_PASS", ""),
        "host": "localhost",
        "port": 5432,
        "database": "venmo",
    }
    return psycopg2.connect(**settings)

In [None]:
# Keywords used to flag COVID-related messages. All entries are lowercase and
# one of them ("emergency room") is a two-word phrase.
# Partly taken from the paper "Machine Learning to Detect Self-Reporting of
# Symptoms, Testing Access, and Recovery Associated With COVID-19 on Twitter".
COVID_WORDS = [
    "diagnosed",
    "pneumonia",
    "coronavirus",
    "fever",
    "covid",
    "isolating",
    "quarantine",
    "emergency room",
]

In [None]:
# Single pass over the transactions table: for every COVID keyword, record the
# timestamps of the messages that mention it and count the words that appear
# in those messages. Partial results are pickled periodically and once more
# when the scan finishes, so the checkpoint files always reflect a full run.
#
# Counters kept in `meta`:
#   msgs               - rows whose message is a string
#   msgs_processed     - of those, messages that are non-empty after cleaning
#   msgs_ts_processed  - of those, messages with a usable timestamp
#   covid_tokens_found - keyword hits (one per keyword per matching message)
meta = Counter()

# Per-keyword results: message timestamps (epoch seconds) and word counts.
covid_token_usage = {w: [] for w in COVID_WORDS}
covid_token_refs = {w: Counter() for w in COVID_WORDS}

# Number of rows between two on-disk checkpoints.
CHECKPOINT_EVERY = 1_000_000


def save_covid_checkpoint():
    """Pickle the current keyword results and counters to the checkpoint files."""
    with open("covid_tokens.pkl", "wb") as f:
        pickle.dump((covid_token_usage, covid_token_refs), f)
    with open("covid_meta.pkl", "wb") as f:
        pickle.dump(meta, f)


conn = connect()
try:
    # A named cursor is a server-side cursor in psycopg2: rows are streamed in
    # batches of `itersize` instead of loading the whole table into memory.
    with conn.cursor(name="covid_exploration") as cursor:
        cursor.itersize = 2000
        cursor.execute("SELECT * FROM transactions")
        for i, row in enumerate(cursor):

            if i % CHECKPOINT_EVERY == 0:
                print("Row", i)
                # Skip row 0: nothing has been collected yet, and writing now
                # would overwrite the results of a previous run with empty data.
                if i > 0:
                    save_covid_checkpoint()

            # NOTE(review): row[1] is presumably the message text and row[4]
            # the transaction datetime - confirm against the table schema.
            raw_msg = row[1]
            if not isinstance(raw_msg, str):
                # NULL or non-text message: nothing to analyse.
                continue
            # Drop punctuation/emoji and lowercase, so that the lowercase
            # keywords also match "COVID", "Coronavirus", etc.
            msg = re.sub(r"[^\w\d_\- ]", "", raw_msg).strip().lower()
            meta["msgs"] += 1
            if not msg:
                continue
            meta["msgs_processed"] += 1

            try:
                ts = int(row[4].timestamp())
            except (AttributeError, OverflowError, OSError, ValueError):
                # Missing value, non-datetime value, or a datetime that cannot
                # be converted to epoch seconds: skip the row.
                continue
            meta["msgs_ts_processed"] += 1

            for token in COVID_WORDS:
                if token in msg:
                    meta["covid_tokens_found"] += 1
                    covid_token_usage[token].append(ts)
                    # Count every word of the matching message for this keyword.
                    covid_token_refs[token].update(msg.split(" "))

    # Final checkpoint: without it the rows processed after the last multiple
    # of CHECKPOINT_EVERY would never reach the files that later cells load.
    save_covid_checkpoint()
finally:
    # Release the connection even when the scan is interrupted or fails.
    conn.close()

In [None]:
# Checkpoint: reload the keyword results and counters pickled by the scan.
# These files are expected to be this notebook's own output; do not point this
# cell at pickle files from an untrusted source.
with open("covid_tokens.pkl", "rb") as tokens_file:
    saved_tokens = pickle.load(tokens_file)
covid_token_usage_saved, covid_token_refs_saved = saved_tokens

with open("covid_meta.pkl", "rb") as meta_file:
    meta_saved = pickle.load(meta_file)

meta_saved

In [None]:
# Share of counted messages that were non-empty after cleaning and had a
# usable timestamp. Value observed on the recorded run: 71%
meta_saved["msgs_ts_processed"] / meta_saved["msgs"]

In [None]:
# COVID keyword hits per counted message. A message containing several
# keywords contributes one hit per keyword, so this is not strictly the share
# of distinct messages. Value observed on the recorded run: 0.014%
meta_saved["covid_tokens_found"] / meta_saved["msgs"]

In [None]:
# Show the 20 most frequent words seen in messages matching each keyword.
# Nothing that interesting here.
for keyword in covid_token_refs_saved:
    word_counts = covid_token_refs_saved[keyword]
    print(keyword)
    print(word_counts.most_common(20))

In [None]:
# Flatten the per-keyword timestamp lists into one (keyword, timestamp) pair
# per hit, then lay the pairs out as two parallel columns.
keyword_hits = [
    (keyword, hit_ts)
    for keyword, hit_timestamps in covid_token_usage_saved.items()
    for hit_ts in hit_timestamps
]
df_data = {
    "token": [keyword for keyword, _ in keyword_hits],
    "Date": [hit_ts for _, hit_ts in keyword_hits],
}
df = pd.DataFrame(df_data)

# Start at the end of 2019: keep only hits after epoch second 1575158400
# (2019-12-01 00:00:00 UTC).
recent_hits = df["Date"] > 1575158400
df = df[recent_hits]

In [None]:
# Histogram of keyword hits over time, one colour per keyword.
fig, ax = plt.subplots(figsize=(15, 15))
sns.histplot(df, x="Date", hue="token", ax=ax).set_title("COVID Words")

# The x axis holds epoch seconds; relabel the ticks as YYYY-MM-DD dates.
# set_xticklabels binds labels to ticks by index, so the tick positions are
# pinned first. Otherwise the auto locator may pick different ticks on a later
# redraw and the date labels would end up on the wrong positions.
tick_positions = ax.get_xticks()
view_limits = ax.get_xlim()
ax.set_xticks(tick_positions)
# Pinning ticks can widen the view to include the outermost ticks; restore it.
ax.set_xlim(view_limits)
_ = ax.set_xticklabels(
    [datetime.datetime.fromtimestamp(ts).isoformat()[:10] for ts in tick_positions]
)