In [None]:
from matplotlib import pyplot as plt
from IPython.display import display, HTML
from collections import Counter
import seaborn as sns
import pandas as pd
import datetime
import random
import psycopg2
import pickle
import requests
import math
import os
import re

import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the plotting/data libraries.
warnings.filterwarnings("ignore")

# Loads the nb_black IPython extension (presumably auto-formats cells with
# Black -- the package must be installed in the kernel environment).
%load_ext nb_black

In [None]:
def connect():
    """Open a new psycopg2 connection to the local ``venmo`` Postgres database.

    The password is taken from the ``POSTGRES_PASS`` environment variable and
    falls back to an empty string when that variable is unset.  The connection
    is returned open; the caller is responsible for closing it.
    """
    connection_settings = {
        "user": "postgres",
        "password": os.environ.get("POSTGRES_PASS", ""),
        "host": "localhost",
        "port": 5432,
        "database": "venmo",
    }
    return psycopg2.connect(**connection_settings)

In [None]:
# Substrings that are searched for in transaction messages.  Every entry must
# be unique: the scan loop iterates this list, so a repeated entry makes each
# matching message count twice for that token.
COVID_WORDS = [
    # Machine Learning to Detect Self-Reporting of Symptoms, Testing Access, and Recovery Associated With COVID-19 on Twitter
    "covid",
    "diagnosed",
    "pneumonia",
    "coronavirus",
    "fever",
    "test",
    "symptoms",
    "isolating",
    "cough",
    "emergency room",
    # Extras ("isolating" is already listed above and is intentionally not repeated)
    "quarantine",
    "sick",
    "social distancing",
    "self isolat",
    "mask",
]
# Fail fast if a duplicate is ever re-introduced
assert len(COVID_WORDS) == len(set(COVID_WORDS)), "duplicate entry in COVID_WORDS"

# Persist the word list so it can be reloaded without re-running this cell
with open("covid_words.pkl", "wb") as f:
    pickle.dump(COVID_WORDS, f)

In [None]:
# Rows between intermediate checkpoints
CHECKPOINT_EVERY = 2_000_000
# Strips everything except word characters, "-" and spaces (compiled once
# instead of once per row)
MSG_CLEAN_RE = re.compile(r"[^\w\d_\- ]")

# Running totals: "msgs", "msgs_processed", "msgs_ts_processed", "covid_tokens_found"
meta = Counter()

# Per-token accumulators, keyed by the entries of COVID_WORDS
covid_token_usage = {}  # token -> unix timestamps (seconds) of matching messages
covid_token_refs = {}  # token -> Counter of words seen in matching messages
covid_token_examples = {}  # token -> original (uncleaned) matching messages
for w in COVID_WORDS:
    covid_token_usage[w] = []
    covid_token_examples[w] = []
    covid_token_refs[w] = Counter()


def _save_covid_checkpoint():
    """Pickle the accumulators to covid_tokens.pkl and the counters to covid_meta.pkl."""
    with open("covid_tokens.pkl", "wb") as f:
        pickle.dump((covid_token_usage, covid_token_refs, covid_token_examples), f)
    with open("covid_meta.pkl", "wb") as f:
        pickle.dump(meta, f)


conn = connect()
try:
    # A named cursor is used so rows are fetched in batches of `itersize`
    with conn.cursor(name="covid_exploration") as cursor:
        cursor.itersize = 2000
        cursor.execute("SELECT * FROM transactions")
        for i, row in enumerate(cursor):

            if i % CHECKPOINT_EVERY == 0:
                # checkpoint
                print("Row", i)
                _save_covid_checkpoint()

            # NOTE(review): columns are addressed by position; presumably
            # row[1] is the message text and row[4] the creation time --
            # confirm against the transactions schema.
            raw_msg = row[1]
            if not isinstance(raw_msg, str):
                # Missing / non-text message: skipped and not counted in "msgs"
                continue
            msg = MSG_CLEAN_RE.sub("", raw_msg).strip().replace("-", " ")
            meta["msgs"] += 1
            if len(msg) == 0:
                continue
            meta["msgs_processed"] += 1

            try:
                ts = int(row[4].timestamp())
            except (AttributeError, TypeError, ValueError, OverflowError, OSError):
                # No usable timestamp on this row; skip it but let real
                # interrupts (e.g. KeyboardInterrupt) propagate
                continue
            meta["msgs_ts_processed"] += 1

            # Iterate the dict keys rather than COVID_WORDS so that a repeated
            # entry in the list cannot double-count a message.
            # NOTE(review): this is a case-sensitive substring test, so e.g.
            # "COVID" is not matched and "test" also matches "latest".
            for token in covid_token_usage:
                if token in msg:
                    meta["covid_tokens_found"] += 1
                    covid_token_usage[token].append(ts)
                    covid_token_examples[token].append(raw_msg)
                    refs = covid_token_refs[token]
                    for word in msg.split(" "):
                        refs[word] += 1

    # Final save: without it the pickles only reflect the last multiple of
    # CHECKPOINT_EVERY rows and the tail of the table is lost.
    _save_covid_checkpoint()
finally:
    # Always release the connection, even if the scan fails part-way
    conn.close()

In [None]:
# Reload the pickles written by the scan cell so the analysis below can run
# without repeating the database scan.
with open("covid_tokens.pkl", "rb") as tokens_file:
    saved_token_state = pickle.load(tokens_file)
(
    covid_token_usage_saved,
    covid_token_refs_saved,
    covid_token_examples_saved,
) = saved_token_state

with open("covid_meta.pkl", "rb") as meta_file:
    meta_saved = pickle.load(meta_file)

# Display the saved counters
meta_saved

In [None]:
# Share of counted messages that also yielded a usable timestamp
# (71% when this was recorded)
meta_saved["msgs_ts_processed"] / meta_saved["msgs"]

In [None]:
# Token hits per counted message (0.016% when this was recorded).
# "covid_tokens_found" is incremented once per matching token, so a message
# containing several tokens contributes more than once.
meta_saved["covid_tokens_found"] / meta_saved["msgs"]

In [None]:
# 20 most frequent words seen alongside each token -- nothing too interesting here
for covid_word, word_counts in covid_token_refs_saved.items():
    print(covid_word, word_counts.most_common(20), sep="\n")

In [None]:
# Flatten {token: [timestamps]} into one (token, Date) row per occurrence
token_column = []
date_column = []
for covid_word, timestamps in covid_token_usage_saved.items():
    token_column.extend([covid_word] * len(timestamps))
    date_column.extend(timestamps)
df_data = {"token": token_column, "Date": date_column}
df = pd.DataFrame(df_data)

# Keep occurrences strictly inside the study window, starting at the end of 2019
# (epoch seconds: 1575158400 is 2019-12-01 and 1602720000 is 2020-10-15, 00:00 UTC)
in_window = (df["Date"] > 1575158400) & (df["Date"] < 1602720000)
df = df[in_window]

In [None]:
# Daily US case counts (summed over all states) with a 7-day rolling mean.
#
# The dates are parsed *before* grouping: groupby sorts its keys, and sorting
# the raw "MM/DD/YYYY" strings is only chronological within a single calendar
# year, which would feed the rolling window rows in the wrong order as soon as
# the file spans more than one year.
cases_df = (
    pd.read_csv("United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")[
        ["submission_date", "new_case"]
    ]
    .assign(
        submission_date=lambda frame: pd.to_datetime(
            frame["submission_date"], format="%m/%d/%Y"
        )
    )
    .groupby("submission_date")
    .sum()
)
cases_df["US Daily Cases"] = cases_df["new_case"].rolling(7).mean()
# The first six days have no complete 7-day window and are dropped
cases_df = cases_df.dropna().reset_index()
# Epoch seconds for the x-axis.  The conversion goes through a naive
# datetime.datetime so it is interpreted in local time, exactly like the
# previous strptime(...).timestamp() call.
cases_df["Date"] = cases_df["submission_date"].apply(
    lambda day: day.to_pydatetime().timestamp()
)
# Same upper bound (2020-10-15 00:00 UTC) that is applied to the token timestamps
cases_df = cases_df[cases_df["Date"] < 1602720000]

In [None]:
# 2x2 grid: one histogram panel per group of related tokens, each overlaid
# with the rolling US daily case curve on a secondary y-axis.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
fig.suptitle("COVID-19 Tokens In Transactions", fontsize=16)

panel_axes = [ax1, ax2, ax3, ax4]
panel_tokens = [
    ["covid", "coronavirus"],
    ["quarantine"],
    ["cough", "pneumonia", "fever", "symptoms"],
    ["self isolat", "isolating", "social distancing"],
]
for panel_ax, tokens_in_panel in zip(panel_axes, panel_tokens):
    sns.histplot(
        df[df["token"].isin(tokens_in_panel)],
        x="Date",
        hue="token",
        ax=panel_ax,
    )

# Reuse the first panel's tick positions everywhere and label them as dates
ticks = ax1.get_xticks()
tick_labels = [datetime.datetime.fromtimestamp(ts).isoformat()[:10] for ts in ticks]
for panel_ax in panel_axes:
    panel_ax.set_xticks(ticks)
    panel_ax.set_xticklabels(tick_labels)
    sns.lineplot(
        data=cases_df,
        x="Date",
        y="US Daily Cases",
        ax=panel_ax.twinx(),
        color="red",
    )
fig.tight_layout()

In [None]:
TOKENS_OF_INTEREST = ["social distancing", "quarantine", "covid", "cough"]
K = 5  # number of example messages sampled per token


def pretty_print(df):
    """Display ``df`` as an HTML table with line breaks between joined examples.

    Every literal backslash-``n`` sequence in the generated HTML is replaced
    with ``<br>`` (presumably this is how ``to_html`` renders the newlines
    embedded in a cell -- confirm against the installed pandas version).

    Returns whatever ``IPython.display.display`` returns.
    """
    return display(HTML(df.to_html().replace("\\n", "<br>")))


# One row per token: K randomly sampled original messages joined by newlines
df_data = {"phrase": [], "examples": []}
for token in TOKENS_OF_INTEREST:
    # .get() keeps the cell usable with a checkpoint that lacks a token
    token_examples = covid_token_examples_saved.get(token, [])
    if len(token_examples) < K:
        # Not enough examples to sample K of them; leave the token out
        continue
    sample_usage = random.sample(token_examples, K)
    df_data["phrase"].append(token)
    df_data["examples"].append("\n".join(sample_usage))
# NOTE(review): this rebinds `df`, which an earlier cell uses for the
# token/timestamp frame; re-run that cell before re-running the plots.
df = pd.DataFrame(df_data)

# set_index returns a new frame, so the previous bare `df.set_index("phrase")`
# statement had no effect; apply it to the frame that is actually displayed.
pretty_print(df.set_index("phrase"))