In [None]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import pandas as pd
import networkx as nx
import datetime
import psycopg2
import pickle
import os
import re

import warnings

# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and data-quality
# warnings raised by the libraries used below — consider a narrower filter.
warnings.filterwarnings("ignore")

%load_ext nb_black

In [None]:
def connect():
    """Open a new psycopg2 connection to the local ``venmo`` database.

    The password is taken from the ``POSTGRES_PASS`` environment variable and
    falls back to an empty string when the variable is not set. The caller owns
    the returned connection and is responsible for closing it.
    """
    settings = {
        "user": "postgres",
        "password": os.environ.get("POSTGRES_PASS", ""),
        "host": "localhost",
        "port": 5432,
        "database": "venmo",
    }
    return psycopg2.connect(**settings)


def reduce_graph(old_graph, f):
    """Return a new undirected graph holding only the edges accepted by ``f``.

    ``f`` is called with each edge's attribute dict; edges for which it returns
    a truthy value are copied (with their attributes) into the result. Nodes
    are only added through their surviving edges, so nodes left without any
    edge do not appear in the returned graph.
    """
    kept_graph = nx.Graph()
    surviving_edges = (
        (u, v, attrs) for u, v, attrs in old_graph.edges(data=True) if f(attrs)
    )
    for u, v, attrs in surviving_edges:
        kept_graph.add_edge(u, v, **attrs)
    return kept_graph


def find_coords(place, cache):
    """Resolve ``place`` to a ``(latitude, longitude)`` tuple, using ``cache``.

    A truthy cached entry is used directly. Otherwise the module-level
    ``geolocator`` is queried (it must be defined before this is called); a
    successful result is stored in ``cache`` under ``place``. Returns ``None``
    when the geocoder finds nothing — such misses are not cached.
    """
    location = cache.get(place)
    if not location:
        location = geolocator.geocode(place)
        if location is None:
            return None
        cache[place] = location
    return (location.latitude, location.longitude)


def parse_geo_tokens(geoparser, raw_msg):
    """Run ``geoparser.geoparse`` on a cleaned-up transaction message.

    Every character that is not a word character, hyphen or space is removed
    and the result is stripped of surrounding whitespace before parsing.

    Args:
        geoparser: object exposing ``geoparse(text)``.
        raw_msg: the raw message text; may be ``None`` (e.g. a NULL message
            coming out of the SQL aggregation) or empty.

    Returns:
        Whatever ``geoparser.geoparse`` returns for the cleaned message, or an
        empty list when there is nothing left to parse. The geoparser is not
        called in that case.
    """
    # Missing messages used to reach re.sub and raise TypeError.
    if not raw_msg:
        return []
    msg = re.sub(r"[^\w\d_\- ]", "", raw_msg).strip()
    if not msg:
        return []
    return geoparser.geoparse(msg)

In [None]:
# Build an undirected graph with one edge per pair of users that transacted
# after 2020-03-10. LEAST/GREATEST normalise the pair so that payments in both
# directions are grouped onto the same edge.
graph = nx.Graph()

query = """
SELECT 
    LEAST(actor_user_id, recipient_id),
    GREATEST(actor_user_id, recipient_id),
    array_agg(id),
    array_agg(created),
    array_agg(message)
FROM 
    transactions
WHERE 
    created > '2020-03-10'
GROUP BY 
    GREATEST(actor_user_id, recipient_id),
    LEAST(actor_user_id, recipient_id)
"""

GRAPH_PICKLE_PATH = "cluster_graph.pkl"
CHECKPOINT_EVERY = 1_000_000


def _save_graph(g, path=GRAPH_PICKLE_PATH):
    """Pickle ``g`` to ``path`` without ever leaving a half-written file there.

    The graph is first dumped to ``path + ".tmp"`` and then moved over the
    target, so an interrupted dump cannot corrupt the previous checkpoint.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(g, f)
    os.replace(tmp_path, path)


conn = connect()
try:
    # A named cursor lets psycopg2 stream the result set from the server
    # (``itersize`` rows per round trip) instead of loading it all at once.
    with conn.cursor(name="clusters") as cursor:
        cursor.itersize = 500
        cursor.execute(query)
        for i, (a, b, ids, createds, msgs) in enumerate(cursor):
            # Periodic checkpoint so a long import can be inspected/resumed.
            if i % CHECKPOINT_EVERY == 0 and i != 0:
                print("Checkpoint @", i)
                _save_graph(graph)
            graph.add_edge(a, b, weight=len(ids), dates=createds, msgs=msgs)
finally:
    # Always release the connection, even when the fetch loop fails midway.
    conn.close()

_save_graph(graph)

In [None]:
# Reload the pickled pair graph so later cells do not need the database.
with open("cluster_graph.pkl", "rb") as graph_file:
    graph_saved = pickle.load(graph_file)

In [None]:
# Lookup tables saved on disk by earlier work: the location mapping keyed by
# user id and the geocoder result cache passed to find_coords.
with open("user_id_to_loc.pkl", "rb") as loc_file:
    user_id_to_loc_saved = pickle.load(loc_file)
with open("geo_cache.pkl", "rb") as cache_file:
    geo_cache = pickle.load(cache_file)

# Set form of the mapping's keys, used for fast intersection with clusters.
known_user_ids = {*user_id_to_loc_saved}

# Substrings searched for in transaction messages. The last two entries are
# deliberate stems so that both "isolate" and "isolating" match.
COVID_WORDS = [
    # illness / diagnosis
    "diagnosed",
    "pneumonia",
    "coronavirus",
    "fever",
    "covid",
    # behaviour
    "isolating",
    "quarantine",
    # symptoms
    "cough",
    "sick",
    # distancing
    "social distancing",
    "self isolat",
    "self-isolat",
]

In [None]:
def filter_edge(data, min_msgs=4):
    """Tell whether an edge carries enough messages to be kept.

    Args:
        data: edge attribute dict; its ``"msgs"`` entry is the collection of
            messages exchanged over that edge.
        min_msgs: minimum number of messages required. Defaults to 4, the
            threshold this notebook has always used, so the function can still
            be passed directly as a one-argument predicate.

    Returns:
        ``True`` when the edge has at least ``min_msgs`` messages.
    """
    return len(data["msgs"]) >= min_msgs


# Keep only the edges accepted by filter_edge.
rgraph = reduce_graph(old_graph=graph_saved, f=filter_edge)

In [None]:
# Summarise every connected component ("cluster") of the reduced graph:
# its size, edge/transaction counts, COVID keyword hits, all messages and the
# members whose location is already known.
sub_graphs = nx.connected_components(rgraph)
cluster_df_data = defaultdict(list)

for i, sg in enumerate(sub_graphs):

    if i % 100_000 == 0:
        print("@ Subgraph", i)

    known_loc_overlap = sg & known_user_ids
    cluster = rgraph.subgraph(sg)

    msgs = []
    covid_cnt = 0
    edges = 0
    transactions = 0
    for _, _, edge_msgs in cluster.edges.data("msgs"):
        edges += 1
        transactions += len(edge_msgs)
        for m in edge_msgs:
            # A missing message still counts as a transaction but cannot
            # contain a keyword (and `token in None` would raise TypeError).
            if not m:
                continue
            # COVID_WORDS are lower-case, so compare against the lower-cased
            # message; otherwise "COVID" or "Quarantine" would be missed.
            lowered = m.lower()
            # One hit per matching keyword, so a message can count more than once.
            covid_cnt += sum(token in lowered for token in COVID_WORDS)
        msgs.extend(edge_msgs)

    cluster_df_data["size"].append(len(sg))
    cluster_df_data["edges_cnt"].append(edges)
    cluster_df_data["transactions_cnt"].append(transactions)
    cluster_df_data["covid_cnt"].append(covid_cnt)
    cluster_df_data["msgs"].append(msgs)
    cluster_df_data["known_overlap"].append(len(known_loc_overlap))
    cluster_df_data["known_overlap_ids"].append(list(known_loc_overlap))

cluster_df = pd.DataFrame(cluster_df_data)
# The raw column dict (not the DataFrame) is what gets persisted.
with open("cluster_data.pkl", "wb") as f:
    pickle.dump(cluster_df_data, f)

In [None]:
from mordecai import Geoparser
from geopy.geocoders import Nominatim

# Rebuild the cluster table from the pickled column dict.
with open("cluster_data.pkl", "rb") as cluster_file:
    cluster_columns = pickle.load(cluster_file)
cluster_df = pd.DataFrame(cluster_columns)

with open("user_id_to_loc.pkl", "rb") as loc_file:
    user_id_to_loc_saved = pickle.load(loc_file)

# `geo` is handed to parse_geo_tokens; `geolocator` is the module-level
# geocoder that find_coords reads.
geo = Geoparser()
geolocator = Nominatim(user_agent="sshh12/venmo-research")

In [None]:
# Assign one (lat, lng) to every cluster, in cluster_df row order:
#   1. if a member has a known location, use the first such member's location;
#   2. otherwise, for small clusters only, geoparse the messages and geocode
#      the most frequently mentioned admin1 region;
#   3. otherwise leave the coordinates as None.
# NOTE(review): find_coords adds new lookups to geo_cache, but this cell does
# not write the cache back to geo_cache.pkl.
MAX_GEOPARSE_CLUSTER_SIZE = 100  # larger clusters are not geoparsed

lats = []
lngs = []
place_tokens = []
cluster_rows = zip(
    cluster_df["size"], cluster_df["msgs"], cluster_df["known_overlap_ids"]
)
for size, msgs, known_ids in cluster_rows:
    lat, lng, ptokens = None, None, []
    if len(known_ids) > 0:
        # Only the first known member is used; no need to look up the rest.
        lat, lng, _ = user_id_to_loc_saved[known_ids[0]]
    elif size < MAX_GEOPARSE_CLUSTER_SIZE:
        locs = []
        for m in msgs:
            locs.extend(parse_geo_tokens(geo, m))
        # Keep only tokens the geoparser could resolve to a place.
        places = [
            (item["word"], item["geo"]["admin1"]) for item in locs if "geo" in item
        ]
        ptokens.extend(word for word, _ in places)
        if places:
            admin1_counts = Counter(admin1 for _, admin1 in places)
            loc_name = admin1_counts.most_common(1)[0][0]
            loc_coords = find_coords(loc_name, geo_cache)
            if loc_coords is not None:
                lat, lng = loc_coords
    lats.append(lat)
    lngs.append(lng)
    place_tokens.append(ptokens)

with open("cluster_locs_data.pkl", "wb") as f:
    pickle.dump((lats, lngs, place_tokens), f)

In [None]:
# Reload the cluster table and the per-cluster coordinates, then join them.
with open("cluster_data.pkl", "rb") as cluster_file:
    cluster_columns = pickle.load(cluster_file)
with open("cluster_locs_data.pkl", "rb") as locs_file:
    lats, lngs, place_tokens = pickle.load(locs_file)
with open("user_id_to_loc.pkl", "rb") as loc_file:
    user_id_to_loc_saved = pickle.load(loc_file)

# The three lists are positional: entry k belongs to row k of the table.
cluster_df = pd.DataFrame(cluster_columns).assign(
    lat=lats,
    lng=lngs,
    place_tokens=place_tokens,
)

# dropna() has no subset, so a row is dropped when any column is missing;
# lat/lng are missing for clusters that could not be located.
cluster_df_clean = cluster_df.dropna()

In [None]:
# Located clusters, all clusters, and the located share in percent.
located_cnt = len(cluster_df_clean)
total_cnt = len(cluster_df)
print(located_cnt, total_cnt, located_cnt / total_cnt * 100)

# Average number of users per cluster (over all clusters).
print(cluster_df["size"].mean())

In [None]:
# Full name -> two-letter postal abbreviation for the 50 states, the District
# of Columbia and five territories, in alphabetical order of the full name.
STATE_TO_ABBR = {
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


In [None]:
from shapely.geometry import Point
import geoplot.crs as gcrs
import geoplot as gplt
import geopandas as gpd

# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases —
# confirm the pinned version still ships it.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
contiguous_usa = gpd.read_file(gplt.datasets.get_path("contiguous_usa"))
usa = world[world.name == "United States of America"]

# NOTE(review): .last() keeps whichever row comes last per state in the CSV;
# this is only the latest total if the file is ordered by date — confirm.
total_cases = (
    pd.read_csv("United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
    .groupby("state")[["tot_cases"]]
    .last()
)
# __getitem__ (rather than .get) makes an unmapped state name fail loudly.
contiguous_usa["abbr"] = contiguous_usa.state.apply(STATE_TO_ABBR.__getitem__)
contiguous_usa["total_covid_cases"] = contiguous_usa.abbr.apply(
    lambda a: total_cases.loc[a]["tot_cases"]
)

gdf = gpd.GeoDataFrame(
    cluster_df_clean,
    geometry=gpd.points_from_xy(cluster_df_clean.lng, cluster_df_clean.lat),
)

# Keep only clusters that fall inside the world dataset's USA outline.
usa_gdf = gdf.loc[gdf.within(usa.iloc[0]["geometry"])].copy()

# Tally clusters and COVID keyword hits per state. The state shapes are pulled
# out once and addressed by position, so the tallies line up with the rows of
# contiguous_usa whatever its index labels are.
state_shapes = list(contiguous_usa["geometry"])
cluster_cnts = [0] * len(state_shapes)
covid_cnts = [0] * len(state_shapes)
for _, row in usa_gdf.iterrows():
    # Built once per cluster instead of once per (cluster, state) pair.
    point = Point(row["lng"], row["lat"])
    state_match = next(
        (pos for pos, shape in enumerate(state_shapes) if shape.contains(point)),
        None,
    )
    if state_match is None:
        # Inside the USA outline but in none of the contiguous_usa shapes.
        continue
    cluster_cnts[state_match] += 1
    covid_cnts[state_match] += row["covid_cnt"]
contiguous_usa["cluster_cnts"] = cluster_cnts
contiguous_usa["covid_cnts"] = covid_cnts
# States without any cluster divide 0 by 0 here — presumably NaN; verify the
# choropleth handles that.
contiguous_usa["mentions_per_cluster"] = (
    contiguous_usa["covid_cnts"] / contiguous_usa["cluster_cnts"]
)
contiguous_usa["cases_per_population"] = (
    contiguous_usa["total_covid_cases"] / contiguous_usa["population"]
)

print(round(len(usa_gdf) / len(gdf) * 100), "% in USA")

In [None]:
# Draw the located US clusters as points on a geoplot webmap of the states.
webmap_kwargs = {"projection": gcrs.WebMercator(), "figsize": (16, 16)}
ax = gplt.webmap(contiguous_usa, **webmap_kwargs)
gplt.pointplot(usa_gdf, ax=ax).set_title("Venmo Clusters")

In [None]:
# One choropleth per per-state metric. The four maps differ only in the column
# shown, the colour map and the title, so they are driven from one table.
CHOROPLETH_SPECS = [
    # (column in contiguous_usa, colormap, title)
    ("covid_cnts", "Reds", "COVID Mentions"),
    ("cluster_cnts", "Blues", "Venmo Clusters"),
    ("mentions_per_cluster", "Purples", "COVID Mentions Per Venmo Cluster"),
    ("cases_per_population", "Greys", "Cases Per Population"),
]

for hue, cmap, title in CHOROPLETH_SPECS:
    choropleth_ax = gplt.choropleth(
        contiguous_usa,
        hue=hue,
        projection=gcrs.AlbersEqualArea(),
        edgecolor="white",
        linewidth=1,
        cmap=cmap,
        legend=True,
        scheme="FisherJenks",
    )
    choropleth_ax.set_title(title)