In [None]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import pandas as pd
import networkx as nx
import datetime
import psycopg2
import pickle
import os
import re

import warnings

warnings.filterwarnings("ignore")

%load_ext nb_black

In [None]:
def connect():
    """Open a new psycopg2 connection to the local ``venmo`` PostgreSQL database.

    The password is read from the ``POSTGRES_PASS`` environment variable and
    falls back to an empty string when the variable is unset. The caller owns
    the returned connection and is responsible for closing it.
    """
    settings = {
        "user": "postgres",
        "password": os.environ.get("POSTGRES_PASS", ""),
        "host": "localhost",
        "port": 5432,
        "database": "venmo",
    }
    return psycopg2.connect(**settings)


def reduce_graph(old_graph, f):
    """Build a new undirected graph from the edges of ``old_graph`` accepted by ``f``.

    ``f`` is called with each edge's attribute dict; every edge for which it
    returns a truthy value is copied, attributes included, into a fresh
    ``nx.Graph``. Nodes that end up without an accepted edge are not part of
    the result.
    """
    kept = nx.Graph()
    for u, v, attrs in old_graph.edges(data=True):
        if not f(attrs):
            continue
        kept.add_edge(u, v, **attrs)
    return kept


def find_coords(place, cache):
    """Return ``(latitude, longitude)`` for ``place``, or ``None`` if it cannot be geocoded.

    Args:
        place: Free-text place name handed to the geocoder.
        cache: Mutable mapping of place name -> geocoder result. It is
            consulted first and updated in place with every lookup outcome,
            including "not found" (``None``).

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the geocoder
        returned nothing for ``place``.

    Cache misses are resolved through the module-level ``geolocator`` object,
    which must exist by the time this function is called.
    """
    # Membership test rather than truthiness, so a cached "not found" (None)
    # is honoured instead of triggering another remote lookup.
    if place in cache:
        location = cache[place]
    else:
        location = geolocator.geocode(place)
        # Negative results are cached as well; otherwise the same unknown
        # place would hit the geocoding service on every call.
        cache[place] = location
    if location is None:
        return None
    return (location.latitude, location.longitude)


def parse_geo_tokens(geoparser, raw_msg, cache):
    """Geoparse a transaction message, memoising results by cleaned message text.

    Args:
        geoparser: Object exposing ``geoparse(text)``.
        raw_msg: Raw message text. Every character that is not a word
            character, a hyphen or a space is removed before parsing.
            ``None`` is treated like an empty message.
        cache: Mutable mapping of cleaned message -> geoparse result; updated
            in place.

    Returns:
        Whatever ``geoparser.geoparse`` returned for the cleaned message, or
        ``[]`` when nothing is left after cleaning.
    """
    if raw_msg is None:
        return []
    msg = re.sub(r"[^\w\d_\- ]", "", raw_msg).strip()
    if len(msg) == 0:
        return []
    # Membership test, not truthiness: an empty result list is a valid cached
    # answer and must not cause the message to be parsed again.
    if msg in cache:
        return cache[msg]
    result = geoparser.geoparse(msg)
    cache[msg] = result
    return result


# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open("user_id_to_loc.pkl", "rb") as f:
    user_id_to_loc_saved = pickle.load(f)
with open("geo_cache.pkl", "rb") as f:
    geo_cache = pickle.load(f)
# Iterating the mapping yields its keys, i.e. the user ids with a known location.
known_user_ids = set(user_id_to_loc_saved)
with open("covid_words.pkl", "rb") as f:
    covid_words = pickle.load(f)
# save_checkpoint() persists this cache to geoparser_cache.pkl, so reuse it
# when the file is present instead of starting empty on every run.
if os.path.exists("geoparser_cache.pkl"):
    with open("geoparser_cache.pkl", "rb") as f:
        geoparser_cache = pickle.load(f)
else:
    geoparser_cache = {}

In [None]:
graph = nx.Graph()

# One result row per unordered user pair (LEAST/GREATEST normalise the
# direction) with the ids, timestamps and messages of all their transactions
# created after 2020-02-01.
query = """
SELECT 
    LEAST(actor_user_id, recipient_id),
    GREATEST(actor_user_id, recipient_id),
    array_agg(id),
    array_agg(created),
    array_agg(message)
FROM 
    transactions
WHERE 
    created > '2020-02-01'
GROUP BY 
    GREATEST(actor_user_id, recipient_id),
    LEAST(actor_user_id, recipient_id)
"""

conn = connect()
try:
    # Named cursor: rows are fetched from the server in batches of `itersize`
    # while iterating instead of being loaded all at once.
    with conn.cursor(name="clusters") as cursor:
        cursor.itersize = 500
        cursor.execute(query)
        for i, (a, b, ids, createds, msgs) in enumerate(cursor):
            # Periodically dump the partial graph so a long run leaves
            # something on disk.
            if i % 1_000_000 == 0 and i != 0:
                print("Checkpoint @", i)
                with open("cluster_graph.pkl", "wb") as f:
                    pickle.dump(graph, f)
            graph.add_edge(a, b, weight=len(ids), dates=createds, msgs=msgs)

    with open("cluster_graph.pkl", "wb") as f:
        pickle.dump(graph, f)
finally:
    # Always release the connection, even when the query or a dump fails.
    conn.close()

In [None]:
# Reload the user-pair graph from cluster_graph.pkl (written by the
# graph-building cell).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("cluster_graph.pkl", "rb") as f:
    graph_saved = pickle.load(f)

In [None]:
def filter_edge(data, min_msgs=4):
    """Return True when an edge carries at least ``min_msgs`` messages.

    Args:
        data: Edge attribute dict containing a ``"msgs"`` sequence.
        min_msgs: Minimum number of messages required to keep the edge.
            Defaults to 4, the threshold used for the reduced graph.
    """
    return len(data["msgs"]) >= min_msgs


# Keep only the edges accepted by filter_edge; users left without such an edge
# do not appear in rgraph at all.
rgraph = reduce_graph(graph_saved, filter_edge)

In [None]:
# Summarise every connected component of the reduced graph as one "cluster"
# row: sizes, counts, all messages/dates and the members with a known location.
cluster_df_data = defaultdict(list)
sub_graphs = nx.connected_components(rgraph)

for i, sg in enumerate(sub_graphs):

    if i % 100_000 == 0:
        print("@ Subgraph", i)

    known_loc_overlap = sg & known_user_ids
    cluster = rgraph.subgraph(sg)

    msgs = []
    covid_cnt = edges = transactions = 0
    for _, _, edge_msgs in cluster.edges.data("msgs"):
        edges += 1
        transactions += len(edge_msgs)
        # One hit per (message, covid token) pair where the token occurs in the message.
        covid_cnt += sum(token in m for m in edge_msgs for token in covid_words)
        msgs.extend(edge_msgs)
    dates = [
        d for _, _, edge_dates in cluster.edges.data("dates") for d in edge_dates
    ]

    row = {
        "size": len(sg),
        "edges_cnt": edges,
        "transactions_cnt": transactions,
        "covid_cnt": covid_cnt,
        "msgs": msgs,
        "dates": dates,
        "known_overlap": len(known_loc_overlap),
        "known_overlap_ids": list(known_loc_overlap),
    }
    for column, value in row.items():
        cluster_df_data[column].append(value)

cluster_df = pd.DataFrame(cluster_df_data)
with open("cluster_data.pkl", "wb") as f:
    pickle.dump(cluster_df_data, f)

In [None]:
from mordecai import Geoparser
from geopy.geocoders import Nominatim

# Rebuild the cluster table from the pickled column dict and reload the
# user id -> location mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("cluster_data.pkl", "rb") as f:
    cluster_df = pd.DataFrame(pickle.load(f))
with open("user_id_to_loc.pkl", "rb") as f:
    user_id_to_loc_saved = pickle.load(f)

# `geo` is the geoparser passed to parse_geo_tokens; `geolocator` is the
# module-level geocoder that find_coords calls on a cache miss.
geo = Geoparser()
geolocator = Nominatim(user_agent="sshh12/venmo-research")

In [None]:
# Parallel accumulators for the geolocation loop: one entry per processed
# cluster row (row index, latitude, longitude, matched place words).
idxs, lats, lngs, place_tokens = [], [], [], []


def save_checkpoint():
    """Write the location results and both lookup caches to their pickle files.

    Dumps ``(idxs, lats, lngs, place_tokens)`` to ``cluster_locs_data.pkl``,
    ``geo_cache`` to ``geo_cache.pkl`` and ``geoparser_cache`` to
    ``geoparser_cache.pkl``. All of these are read from module-level state.
    """
    print("Saving...")
    targets = (
        ("cluster_locs_data.pkl", (idxs, lats, lngs, place_tokens)),
        ("geo_cache.pkl", geo_cache),
        ("geoparser_cache.pkl", geoparser_cache),
    )
    for path, payload in targets:
        with open(path, "wb") as f:
            pickle.dump(payload, f)


# Assign a (lat, lng) to every cluster:
#   1. if any member has a known location, use the first such member's;
#   2. otherwise, for clusters under 100 users, geoparse the messages and
#      geocode the most frequently mentioned admin1 region;
#   3. otherwise leave the location as None.
# The try/finally guarantees that progress is written out even when a geoparse
# or geocode call fails part-way through the run.
try:
    for idx, row in cluster_df.iterrows():
        size = row["size"]
        msgs = row["msgs"]
        known_ids = row["known_overlap_ids"]
        lat, lng, ptokens = None, None, []
        if len(known_ids) > 0:
            # Only the first known member is used; the third element of the
            # stored location tuple is not needed here.
            lat, lng, _ = user_id_to_loc_saved[known_ids[0]]
        elif size < 100:
            locs = []
            for m in msgs:
                locs.extend(parse_geo_tokens(geo, m, geoparser_cache))
            # (matched word, admin1 region) for every entity that was resolved.
            places = [
                (item["word"], item["geo"]["admin1"]) for item in locs if "geo" in item
            ]
            ptokens.extend(p[0] for p in places)
            if len(places) > 0:
                loc_name = Counter(p[1] for p in places).most_common(1)[0][0]
                loc_coords = find_coords(loc_name, geo_cache)
                if loc_coords is not None:
                    lat, lng = loc_coords
        lats.append(lat)
        lngs.append(lng)
        place_tokens.append(ptokens)
        idxs.append(idx)
        if idx % 10000 == 0 and idx > 0:
            save_checkpoint()
finally:
    save_checkpoint()

In [None]:
# Reassemble the located-cluster table from the checkpoint files: keep the rows
# that were processed (idxs) and attach their coordinates and place words.
with open("cluster_locs_data.pkl", "rb") as f:
    idxs, lats, lngs, place_tokens = pickle.load(f)
with open("cluster_data.pkl", "rb") as f:
    cluster_df = pd.DataFrame(pickle.load(f)).iloc[idxs]
with open("user_id_to_loc.pkl", "rb") as f:
    user_id_to_loc_saved = pickle.load(f)
for column, values in (
    ("lat", lats),
    ("lng", lngs),
    ("place_tokens", place_tokens),
):
    cluster_df[column] = values
# Rows with any missing value (in particular no lat/lng) are dropped.
cluster_df_clean = cluster_df.dropna()

In [None]:
"""
Total Clusters: 902451
Clusters w/latlng: 78613 8.711054672220431 %
Mean cluster size: 10.188238586493329
"""
# Headline numbers: all clusters, the share that received coordinates, and the
# mean size of those located clusters.
n_clusters = len(cluster_df)
n_located = len(cluster_df_clean)
print("Total Clusters:", n_clusters)
located_pct = n_located / n_clusters * 100
print("Clusters w/latlng:", n_located, located_pct, "%")
print("Mean cluster size:", cluster_df_clean["size"].mean())

In [None]:
# Histogram of located-cluster sizes; sizes above 80 are clipped to 80.
clipped_sizes = cluster_df_clean["size"].clip(lower=0, upper=80)
clipped_sizes.plot.hist(bins=80, title="Group Sizes")

In [None]:
# Histogram of per-cluster COVID token counts; counts above 4 are clipped to 4.
clipped_covid_cnt = cluster_df_clean["covid_cnt"].clip(lower=0, upper=4)
clipped_covid_cnt.plot.hist(
    title="COVID Token Counts Per Group", xticks=[0, 1, 2, 3, 4, 5]
)

In [None]:
# Full state/territory name -> two-letter postal abbreviation (the 50 states,
# the District of Columbia and five territories). Used to translate the state
# names in contiguous_usa into the abbreviations that key the COVID case table.
STATE_TO_ABBR = {
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

In [None]:
from shapely.geometry import Point
import geoplot.crs as gcrs
import geoplot as gplt
import geopandas as gpd

# Country polygons from geopandas' bundled sample data and state polygons from
# geoplot's bundled "contiguous_usa" dataset.
# NOTE(review): gpd.datasets is deprecated/removed in recent geopandas
# releases — confirm the pinned version still ships it.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
contiguous_usa = gpd.read_file(gplt.datasets.get_path("contiguous_usa"))
# Rows of `world` named "United States of America"; the first row's geometry is
# used below to keep only clusters located inside the USA.
usa = world[world.name == "United States of America"]

# Per-state cumulative COVID case counts up to the study cutoff.
cases_df = pd.read_csv("United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
# NOTE: timestamp() on a naive datetime is interpreted in the machine's local
# timezone, so the cutoff comparison below can shift by a day across timezones.
cases_df["Date"] = cases_df.submission_date.apply(
    lambda date: datetime.datetime.strptime(date, "%m/%d/%Y").timestamp()
)
# 1602720000 == 2020-10-15 00:00:00 UTC.
cases_df = cases_df[cases_df.Date < 1602720000]
# Sort chronologically first: groupby(...).last() returns the last row in frame
# order, which is only the most recent cumulative count if rows are in date
# order (the CSV is not guaranteed to be).
total_cases = cases_df.sort_values("Date").groupby("state")[["tot_cases"]].last()
contiguous_usa["abbr"] = contiguous_usa.state.apply(STATE_TO_ABBR.__getitem__)
contiguous_usa["covid_cases"] = contiguous_usa.abbr.apply(
    lambda a: total_cases.loc[a]["tot_cases"]
)

# Point GeoDataFrame of the located clusters (x = longitude, y = latitude).
gdf = gpd.GeoDataFrame(
    cluster_df_clean,
    geometry=gpd.points_from_xy(cluster_df_clean.lng, cluster_df_clean.lat),
)
# Keep only the clusters whose point lies within the first geometry of `usa`;
# .copy() detaches the result from gdf.
usa_gdf = gdf.loc[gdf.within(usa.iloc[0]["geometry"])].copy()

# Aggregate cluster statistics per state. All accumulators are indexed by the
# row *position* of the state in contiguous_usa.
regions = len(contiguous_usa)
cluster_cnts = [0] * regions
covid_cnts = [0] * regions
sum_size = [0] * regions
sum_trans = [0] * regions
trans_dates = [[] for _ in range(regions)]
verified_trans_dates = [[] for _ in range(regions)]
# Materialise the state polygons once; enumerating them gives a true positional
# index regardless of the frame's index labels.
state_geoms = list(contiguous_usa["geometry"])
for idx, row in usa_gdf.iterrows():
    # Very large components are excluded from the per-state statistics.
    if row["size"] > 1000:
        continue
    # Built once per cluster rather than once per (cluster, state) pair.
    point = Point(row["lng"], row["lat"])
    state_match = next(
        (pos for pos, geom in enumerate(state_geoms) if geom.contains(point)),
        None,
    )
    if state_match is None:
        continue
    cluster_cnts[state_match] += 1
    covid_cnts[state_match] += row["covid_cnt"]
    sum_size[state_match] += row["size"]
    sum_trans[state_match] += row["transactions_cnt"]
    trans_dates[state_match].extend(row["dates"])
    # "Verified" = the cluster contains at least one user with a known location.
    if row["known_overlap"] > 0:
        verified_trans_dates[state_match].extend(row["dates"])

contiguous_usa["cluster_cnts"] = cluster_cnts
contiguous_usa["covid_cnts"] = covid_cnts
contiguous_usa["sum_size"] = sum_size
contiguous_usa["sum_trans"] = sum_trans
contiguous_usa["trans_dates"] = trans_dates
contiguous_usa["verified_trans_dates"] = verified_trans_dates

# Persist the verified transaction dates keyed by state name, then report the
# share of located clusters that fall inside the USA.
cluster_trans_by_state = defaultdict(list)
for state_name, verified_dates in zip(
    contiguous_usa["state"], contiguous_usa["verified_trans_dates"]
):
    cluster_trans_by_state[state_name].extend(verified_dates)
with open("transactions_by_state_cluster.pkl", "wb") as f:
    pickle.dump(cluster_trans_by_state, f)

usa_share = round(len(usa_gdf) / len(gdf) * 100)
print(usa_share, "% in USA")

In [None]:
# Per-state ratios derived from the aggregated counts. States with zero
# clusters yield NaN/inf in the per-cluster columns rather than raising.
contiguous_usa["avg_clust_size"] = (
    contiguous_usa["sum_size"] / contiguous_usa["cluster_cnts"]
)
contiguous_usa["avg_clust_trans"] = (
    contiguous_usa["sum_trans"] / contiguous_usa["cluster_cnts"]
)
contiguous_usa["covidtok_per_cluster"] = (
    contiguous_usa["covid_cnts"] / contiguous_usa["cluster_cnts"]
)
contiguous_usa["covidtok_per_pop"] = (
    contiguous_usa["covid_cnts"] / contiguous_usa["population"]
)
contiguous_usa["cases_per_pop"] = (
    contiguous_usa["covid_cases"] / contiguous_usa["population"]
)
contiguous_usa["clusters_per_pop"] = (
    contiguous_usa["cluster_cnts"] / contiguous_usa["population"]
)
contiguous_usa["covidtok_per_cases"] = (
    contiguous_usa["covid_cnts"] / contiguous_usa["covid_cases"]
)
# Restrict to numeric columns explicitly: the frame also holds geometry, string
# and list-valued columns, and newer pandas no longer drops those silently in
# corr() but raises instead.
state_corr = contiguous_usa.select_dtypes(include="number").corr()
state_corr

In [None]:
# Draw a web-map basemap from contiguous_usa and overlay the USA clusters in
# usa_gdf as points on the same axes.
ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator(), figsize=(16, 16))
gplt.pointplot(usa_gdf, ax=ax).set_title("Venmo Clusters")

In [None]:
def _state_choropleth(hue, cmap, title, **extra):
    """Draw one state choropleth of ``contiguous_usa[hue]`` and set its title.

    The three maps share projection, line width, legend placement and the
    "FisherJenks" classification scheme; ``extra`` carries any additional
    keyword arguments for ``gplt.choropleth``. Returns the value of
    ``set_title``.
    """
    return gplt.choropleth(
        contiguous_usa,
        hue=hue,
        projection=gcrs.AlbersEqualArea(),
        linewidth=1,
        cmap=cmap,
        legend=True,
        scheme="FisherJenks",
        legend_kwargs={"loc": "lower left"},
        **extra,
    ).set_title(title)


# Only the first map passes an explicit edge colour.
_state_choropleth("covid_cases", "Purples", "Cumulative COVID Cases", edgecolor="white")
_state_choropleth("covid_cnts", "Greens", "Cumulative COVID Keyword Mentions")
_state_choropleth("avg_clust_size", "Blues", "Mean Group Size")