In [None]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import pandas as pd
import networkx as nx
import datetime
import psycopg2
import pickle
import os
import re

import warnings

# Suppress every warning for the rest of the session. No category or module
# filter is given, so this hides all warnings, including deprecations.
warnings.filterwarnings("ignore")

# IPython magic: load the nb_black extension.
# NOTE(review): presumably this auto-formats cells with black on execution -- confirm.
%load_ext nb_black

In [None]:
def connect():
    """Open and return a new psycopg2 connection to the local ``venmo`` database.

    The connection targets ``localhost:5432`` as user ``postgres``. The
    password is taken from the ``POSTGRES_PASS`` environment variable, and an
    empty string is used when that variable is not set.

    Returns
    -------
    The object returned by ``psycopg2.connect``. This function does not close
    it; the caller is responsible for doing so.
    """
    settings = {
        "user": "postgres",
        "password": os.environ.get("POSTGRES_PASS", ""),
        "host": "localhost",
        "port": 5432,
        "database": "venmo",
    }
    return psycopg2.connect(**settings)


def reduce_graph(old_graph, f):
    """Build a new undirected graph containing only the edges accepted by ``f``.

    Parameters
    ----------
    old_graph : graph-like
        Source graph; iterated via ``old_graph.edges(data=True)``.
    f : callable
        Predicate called with each edge's attribute dict. The edge is kept
        when the result is truthy.

    Returns
    -------
    nx.Graph
        A fresh graph holding the kept edges and their attributes. Nodes are
        only present if at least one of their edges was kept.
    """
    # Lazy filter: the predicate still runs edge by edge, interleaved with insertion.
    kept_edges = (
        (u, v, attrs) for u, v, attrs in old_graph.edges(data=True) if f(attrs)
    )
    filtered = nx.Graph()
    for u, v, attrs in kept_edges:
        filtered.add_edge(u, v, **attrs)
    return filtered


def find_coords(place, cache, geocoder=None):
    """Return ``(latitude, longitude)`` for ``place``, consulting ``cache`` first.

    Parameters
    ----------
    place : hashable
        Query handed to the geocoder; also used as the cache key.
    cache : dict-like
        Maps a place to the location object previously returned for it.
        Updated in place after a successful lookup.
    geocoder : object, optional
        Any object exposing ``geocode(place)`` that returns either ``None`` or
        an object with ``latitude`` and ``longitude`` attributes. When omitted,
        the notebook-level global ``geolocator`` is used, which must have been
        defined before the call.

    Returns
    -------
    tuple or None
        ``(latitude, longitude)``, or ``None`` when the geocoder finds nothing.
        Failed lookups are not cached, so they are retried on the next call.
    """
    found = cache.get(place)
    if found:
        return (found.latitude, found.longitude)

    if geocoder is None:
        # Backward-compatible default: fall back to the global the original
        # implementation relied on implicitly.
        geocoder = geolocator

    location = geocoder.geocode(place)
    if location is None:
        return None
    cache[place] = location
    return (location.latitude, location.longitude)


def parse_geo_tokens(geoparser, raw_msg):
    """Sanitise a message and run it through ``geoparser.geoparse``.

    Every character that is not a word character, hyphen or space is removed,
    and the result is stripped of surrounding whitespace before parsing.

    Parameters
    ----------
    geoparser : object
        Anything exposing ``geoparse(text)``.
    raw_msg : str or None
        The raw message. ``None`` (for example a SQL NULL collected by
        ``array_agg``) is treated as an empty message instead of raising
        ``TypeError`` inside ``re.sub``.

    Returns
    -------
    The value returned by ``geoparser.geoparse`` for the cleaned message, or
    an empty list when nothing is left to parse.
    """
    if raw_msg is None:
        return []
    msg = re.sub(r"[^\w\d_\- ]", "", raw_msg).strip()
    if not msg:
        # Nothing but punctuation/whitespace: skip the parser call.
        return []
    return geoparser.geoparse(msg)

In [None]:
# Build an undirected graph with one edge per pair of users who transacted
# after 2020-03-10. LEAST/GREATEST normalise the pair so (a, b) and (b, a)
# fall into the same group. Each edge carries the transaction count
# (``weight``), the creation timestamps (``dates``) and the messages (``msgs``).
graph = nx.Graph()

query = """
SELECT 
    LEAST(actor_user_id, recipient_id),
    GREATEST(actor_user_id, recipient_id),
    array_agg(id),
    array_agg(created),
    array_agg(message)
FROM 
    transactions
WHERE 
    created > '2020-03-10'
GROUP BY 
    GREATEST(actor_user_id, recipient_id),
    LEAST(actor_user_id, recipient_id)
"""

conn = connect()
try:
    # A named cursor is used so rows are streamed rather than loaded at once;
    # ``itersize`` sets how many rows are fetched per batch while iterating.
    with conn.cursor(name="clusters") as cursor:
        cursor.itersize = 500
        cursor.execute(query)
        for i, (a, b, ids, createds, msgs) in enumerate(cursor):
            # Periodic checkpoint so a long run can be inspected or salvaged.
            if i % 1_000_000 == 0 and i != 0:
                print("Checkpoint @", i)
                with open("cluster_graph.pkl", "wb") as f:
                    pickle.dump(graph, f)
            graph.add_edge(a, b, weight=len(ids), dates=createds, msgs=msgs)
finally:
    # Always release the connection, even if the query or the loop fails.
    conn.close()

# Final snapshot of the complete graph (only reached when the load succeeded).
with open("cluster_graph.pkl", "wb") as f:
    pickle.dump(graph, f)

In [None]:
# Restore previously pickled artefacts from the working directory.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


graph_saved = _unpickle("cluster_graph.pkl")
user_id_to_loc_saved = _unpickle("user_id_to_loc.pkl")
geo_cache = _unpickle("geo_cache.pkl")

# Iterating the loaded object yields the ids collected here
# (its keys, if it is a dict -- presumably user ids; verify).
known_user_ids = set(user_id_to_loc_saved)

In [None]:
def at_least_two(data, min_transactions=3):
    """Edge predicate: does this edge hold at least ``min_transactions`` dates?

    NOTE(review): despite the function's name, the threshold has always been
    3, not 2. The default preserves that behaviour; pass ``min_transactions=2``
    explicitly if "at least two" is what is actually wanted.

    Parameters
    ----------
    data : mapping
        Edge attribute dict; must contain a sized ``"dates"`` entry.
    min_transactions : int, optional
        Minimum number of entries in ``data["dates"]`` for the edge to pass.

    Returns
    -------
    bool
    """
    return len(data["dates"]) >= min_transactions


# Reduced graph: only the edges of graph_saved accepted by at_least_two.
ng = reduce_graph(old_graph=graph_saved, f=at_least_two)

In [None]:
# Print every connected component of the saved graph that contains at least
# one user from known_user_ids.
sub_graph_sizes = []  # only filled by the disabled size-histogram code below
sub_graphs = nx.connected_components(graph_saved)
for sg in sub_graphs:
    overlaps_known_users = not sg.isdisjoint(known_user_ids)
    if overlaps_known_users:
        print(sg)
    # Disabled exploration code, kept for reference:
    #     cluster = ng.subgraph(sg)
    #     print(cluster.edges.data())
    # sub_graph_sizes.append(len(sg))
# plt.hist(sub_graph_sizes)
# plt.show()

In [None]:
from mordecai import Geoparser

# Shared Geoparser instance; it is passed as the `geoparser` argument of
# parse_geo_tokens when messages are parsed further down the notebook.
# NOTE(review): construction may be slow or need external services -- confirm.
geo = Geoparser()


In [None]:
# For every small connected component of the saved graph, gather the messages
# on its edges in the reduced graph ``ng``, geoparse them, and print the
# (token, admin1) pairs that were resolved to a location.
MAX_CLUSTER_SIZE = 100  # components with more nodes than this are skipped

sub_graphs = nx.connected_components(graph_saved)
for sg in sub_graphs:
    if len(sg) > MAX_CLUSTER_SIZE:
        continue
    # Components come from graph_saved, but the edges are read from ng.
    cluster = ng.subgraph(sg)
    msgs = []
    for _, _, edge_msgs in cluster.edges.data("msgs"):
        msgs.extend(edge_msgs)
    locs = []
    for m in msgs:
        if m is None:
            # array_agg(message) can contain NULLs; there is nothing to parse.
            continue
        locs.extend(parse_geo_tokens(geo, m))
    # Fixed: the original had a misplaced bracket (`item["token", ...`), which
    # is a SyntaxError.
    # NOTE(review): mordecai's README shows the matched text under the key
    # "word"; confirm that "token" exists in the installed version.
    places = [(item["token"], item["geo"]["admin1"]) for item in locs if "geo" in item]
    if len(places) > 0:
        print(places)