In [None]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import seaborn as sns
import pandas as pd
import psycopg2
import geoplot.crs as gcrs
import geoplot as gplt
import geopandas as gpd
import pickle
import os

import warnings

warnings.filterwarnings("ignore")

%load_ext nb_black

In [None]:
# https://github.com/geopy/geopy
#  $ pip install geopy
from geopy.geocoders import Nominatim

# Single Nominatim client reused by every geocoding lookup in this notebook.
geolocator = Nominatim(user_agent="sshh12/venmo-research")

# Begin with an empty cache and swap in the pickled one when an earlier run
# left "geo_cache.pkl" in the working directory.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# this with a cache file that this notebook wrote itself.
geo_cache = {}
if os.path.isfile("geo_cache.pkl"):
    print("Using existing geo cache")
    with open("geo_cache.pkl", "rb") as f:
        geo_cache = pickle.load(f)


def find_coords(place, cache):
    """Return ``(latitude, longitude)`` for ``place``, or ``None`` if unknown.

    Lookups are memoised in ``cache``. A successful lookup stores the object
    returned by ``geolocator.geocode``; an unsuccessful one stores ``None`` so
    the same unresolvable string is not sent to the geocoder again.

    Args:
        place: Free-text location string to geocode.
        cache: Dict keyed by place string; updated in place.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the geocoder
        returned no result for ``place``.
    """
    # Membership test rather than truthiness, so a cached miss (None) is
    # still treated as a cache hit and does not trigger another request.
    if place in cache:
        location = cache[place]
    else:
        location = geolocator.geocode(place)
        cache[place] = location
    if location is None:
        return None
    return (location.latitude, location.longitude)

In [None]:
def connect(
    user="postgres", password=None, host="localhost", port=5432, dbname="venmo"
):
    """Open and return a new psycopg2 connection.

    Called with no arguments this connects exactly as before: user
    ``postgres`` on ``localhost:5432``, database ``venmo``, password taken
    from the ``POSTGRES_PASS`` environment variable (empty string if unset).

    Args:
        user: Database role to connect as.
        password: Password for ``user``; when ``None`` it is read from the
            ``POSTGRES_PASS`` environment variable.
        host: Server host name.
        port: Server port.
        dbname: Name of the database to open.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    if password is None:
        password = os.environ.get("POSTGRES_PASS", "")
    # `dbname` is the documented keyword; `database` is a deprecated alias.
    return psycopg2.connect(
        user=user, password=password, host=host, port=port, dbname=dbname
    )


def get_count(query, cn):
    """Return the row count for ``SELECT COUNT(*) FROM <query>``.

    Args:
        query: Text placed after ``FROM`` - a table name, optionally followed
            by a ``WHERE`` clause. It is concatenated into the SQL statement
            as-is, so it must come from trusted code, never from user input.
        cn: Open database connection whose cursors support the context
            manager protocol (as the cursor usage elsewhere in this notebook
            already assumes).

    Returns:
        The first column of the single result row.
    """
    q = "SELECT COUNT(*) FROM " + query
    # Leaving the `with` block closes the cursor, so none are leaked when this
    # helper is called repeatedly on one connection.
    with cn.cursor() as cur:
        cur.execute(q)
        return cur.fetchone()[0]

In [None]:
"""
Total Transactions: 135921927
Total Users: 22066565
Total Users w/facebook=true pic: 2479401
Total Users w/peekyou data: 393499
Total Users w/peekyou match: 19721
Total Users w/facebook search results: 22897
Total Users w/facebook profile (verified): 7079
"""
# (label, text appended after "SELECT COUNT(*) FROM ") pairs, in print order.
COUNT_QUERIES = [
    ("Total Transactions:", "transactions"),
    ("Total Users:", "users"),
    (
        "Total Users w/facebook=true pic:",
        "users WHERE picture_url LIKE '%%facebook=true'",
    ),
    (
        "Total Users w/peekyou data:",
        "users WHERE peek_you_results is not null",
    ),
    (
        "Total Users w/peekyou match:",
        "users WHERE peek_you_results is not null and (peek_you_results ->> 'ResultsMatch') != '[]'",
    ),
    (
        "Total Users w/facebook search results:",
        "users WHERE facebook_results is not null",
    ),
    (
        "Total Users w/facebook profile (verified):",
        "users WHERE facebook_profile is not null",
    ),
]

conn = connect()
try:
    for label, from_clause in COUNT_QUERIES:
        print(label, get_count(from_clause, conn))
finally:
    # Close explicitly, and in `finally`, so a failing query does not leave the
    # connection open for the rest of the kernel session.
    conn.close()

In [None]:
# Maps user id -> (lat, lng, place string); filled in by the geocoding loop.
user_id_to_loc = {}


def save():
    """Pickle ``user_id_to_loc`` and ``geo_cache`` to files in the working directory.

    Writes "user_id_to_loc.pkl" first, then "geo_cache.pkl", overwriting any
    existing files of those names.
    """

    def _dump(path, payload):
        # Binary mode is required for pickle output.
        with open(path, "wb") as out:
            pickle.dump(payload, out)

    print("Saving checkpoint...")
    _dump("user_id_to_loc.pkl", user_id_to_loc)
    _dump("geo_cache.pkl", geo_cache)


def _strip_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix`` and without outer whitespace."""
    if text.startswith(prefix):
        text = text[len(prefix) :]
    return text.strip()


def _profile_location(info):
    """Return the place named in a profile's ``info`` strings, or ``None``.

    The first entry starting with "Lives" wins; otherwise the first entry
    starting with "From " is used. The "Lives in " / "From " prefix is removed
    so only the place name is handed to the geocoder.
    """
    for item in info:
        if item.startswith("Lives"):
            return _strip_prefix(item, "Lives in ")
    for item in info:
        if item.startswith("From "):
            # Strip exactly the prefix the filter matched; the trailing
            # .strip() also copes with entries that have extra spaces.
            return _strip_prefix(item, "From ")
    return None


conn = connect()
try:
    # Passing `name=` asks psycopg2 for a server-side cursor; `itersize` sets
    # how many rows are fetched per round trip while iterating.
    with conn.cursor(name="social_media_exploration") as cursor:
        cursor.itersize = 2000
        cursor.execute(
            "SELECT id, facebook_profile FROM users WHERE facebook_profile is not null"
        )
        for i, (id_, facebook_profile) in enumerate(cursor):
            # Checkpoint every 500 rows so a crash loses little geocoding work.
            if i % 500 == 0 and i != 0:
                save()
            loc = _profile_location(facebook_profile["info"])
            if not loc:
                # No location entry, or nothing left after removing the prefix.
                continue
            coords = find_coords(loc, geo_cache)
            if coords is None:
                continue
            lat, lng = coords
            user_id_to_loc[id_] = (lat, lng, loc)
finally:
    # Always persist what was gathered and release the connection, even when
    # the loop is interrupted or a geocoding/database call raises.
    try:
        save()
    finally:
        conn.close()

In [None]:
# Reload the checkpoint written by save() rather than relying on kernel state.
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# notebook produced.
with open("user_id_to_loc.pkl", "rb") as f:
    user_id_to_loc_saved = pickle.load(f)

# Base geometries: world countries (to pick out the USA polygon) and geoplot's
# contiguous-USA sample dataset (used by the map cell).
# NOTE(review): `gpd.datasets` is deprecated/removed in recent GeoPandas
# releases - confirm against the installed version.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
contiguous_usa = gpd.read_file(gplt.datasets.get_path("contiguous_usa"))
usa = world[world.name == "United States of America"]

# One row per user: (id, lat, lng, loc).
records = [(user_id, *entry) for user_id, entry in user_id_to_loc_saved.items()]
df = pd.DataFrame(records, columns=["id", "lat", "lng", "loc"])
# points_from_xy takes x (longitude) first, then y (latitude).
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lng, df.lat))

# Keep only the points that fall inside the USA geometry.
usa_geometry = usa.iloc[0]["geometry"]
inside_usa = gdf.within(usa_geometry)
usa_gdf = gdf.loc[inside_usa].copy()

# 96 % in USA
pct_in_usa = round(len(usa_gdf) / len(gdf) * 100)
print(pct_in_usa, "% in USA")

In [None]:
# Plot the USA-located users as points on top of a geoplot webmap built from
# the contiguous_usa frame.
webmap_options = {"projection": gcrs.WebMercator(), "figsize": (16, 16)}
ax = gplt.webmap(contiguous_usa, **webmap_options)
# Bind the return value to `_` so it is not echoed as the cell's output.
_ = gplt.pointplot(usa_gdf, ax=ax)