In [1]:
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
from IPython.display import display, HTML
import seaborn as sns
import pandas as pd
import psycopg2
import requests
import pickle
import random
import os
import re

import warnings

warnings.filterwarnings("ignore")

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
def connect():
    """Open a new psycopg2 connection to the local ``venmo`` database.

    Connects as user ``postgres`` to ``localhost:5432``. The password is read
    from the ``POSTGRES_PASS`` environment variable and falls back to an empty
    string when the variable is unset.

    Returns:
        The connection object returned by ``psycopg2.connect``. Nothing here
        closes it, so the caller must.
    """
    settings = dict(
        user="postgres",
        password=os.environ.get("POSTGRES_PASS", ""),
        host="localhost",
        port=5432,
        database="venmo",
    )
    return psycopg2.connect(**settings)

<IPython.core.display.Javascript object>

In [3]:
# https://github.com/openeventdata/mordecai
#  $ python -m spacy download en_core_web_lg
#  $ docker pull elasticsearch:5.5.2
#  $ wget https://andrewhalterman.com/files/geonames_index.tar.gz
#  $ docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2
from mordecai import Geoparser

# Alternative geoparser: https://github.com/somnathrakshit/geograpy3
#  > Also tried this one, but its results weren't very good, so the import is left disabled.
# import geograpy

ModuleNotFoundError: No module named 'mordecai'

<IPython.core.display.Javascript object>

In [None]:
# Build the Mordecai geoparser instance that the scan loop below calls via geo.geoparse().
# NOTE(review): presumably requires the Elasticsearch/geonames container from the setup
# comments above to be running on 127.0.0.1:9200 — confirm before a fresh run.
geo = Geoparser()

In [None]:
# Geoparse every transaction message and tally the tokens Mordecai reports.
#   mordecai_tokens          token -> number of times it was reported
#   mordecai_token_examples  token -> raw messages in which it was reported
#   meta                     "msgs" = rows read, "msgs_processed" = rows actually geoparsed
CHECKPOINT_EVERY = 10000  # rows between on-disk checkpoints

mordecai_tokens = Counter()
mordecai_token_examples = defaultdict(list)
meta = Counter()


def _save_checkpoint():
    """Pickle the current counters so later cells can reload them without rescanning."""
    with open("mordecai_tokens.pkl", "wb") as f:
        pickle.dump((mordecai_tokens, mordecai_token_examples), f)
    with open("meta.pkl", "wb") as f:
        pickle.dump(meta, f)


conn = connect()
try:
    # A named psycopg2 cursor is server-side: rows are streamed in batches of
    # `itersize` instead of loading the whole table into memory.
    with conn.cursor(name="exploration") as cursor:
        cursor.itersize = 2000
        cursor.execute("SELECT * FROM transactions")
        for i, row in enumerate(cursor):
            if i % CHECKPOINT_EVERY == 0 and i != 0:
                print("Checkpoint", i)
                _save_checkpoint()

            # assumes the message text is the second column of `transactions` — TODO confirm
            raw_msg = row[1]
            meta["msgs"] += 1
            if raw_msg is None:
                continue

            # Keep only word characters, hyphens and spaces before geoparsing.
            msg = re.sub(r"[^\w\d_\- ]", "", raw_msg).strip()
            if not msg:
                continue
            meta["msgs_processed"] += 1

            for item in geo.geoparse(msg):
                word = item["word"]
                mordecai_tokens[word] += 1
                # Store the untouched message so examples read as users wrote them.
                mordecai_token_examples[word].append(raw_msg)
finally:
    # Release the connection even if the scan is interrupted or geoparsing fails.
    conn.close()

# Final save: without it the rows seen after the last periodic checkpoint would
# never reach disk, and the reload cell would work from stale counts.
_save_checkpoint()

In [None]:
# Reload the counters that the geoparsing scan pickled to disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("mordecai_tokens.pkl", "rb") as tokens_file:
    saved_tokens_and_examples = pickle.load(tokens_file)
mordecai_tokens_saved, mordecai_token_examples_saved = saved_tokens_and_examples

with open("meta.pkl", "rb") as meta_file:
    meta_saved = pickle.load(meta_file)

# Display the message counters as the cell output.
meta_saved

In [None]:
# Share of messages that were geoparsed, i.e. not NULL and not empty once
# punctuation was stripped. The author recorded roughly 71% for their run.
meta_saved["msgs_processed"] / meta_saved["msgs"]

In [None]:
# Place tokens found per message read (total token occurrences / total messages).
# A message can contribute more than one token, so this is a rate rather than the
# share of messages with a hit. The author recorded roughly 2% for their run.
sum(mordecai_tokens_saved.values()) / meta_saved["msgs"]

In [None]:
# Slang / casing variants mapped to the canonical name they are merged into.
REPLACE_MORDECAI_TOKENS = {
    "Cali": "California",
    "jersey": "Jersey",
    "baja": "Baja",
    "Vegas": "Las Vegas",
    "vegas": "Las Vegas",
    "VEGAS": "Las Vegas",
    "NYC": "New York City",
    "Nyc": "New York City",
    "LA": "Los Angeles",
    "Philly": "Philadelphia",
    "Nola": "New Orleans",
}
# Tokens left out of the chart. NOTE(review): these look like a mix of false
# positives and foreign place names (see the note at the end of this cell) — confirm.
IGNORE_MORDECAI_TOKENS = {
    "Wendys",
    "Bc",
    "Hookers",
    "foooood",
    "Santa",
    "Easter",
    "Playa",
    "Yeehaw",
    "Zelle",
    "AZ",
    "Tn",
    "Anotha",
    "Rv",
    "RV",
    "FL",
    "NY",
    "Sydney",
    "Turkey",
    "Brittany",
    "Cleaning",
    "Pussy",
    "Daves",
    "Charlotte",
    "NC",
    "Beach",
    "China",
    "Paris",
    "Mexico",
    "Iceland",
    "Cancun",
    "Canada",
    "Thailand",
    "Tokyo",
    "India",
    "Montreal",
    "Oktoberfest",
    "Burma",
    "Nepal",
    "Peru",
    "Colombia",
    "Italy",
    "Bangkok",
    "Moscow",
    "Japan",
    "Greece",
    "Berlin",
    "America",
    "Amsterdam",
    "London",
    "Saigon",
    "Brazil",
    "Barcelona",
    "Spain",
    "Africa Join",
    "Tulum",
    "Kathmandu",
}
N = 30  # number of bars in the chart

# Handle aliases: fold each variant into its canonical token. Counts AND example
# messages are merged, so the canonical token keeps examples for the sampling cell
# that follows. Using pop() makes re-running this cell a no-op.
for repl, token in REPLACE_MORDECAI_TOKENS.items():
    if repl in mordecai_tokens_saved:
        cnt = mordecai_tokens_saved.pop(repl)
        mordecai_tokens_saved[token] += cnt
    if repl in mordecai_token_examples_saved:
        moved_examples = mordecai_token_examples_saved.pop(repl)
        mordecai_token_examples_saved.setdefault(token, []).extend(moved_examples)

# Take the N most frequent tokens that are not ignored. The whole ranking is
# walked (most_common() with no argument returns every element) because the
# ignore list is longer than N, so a fixed N * 2 window could run out early.
tokens = []
counts = []
for token, cnt in mordecai_tokens_saved.most_common():
    if token in IGNORE_MORDECAI_TOKENS:
        continue
    tokens.append(token)
    counts.append(cnt)
    if len(tokens) == N:
        break

plt.figure(figsize=(8, N // 5))
# The title now has a placeholder; previously .format() was called on a string
# without one, so the message count was silently discarded.
sns.barplot(y=tokens, x=counts, orient="h").set_title(
    "Mordecai Tokens ({:,} messages)".format(meta_saved["msgs"])
)
# Names of other countries are often used as adjectives rather than locations,
# e.g. "China virus", "China town", "China food".

In [None]:
# Build a small table with a few random example messages for the five most
# frequent tokens from the chart above.
EXAMPLES_PER_TOKEN = 5

df_data = {"token": [], "examples": []}
for token in tokens[:5]:
    available = mordecai_token_examples_saved[token]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of stored examples.
    sample_usage = random.sample(available, min(EXAMPLES_PER_TOKEN, len(available)))
    df_data["token"].append(token)
    # One example per line; the display helper turns the newlines into <br>.
    df_data["examples"].append("\n".join(sample_usage))

# set_index returns a new frame, so its result has to be kept; the bare call
# previously discarded it and `df` stayed on the default integer index.
df = pd.DataFrame(df_data).set_index("token")


def pretty_print(df):
    """Display ``df`` as an HTML table with multi-line cells on separate lines.

    pandas writes a newline inside a cell as the two characters ``\\n`` when it
    renders HTML; those are replaced with ``<br>`` so each joined example gets
    its own line.

    Args:
        df: The DataFrame to render.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered table.
    """
    # to_html() shortens every cell to display.max_colwidth characters (50 by
    # default), which would cut off the joined examples; lift the limit locally.
    with pd.option_context("display.max_colwidth", None):
        table_html = df.to_html()
    return display(HTML(table_html.replace("\\n", "<br>")))


# Render the token/examples table built above, one example per line.
pretty_print(df)