In [None]:
%%capture
# restart kernel after this
!pip install wget annoy
!pip install -U sentence-transformers

In [None]:
# run this in cli
#pip install gpustat && watch -n 0.1 -c gpustat -cp --color

In [None]:
# standard library
import os
import subprocess
import tarfile
import tempfile
from multiprocessing import Pool

# for loading in slack data and cleaning
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# for creating embeddings and index
import numpy as np
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD

# for downloading/extracting model from s3
from transformers import cached_path

In [None]:
# List the entries of `pip freeze` that mention numpy (i.e. the installed numpy version).
!pip freeze | grep numpy

In [None]:
# Connection string for the Postgres database holding the Slack messages.
# It is read from the DATABASE_URL environment variable when that is set, so real
# credentials never have to be written into the notebook; the fallback is the
# local default that was previously hard-coded here.
db_string = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:postgres@postgres/postgres",
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result of `pd.read_sql`.

    Args:
        line_query: SQL text that is executed when `cell_query` is None.
        cell_query: Optional SQL text; when given it is executed instead of
            `line_query`, which is then ignored.
        conn: Connection/engine handed to `pd.read_sql`. Defaults to the
            module-level `db` as it was when this function was defined.

    Returns:
        Whatever `pd.read_sql` returns for the selected query.

    NOTE(review): the (line, cell) parameter pair looks like the signature of an
    IPython line/cell magic -- confirm whether it is registered as one elsewhere.
    """
    # `is None` is an identity check; `== None` would defer to the argument's __eq__.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the row
# labels of `parents` and `replies` are both kept, so labels are duplicated and any
# later label-based lookup or column alignment on `df` becomes ambiguous.
df = pd.concat([parents, replies], ignore_index=True)

# Only the id and the raw text are needed downstream.
df = df[['message_id', 'text']]

# Every message must have both an id and a text before cleaning starts.
assert df.isna().sum().sum() == 0
print(df.shape)

In [None]:
def no_whitespace(text):
    """Return `text` with tabs and newlines turned into spaces and double quotes removed."""
    # One translation table covers all three substitutions in a single pass.
    substitutions = str.maketrans({"\t": " ", "\n": " ", '"': None})
    return text.translate(substitutions)

def no_url(text):
    """Replace every whitespace-separated token containing 'http' with '<URL>'.

    The tokens are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace disappears.
    """
    return ' '.join(
        '<URL>' if 'http' in word else word
        for word in text.split()
    )

def no_short_reply(text):
    """Return `text` unchanged when it has at least 30 characters, otherwise None."""
    return text if len(text) >= 30 else None

def cleaner(series):
    """Run the three text-cleaning steps over every element of `series`.

    Order matters: whitespace/quote normalisation first, then URL masking, and
    finally the length filter, which turns too-short texts into None.
    """
    return (
        series
        .apply(no_whitespace)
        .apply(no_url)
        .apply(no_short_reply)
    )

def fast_clean(df, n_workers=16):
    """Clean `df.text` with `cleaner`, spreading the work over a process pool.

    The text column is cut into at most `n_workers` contiguous chunks, each chunk
    is cleaned in its own worker process, and the pieces are concatenated back in
    their original order, so the returned Series has the same index as `df.text`.

    Args:
        df: DataFrame with a `text` column of strings.
        n_workers: Number of worker processes (and maximum number of chunks).

    Returns:
        pandas Series of cleaned texts; entries rejected by `cleaner` are None.
    """
    text = df.text
    if len(text) == 0:
        # Nothing to distribute, and pd.concat refuses an empty list of chunks.
        return cleaner(text)

    # Ceiling division, so the column splits into no more than n_workers chunks.
    chunk_size = -(-len(text) // n_workers)
    chunks = [
        text.iloc[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

    # Mapping over several chunks is what actually engages the pool; mapping over a
    # single-element list would hand the entire column to one worker.
    with Pool(n_workers) as p:
        cleaned_chunks = p.map(cleaner, chunks)
    return pd.concat(cleaned_chunks)

In [None]:
%%time
# Add the cleaned text as a new column, then discard every row that contains a
# missing value and renumber the remaining rows from 0.
df['cleaned'] = fast_clean(df)
df = df.dropna().reset_index(drop=True)

In [None]:
%%time
# Drop questions longer than 510 characters.
MAX_TEXT_CHARS = 510
df = df.loc[df['cleaned'].str.len() <= MAX_TEXT_CHARS]

# Reset index. drop=True discards the old labels instead of inserting them as an extra
# 'index' column, which also keeps this cell safe to run more than once.
df = df.reset_index(drop=True)

# Get a list of all the posts/messages.
corpus = list(df.cleaned)
len(corpus)

In [None]:
%%time
# Location of the gzipped model tarball that download_pretrained_model() fetches.
url = 'https://model-2.s3.us-east-2.amazonaws.com/distil-bert-SO.tar.gz'

def download_pretrained_model():
    """ Download and extract finetuned model from S3

    Resolves the module-level `url` through `cached_path`, unpacks the gzipped
    tarball into a newly created temporary directory and returns the path of the
    `model` entry inside that directory. The temporary directory is not removed
    by this function.

    Returns:
        str: `<temporary directory>/model`.
    """
    # most of this func from https://github.com/huggingface/transfer-learning-conv-ai/blob/master/utils.py
    resolved_archive_file = cached_path(url)
    tempdir = tempfile.mkdtemp()
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only point `url` at archives from a source you control.
        archive.extractall(tempdir)
    # Requires `import os` in the notebook's import cell.
    return os.path.join(tempdir, 'model')

# Fetch the model files, load them as a sentence-transformer and move it to the GPU.
model_dir = download_pretrained_model()
embedder = SentenceTransformer(model_dir)
embedder.to("cuda")

In [None]:
%%time
# Encode every cleaned message in `corpus` with the sentence-transformer model.
corpus_embeddings = embedder.encode(corpus)
# Convert the result to a numpy array so its shape can be read and unpacked later.
embs = np.asarray(corpus_embeddings)
# Displayed as the cell output.
embs.shape

In [None]:
# %%time
# svd = TruncatedSVD(n_components=300)
# reduced = svd.fit_transform(np.array(embs))
# reduced.shape

In [None]:
%%time
num_docs, vec_dim = embs.shape
distance_measure_type = 'angular'

# Register every embedding under its row number in `embs`.
indx = AnnoyIndex(vec_dim, distance_measure_type)
for i, vector in enumerate(embs):
    indx.add_item(i, vector)

# Tree count is the natural log of the corpus size, rounded to the nearest integer
# (a fixed value such as 100 is an alternative to experiment with).
num_trees = int(np.round(np.log(num_docs)))
print(num_trees)
indx.build(num_trees)

# The file name records the vector dimension and the tree count used.
index_name = f'dim{vec_dim}-trees{num_trees}'
index_file = f'{index_name}.ann'
indx.save(index_file)

In [None]:
# # compress index
# compressed_name = f'{index_name}.tar.gz'
# command = f'tar czvf {compressed_name} {index_file}'
# subprocess.check_output(command.split())

In [None]:
# Unpack an index archive named dim768-trees13.tar.gz from the working directory.
# NOTE(review): the cell that would create such an archive is commented out above, so
# this presumably expects a previously built/downloaded archive -- confirm it exists.
!tar -xzvf dim768-trees13.tar.gz

In [None]:
%%time
# Load the saved index from disk into a fresh AnnoyIndex and print the 10 nearest
# neighbours of item 0 together with their cleaned text.
index = AnnoyIndex(vec_dim, distance_measure_type)
index.load(index_file)
nearest = index.get_nns_by_item(0, 10)
for i in nearest:
    print(i, df['cleaned'][i])

In [None]:
example = ['tensorflow in production']
# Encode the query with the same model and hold it as a numpy array.
# (With SVD-reduced embeddings the query would also need: svd.transform(np.array(emb)).)
emb = np.asarray(embedder.encode(example))
emb.shape

In [None]:
%%time
# Print the 10 indexed messages closest to the unseen example embedding.
query_vector = emb.ravel()
for i in index.get_nns_by_vector(query_vector, 10):
    print(i, df['cleaned'][i])

# Search query: 'tensorflow in production'
# Cherry-picked result from the above query w/ full sized embeddings:
# '119220 Deploy machine learning models as web servers. <URL>'
# None of the words are from the query, but the result has a similar meaning.