In [None]:
%%capture
# restart kernel after this
!pip install wget annoy
!pip install -U sentence-transformers

In [None]:
# Optional: run this in a separate terminal (not in the notebook) to watch GPU usage live.
#pip install gpustat && watch -n 0.1 -c gpustat -cp --color

In [264]:
# for loading in slack data and cleaning
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
from multiprocessing import Pool

# for creating embeddings and index
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import numpy as np
import subprocess
from sklearn.decomposition import TruncatedSVD
import torch

# for downloading/extracting model from s3
import os
import tarfile
import tempfile
from transformers import cached_path

In [260]:
# Connection string for the Postgres database holding the Slack messages.
# Set the SLACK_DB_URL environment variable to supply real credentials without
# committing them; the fallback is the original local default.
db_string = os.environ.get(
    "SLACK_DB_URL",
    "postgresql://postgres:postgres@postgres/postgres",
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result as a DataFrame.

    Args:
        line_query: SQL text to run when ``cell_query`` is not supplied.
        cell_query: Optional SQL text; when given it is run instead of
            ``line_query``.
        conn: Connection or engine handed to ``pd.read_sql``; defaults to the
            module-level ``db`` engine.

    Returns:
        The object returned by ``pd.read_sql`` for the selected query.
    """
    # Identity comparison is the reliable way to detect "argument omitted".
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it
# the two frames' own labels are kept and collide, which makes later
# index-aligned column assignment fragile.
df = pd.concat([parents, replies], ignore_index=True)

# Only the message id and its text are used downstream.
df = df[['message_id', 'text']]
assert df.isna().sum().sum() == 0, "unexpected missing message_id/text values"
print(df.shape)

(637568, 2)


In [261]:
def no_whitespace(text):
    """Turn tabs and newlines into single spaces and strip double quotes."""
    flattened = text.replace("\t", " ").replace("\n", " ")
    return flattened.replace('"', '')

def no_url(text):
    """Replace every whitespace-separated token containing 'http' with '<URL>'.

    Tokens are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace is dropped.
    """
    return ' '.join(
        '<URL>' if 'http' in token else token
        for token in text.split()
    )

def no_short_reply(text):
    """Return ``text`` unchanged when it has at least 30 characters, else None."""
    return text if len(text) >= 30 else None

def cleaner(series):
    """Apply the text-cleaning steps to every element of ``series``, in order.

    Steps: whitespace/quote normalisation, URL masking, then replacing
    too-short messages with None.
    """
    for step in (no_whitespace, no_url, no_short_reply):
        series = series.apply(step)
    return series

def fast_clean(df, n_workers=16):
    """Clean ``df.text`` in parallel and return the cleaned Series.

    The text column is cut into contiguous chunks, one per worker process, so
    the pool actually shares the work (mapping over a single-element list hands
    the whole column to one worker).

    Args:
        df: DataFrame with a ``text`` column of strings.
        n_workers: Maximum number of worker processes to use.

    Returns:
        A Series of cleaned text with the same index labels and order as
        ``df.text``; messages rejected by the cleaner are None.
    """
    texts = df.text
    n_rows = len(texts)
    # Never ask for more chunks than rows, and always at least one.
    n_chunks = max(1, min(n_workers, n_rows))
    # Ceiling division so the chunks cover every row.
    chunk_size = max(1, -(-n_rows // n_chunks))
    # Positional slices keep each row's original index label.
    chunks = [
        texts.iloc[start:start + chunk_size]
        for start in range(0, n_rows, chunk_size)
    ] or [texts]
    with Pool(len(chunks)) as p:
        cleaned_chunks = p.map(cleaner, chunks)
    # map() preserves input order, so concatenation restores the original order.
    return pd.concat(cleaned_chunks)

In [262]:
%%time
# Attach the cleaned text, discard rows the cleaner rejected (None -> NaN),
# and renumber the remaining rows from zero.
df = (
    df.assign(cleaned=fast_clean(df))
      .dropna()
      .reset_index(drop=True)
)

CPU times: user 623 ms, sys: 2.17 s, total: 2.79 s
Wall time: 5.84 s


In [263]:
%%time
# Long messages are truncated rather than dropped.
# (Alternative kept for reference: df = df.loc[df['cleaned'].str.len() < 511])
MAX_CHARS = 510

# Shorten messages.
df['cleaned'] = df['cleaned'].str.slice(0, MAX_CHARS)

# Reset index. drop=True avoids inserting the old index as a redundant
# 'index' column and keeps this cell safe to re-run.
df = df.reset_index(drop=True)

# Get a list of all the posts/messages.
corpus = list(df.cleaned)
len(corpus)

CPU times: user 314 ms, sys: 3.7 ms, total: 317 ms
Wall time: 316 ms


426855

In [287]:
# Persist the cleaned frame to cleaned.csv in the current working directory.
df.to_csv('cleaned.csv')

In [None]:
# # bert_m_ids = df[['message_id']]
# bert_m_ids.to_csv('bert_m_ids.csv')

In [265]:
%%time
# Fit a TF-IDF model (English stop words removed) on the cleaned messages and
# keep the fitted vectorizer so unseen queries can be transformed later.
tfidf = TfidfVectorizer(stop_words='english')
vecs = tfidf.fit_transform(df['cleaned'])
vecs.shape

CPU times: user 5.6 s, sys: 23.7 ms, total: 5.62 s
Wall time: 5.64 s


(426855, 88434)

In [266]:
%%time
# TruncatedSVD's default solver is randomized; pin the seed so the reduced
# vectors are reproducible across runs.
SVD_SEED = 42
svd = TruncatedSVD(n_components=100, random_state=SVD_SEED)
reduced = svd.fit_transform(vecs)
reduced.shape

CPU times: user 29.9 s, sys: 34.5 s, total: 1min 4s
Wall time: 19.2 s


(426855, 100)

In [267]:
%%time
url = 'https://model-2.s3.us-east-2.amazonaws.com/distil-bert-SO.tar.gz'

def download_pretrained_model(model_url=None):
    """Download and extract the fine-tuned model archive.

    Args:
        model_url: Location of the ``.tar.gz`` archive. When omitted, the
            module-level ``url`` is used.

    Returns:
        Path to the ``model`` directory inside a freshly created temporary
        directory. The temporary directory is not removed by this function.
    """
    # most of this func from https://github.com/huggingface/transfer-learning-conv-ai/blob/master/utils.py
    archive_path = cached_path(url if model_url is None else model_url)
    extract_dir = tempfile.mkdtemp()
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this at archives from a trusted source.
    with tarfile.open(archive_path, 'r:gz') as archive:
        archive.extractall(extract_dir)
    return os.path.join(extract_dir, 'model')

# Use the GPU when one is available; otherwise stay on CPU instead of failing
# on an unconditional move to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer(download_pretrained_model())
embedder.to(device);

CPU times: user 2.37 s, sys: 405 ms, total: 2.78 s
Wall time: 2.59 s


SentenceTransformer(
  (0): DistilBERT(
    (bert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (dropout): Dropout(p=0.1, inplace=False)
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [268]:
%%time
# Encode every message in the corpus with the sentence embedder. This is the
# slow step of the notebook (several minutes in the recorded run).
corpus_embeddings = embedder.encode(corpus)
# Convert the encoder output into a single NumPy array.
embs = np.asarray(corpus_embeddings)
embs.shape

CPU times: user 8min 21s, sys: 29.6 s, total: 8min 51s
Wall time: 8min 51s


(426855, 768)

In [271]:
# Place the sentence embeddings and the reduced TF-IDF features side by side,
# one row per message.
combined = np.hstack((embs, reduced))

In [272]:
# Sanity check: column count should be embedding width + number of SVD components.
combined.shape

(426855, 868)

In [273]:
%%time
num_docs, vec_dim = combined.shape
distance_measure_type = 'angular'

# Item ids are row positions in `combined`, so a neighbour id can be used to
# look the message up in `df` afterwards.
indx = AnnoyIndex(vec_dim, distance_measure_type)
for i, vector in enumerate(combined):
    indx.add_item(i, vector)

# Tree count scales with the natural log of the corpus size.
num_trees = int(np.log(num_docs).round(0))
print(num_trees)
indx.build(num_trees)

# The file name records the vector width and tree count used for this index.
index_name = f'dim{vec_dim}-trees{num_trees}'
index_file = f'{index_name}.ann'
indx.save(index_file)

13
CPU times: user 1min 6s, sys: 1.21 s, total: 1min 7s
Wall time: 1min 7s


True

In [None]:
# # compress index
# compressed_name = f'{index_name}.tar.gz'
# command = f'tar czvf {compressed_name} {index_file}'
# subprocess.check_output(command.split())

In [None]:
# !tar -xzvf dim768-trees13.tar.gz

In [274]:
%%time
# Reload the index from disk and show the 10 nearest neighbours of message 0
# as a quick sanity check.
index = AnnoyIndex(vec_dim, distance_measure_type)
index.load(index_file)
neighbor_ids = index.get_nns_by_item(0, 10)
for i in neighbor_ids:
    print(i, df['cleaned'][i])

0 Will do, Ill post a link for all of us as soon as the BW meeting ends
388671 They'll post the BW list after the kickoff. This will be the main channel for all things BW. :tada:
170951 I’ll drop a link, join whenever you’re ready!
8320 Hey everyone, I’ll be dropping a link here at 12:15 for a Q and A
160756 err. I'm hanging out and waiting --just lemme know either way and we'll go from there
1633 Okay, I’ll drop a link when I’m on.
194173 Okay, I’ll drop a link when I’m on.
181947 Np! I will DM you the link soon as I have it
40089 I think in that meeting we'll iron out who will do what <URL>
89930 Good morning everyone good luck on the Sprint- I'll just leave this here. <URL>
CPU times: user 13.6 ms, sys: 4.33 ms, total: 18 ms
Wall time: 24.9 ms


In [285]:
example = ['deploy to heroku']

# Text features: TF-IDF followed by the already-fitted SVD projection.
vec = svd.transform(tfidf.transform(example))

# Sentence embedding of the same query.
emb = np.asarray(embedder.encode(example))

# Same column order as the corpus vectors: embedding first, then SVD features.
combined_example = np.hstack((emb, vec))
combined_example.shape

(1, 868)

In [286]:
%%time
for i in index.get_nns_by_vector(combined_example.ravel(), 20): # Gets the 20 nearest neighbours of the unseen example embedding
#     print('\n')
    print(i, df.cleaned[i])

# Search query: 'tensorflow in production'
# Cherry-picked result from the above query w/ full sized embeddings:
# '119220 Deploy machine learning models as web servers. <URL>'
# None of the words are from the query, but the result has a similar meaning.

11914 sorry a quick pr for heroku <URL>
53042 Having an issue with deploying getting help from a pm
216306 Wonder why it automatically deployed that project tho so strange
378176 and ensemble it w/ time domain loudness
223192 look at the top portion of the css
292375 :exploding_head: Mind blown! That makes so much sense!!!
417923 READY, TIRED, NERVOUS, EXCITEDDDDD
204000 Eh… Super difficult to explain. Jump on in.
167493 In the PR to the front end that hits the back end
298598 Three cheers for five years? (where the emo kids at)
376059 ~Spinning up a spark cluster is hard (not a direct quote)
325345 Landslide (Fleetwood Mac) <URL>
40243 makes me excited to tackle back end
284801 :boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom::boom:
367313 haha acc hit return without shift
127383 Go Kickstarter Success!!!!!!!!!!!!!!!
334285 Super flexible but not structured
9965