In [None]:
%%capture
# Installs wget, annoy and the latest sentence-transformers; %%capture hides the pip output.
# Restart the kernel after this cell has run, before continuing with the notebook.
!pip install wget annoy
!pip install -U sentence-transformers

In [None]:
# run this in cli
#pip install gpustat && watch -n 0.1 -c gpustat -cp --color

In [289]:
# standard library
import os
import subprocess
import tarfile
import tempfile
from multiprocessing import Pool

# for loading in slack data and cleaning
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# for creating embeddings and index
import numpy as np
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# for downloading/extracting model from s3
from transformers import cached_path

In [290]:
# Connection URL for the Postgres database.  It is read from the DATABASE_URL
# environment variable when set, so real credentials never need to be written
# into the notebook; the fallback is the original hard-coded value.
db_string = os.environ.get(
    "DATABASE_URL", "postgresql://postgres:postgres@postgres/postgres"
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query through ``pd.read_sql`` and return the resulting DataFrame.

    Args:
        line_query: SQL executed when ``cell_query`` is not supplied.
        cell_query: Optional SQL that, when given, is executed instead of
            ``line_query``.
        conn: Connection/engine handed to ``pd.read_sql``; defaults to the
            module-level ``db`` engine.

    Returns:
        pandas.DataFrame with the rows returned by the executed query.
    """
    # Identity check: `== None` can be redefined by objects that overload `==`.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# Stack messages and replies into one frame.  ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it each input keeps its own row
# labels, so the result would carry duplicate labels.
df = pd.concat([parents, replies], ignore_index=True)
df = df[['message_id', 'text']]
# The cleaning functions call str methods on every value, so nulls are not allowed.
assert df.isna().sum().sum() == 0
print(df.shape)

(637568, 2)


In [291]:
def no_whitespace(text):
    """Return ``text`` with tabs and newlines turned into spaces and double quotes removed."""
    return text.replace("\t", " ").replace("\n", " ").replace('"', '')

def no_url(text):
    """Replace every whitespace-separated token containing 'http' with '<URL>'.

    Tokens are re-joined with single spaces, so any run of whitespace in the
    input collapses to one space in the result.
    """
    return ' '.join(
        '<URL>' if 'http' in token else token
        for token in text.split()
    )

def no_short_reply(text, min_length=30):
    """Return ``text`` unchanged, or ``None`` when it is shorter than ``min_length``.

    Args:
        text: Message text (anything supporting ``len()``).
        min_length: Minimum number of characters a message needs in order to
            be kept.  Defaults to 30, the previously hard-coded cutoff.

    Returns:
        ``text`` if ``len(text) >= min_length``, otherwise ``None``.
    """
    return text if len(text) >= min_length else None

def cleaner(series):
    """Run the cleaning steps over every value of ``series``, in order.

    Steps: ``no_whitespace`` -> ``no_url`` -> ``no_short_reply``.  Returns the
    resulting Series; values rejected by ``no_short_reply`` are ``None``.
    """
    return (
        series
        .apply(no_whitespace)
        .apply(no_url)
        .apply(no_short_reply)
    )

def fast_clean(df, processes=16):
    """Clean ``df.text`` with a pool of worker processes.

    Args:
        df: DataFrame with a ``text`` column of strings.
        processes: Number of worker processes (and chunks the column is split
            into).  Defaults to 16, the original pool size.

    Returns:
        pandas.Series of cleaned text carrying the same index, in the same
        order, as ``df.text``.  Values rejected by ``cleaner`` are ``None``.
    """
    texts = df.text
    if len(texts) == 0:
        # Nothing to distribute; also avoids pd.concat([]) raising ValueError.
        return cleaner(texts)

    # One contiguous, order-preserving slice per worker.  Handing the whole
    # column to map() as a single item would keep only one worker busy.
    chunk_size = -(-len(texts) // processes)  # ceiling division
    chunks = [
        texts.iloc[start:start + chunk_size]
        for start in range(0, len(texts), chunk_size)
    ]
    with Pool(processes) as pool:
        cleaned_chunks = pool.map(cleaner, chunks)
    # map() returns results in submission order, so concatenation restores the
    # original row order and index.
    return pd.concat(cleaned_chunks)

In [292]:
%%time
# Clean every message, discard the rows containing missing values (the cleaner
# returns None for messages it rejects) and renumber the remaining rows from 0.
df['cleaned'] = fast_clean(df)
df = df.dropna().reset_index(drop=True)

CPU times: user 574 ms, sys: 4.06 s, total: 4.64 s
Wall time: 8.37 s


In [293]:
%%time
# Alternative kept for reference: drop (rather than truncate) long messages.
# df = df.loc[df['cleaned'].str.len() < 511]

# Truncate every message to at most 510 characters.
df['cleaned'] = df['cleaned'].str.slice(0, 510)

# Renumber rows 0..n-1.  drop=True discards the old index instead of inserting
# it as an extra 'index' column, which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

# All posts/messages, in row order.
corpus = list(df.cleaned)
len(corpus)

CPU times: user 289 ms, sys: 94.5 ms, total: 384 ms
Wall time: 385 ms


426855

In [287]:
# Persist the cleaned frame to 'cleaned.csv' in the working directory.
df.to_csv('cleaned.csv')

In [None]:
# # bert_m_ids = df[['message_id']]
# bert_m_ids.to_csv('bert_m_ids.csv')

In [265]:
%%time
# Fit a TF-IDF vectorizer (English stop words removed) on the cleaned messages
# and transform them in the same step; the last line displays the matrix shape.
tfidf = TfidfVectorizer(stop_words='english')
vecs = tfidf.fit_transform(df.cleaned)
vecs.shape

CPU times: user 5.6 s, sys: 23.7 ms, total: 5.62 s
Wall time: 5.64 s


(426855, 88434)

In [266]:
%%time
# Reduce the TF-IDF matrix to 100 components.  TruncatedSVD's default solver is
# randomized, so random_state is pinned to make the components reproducible
# across runs.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced = svd.fit_transform(vecs)
reduced.shape

CPU times: user 29.9 s, sys: 34.5 s, total: 1min 4s
Wall time: 19.2 s


(426855, 100)

In [267]:
%%time
url = 'https://model-2.s3.us-east-2.amazonaws.com/distil-bert-SO.tar.gz'

def download_pretrained_model(archive_url=None):
    """Download and extract the finetuned model archive from S3.

    Args:
        archive_url: Location of the ``.tar.gz`` archive.  Defaults to the
            module-level ``url``.

    Returns:
        The path ``<tempdir>/model``, where ``<tempdir>`` is a newly created
        temporary directory holding the extracted archive.  The directory is
        not removed by this function.
    """
    # most of this func from https://github.com/huggingface/transfer-learning-conv-ai/blob/master/utils.py
    resolved_archive_file = cached_path(url if archive_url is None else archive_url)
    tempdir = tempfile.mkdtemp()
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this at archives from a trusted source.
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return os.path.join(tempdir, 'model')

embedder = SentenceTransformer(download_pretrained_model())
embedder.to("cuda");

CPU times: user 2.37 s, sys: 405 ms, total: 2.78 s
Wall time: 2.59 s


SentenceTransformer(
  (0): DistilBERT(
    (bert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (dropout): Dropout(p=0.1, inplace=False)
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [288]:
# Load a SentenceTransformer from 'test1' (presumably a locally saved model
# directory — confirm) and move it to the GPU.
# NOTE(review): this rebinds `embedder`, replacing the model built from the
# downloaded archive in the previous cell — confirm which one is intended.
model_save_path = 'test1'
embedder = SentenceTransformer(model_save_path)
embedder.to("cuda");

In [294]:
%%time
# Encode every message in `corpus` with the sentence embedder and convert the
# result to a NumPy array; the last line displays its shape.
corpus_embeddings = embedder.encode(corpus)
embs = np.asarray(corpus_embeddings)
embs.shape

CPU times: user 8min 39s, sys: 40 s, total: 9min 19s
Wall time: 9min 20s


(426855, 768)

In [271]:
# combined = np.concatenate((embs, reduced),axis=1)

In [272]:
# combined.shape

(426855, 868)

In [295]:
%%time
# num_docs, vec_dim = combined.shape
num_docs, vec_dim = embs.shape

distance_measure_type = 'angular'

# Each embedding is stored under its row position in `embs`.
indx = AnnoyIndex(vec_dim, distance_measure_type)
for i, embedding in enumerate(embs):
    indx.add_item(i, embedding)

# Tree count: natural log of the number of documents, rounded.
# num_trees = 100
num_trees = int(np.log(num_docs).round(0))
print(num_trees)
indx.build(num_trees)

# The file name records the vector dimension and tree count.
index_name = f'dim{vec_dim}-trees{num_trees}'
index_file = f'{index_name}.ann'
indx.save(index_file)

13
CPU times: user 1min, sys: 2.84 s, total: 1min 3s
Wall time: 1min 3s


True

In [None]:
# # compress index
# compressed_name = f'{index_name}.tar.gz'
# command = f'tar czvf {compressed_name} {index_file}'
# subprocess.check_output(command.split())

In [None]:
# !tar -xzvf dim768-trees13.tar.gz

In [296]:
%%time
# Reload the saved index from disk and print the 10 nearest neighbours of
# item 0 together with their cleaned text.
index = AnnoyIndex(vec_dim, distance_measure_type)
index.load(index_file)
for i in index.get_nns_by_item(0,10):
    print(i, df.cleaned[i])

0 Will do, Ill post a link for all of us as soon as the BW meeting ends
97231 Ill will pull now as see if it still fits well
180632 will do! I am about half finished with the styling
363171 Will do! She has the pix on her phone so I'll leave that post to her and chime in when necessary
196180 I still dont elijah had to go do something i dont know if he can
161888 Be right there, had to vacate my apt for a few minutes
152628 im going to let yall keep working , we can do our final stand up tomorrow
66120 gonna use it to badger my players to start up again
103943 I'm fine with that approach, we'd just have to work out scheduling
27369 Hope everyone had a good build week and JS assessment. Almost the weekend! <URL>
CPU times: user 16.2 ms, sys: 31.7 ms, total: 47.9 ms
Wall time: 45.6 ms


In [297]:
example = ['deploy to heroku']
# vec = tfidf.transform(example)
# vec = svd.transform(vec)

# Embed the query with the sentence embedder and keep it as a NumPy array.
emb = np.asarray(embedder.encode(example))

# combined_example = np.concatenate((emb, vec), axis=1)
# combined_example.shape

In [298]:
%%time
for i in index.get_nns_by_vector(emb.ravel(), 20): # 20 nearest neighbours of the unseen example embedding
#     print('\n')
    print(i, df.cleaned[i])

# NOTE(review): the example below uses a different query than the current
# `example`; presumably it was recorded during an earlier run.
# Search query: 'tensorflow in production'
# Cherry-picked result from the above query w/ full sized embeddings:
# '119220 Deploy machine learning models as web servers. <URL>'
# None of the words are from the query, but the result has a similar meaning.

74454 landing page connected -&gt; <URL>
231048 establish dependencies being used on monday
178635 hover over the lower left side of the zoom window
129119 Landing page - <URL> Front-end - <URL>
391514 Landing page - <URL> Front-end - <URL>
377151 ensure only natural numbers and not floats?
285580 Buttons, that’s a good idea, in a pinch
279620 Moving the function inside cleared my error
146177 Backend incoming guys doing some final testing
410958 Backend incoming guys doing some final testing
54888 landing-page , login-screen, signup-form
143625 scores, category, random questions,
60483 Calculating integer factorials in constant time, taking advantage of overflow behavior <URL>
66219 Material Design Bootstrap …and styled-components
66211 Material UI comes with some PRO components
276645 scoring on the thousandth % range?? haha
72898 Technical Design Document <URL>
225262 yarn add node-sass and still getting error
17948 yarn add node-sass and still getting error
338129 250 live weight s