In [None]:
%%capture
# restart kernel after this
!pip install wget annoy
!pip install -U sentence-transformers

In [None]:
# run this in cli
#pip install gpustat && watch -n 0.1 -c gpustat -cp --color

In [2]:
# standard library
import os
import subprocess
import tarfile
import tempfile
from multiprocessing import Pool

# for loading in slack data and cleaning
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# for creating embeddings and index
import numpy as np
import torch
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# for downloading/extracting model from s3
from transformers import cached_path

In [3]:
# Connection string for the Postgres database that holds the messages.
# It is read from the SLACK_DB_URL environment variable so that real
# credentials never have to be written into the notebook; when the variable
# is unset, the original default connection string is used unchanged.
db_string = os.environ.get(
    "SLACK_DB_URL",
    "postgresql://postgres:postgres@postgres/postgres",
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    line_query : str
        Query to run when ``cell_query`` is not supplied.
    cell_query : str, optional
        When given (i.e. not ``None``), this query is run instead of
        ``line_query``.
    conn : optional
        Connection or engine handed to ``pd.read_sql``; defaults to the
        module-level ``db`` engine.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_sql`` returns for the selected query.
    """
    # Identity check: `== None` would invoke the argument's __eq__, which is
    # not guaranteed to return a plain bool.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

# Pull every parent message and every reply out of the database.
parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# Stack parents and replies into one frame. ignore_index=True gives the result
# a fresh 0..n-1 index; without it both inputs keep their own row labels, so
# the combined frame ends up with duplicated labels.
df = pd.concat([parents, replies], ignore_index=True)

# Only the id and the raw text are needed from here on.
df = df[['message_id', 'text']]
assert df.isna().sum().sum() == 0, "unexpected missing values in message_id/text"
print(df.shape)

(637568, 2)


In [4]:
def no_whitespace(text):
    """Return ``text`` with tabs and newlines turned into single spaces and
    double-quote characters removed."""
    # One translation table does all three substitutions in a single pass;
    # mapping a character to None deletes it.
    table = str.maketrans({"\t": " ", "\n": " ", '"': None})
    return text.translate(table)

def no_url(text):
    """Replace every whitespace-separated token containing ``http`` with the
    placeholder ``<URL>``.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace collapse and leading/trailing whitespace disappears.
    """
    return ' '.join(
        '<URL>' if 'http' in token else token
        for token in text.split()
    )

def no_short_reply(text):
    """Return ``text`` unchanged when it has at least 30 characters,
    otherwise return ``None``."""
    return text if len(text) >= 30 else None

def cleaner(series):
    """Apply the cleaning steps to every element of ``series`` in order:
    ``no_whitespace``, then ``no_url``, then ``no_short_reply``.

    Returns the resulting Series; elements rejected by ``no_short_reply``
    come back as ``None``.
    """
    for step in (no_whitespace, no_url, no_short_reply):
        series = series.apply(step)
    return series

def fast_clean(df, n_workers=16):
    """Clean ``df.text`` with ``cleaner`` using a pool of worker processes.

    The column is cut into contiguous chunks, one per worker, so the work is
    actually spread across the pool. Handing the pool a single-element list
    (the whole column as one task) would keep every worker but one idle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.
    n_workers : int, optional
        Upper bound on the number of worker processes and chunks (default 16).

    Returns
    -------
    pandas.Series
        The cleaned text, in the same row order and with the same index
        labels as ``df.text``.
    """
    texts = df.text
    n_chunks = min(n_workers, len(texts))
    if n_chunks < 2:
        # Empty or single-row input (or a single worker): a pool buys nothing.
        return cleaner(texts)

    # Ceiling division, so the column is covered by at most n_chunks slices.
    chunk_size = -(-len(texts) // n_chunks)
    chunks = [
        texts.iloc[start:start + chunk_size]
        for start in range(0, len(texts), chunk_size)
    ]

    with Pool(len(chunks)) as pool:
        cleaned_chunks = pool.map(cleaner, chunks)

    # pool.map returns results in submission order, so concatenating the
    # pieces restores the original row order and index labels.
    return pd.concat(cleaned_chunks)

In [5]:
%%time
# Attach the cleaned text as a new column, discard every row that contains a
# missing value (cleaner returns None for messages shorter than 30 characters),
# and renumber the remaining rows from 0.
df = (
    df
    .assign(cleaned=fast_clean(df))
    .dropna()
    .reset_index(drop=True)
)

CPU times: user 599 ms, sys: 377 ms, total: 976 ms
Wall time: 3.77 s


In [6]:
%%time
# Alternative that was tried: drop questions longer than 510 characters.
# df = df.loc[df['cleaned'].str.len() < 511]

# Shorten messages: keep at most the first MAX_CHARS characters of each one.
MAX_CHARS = 510
df['cleaned'] = df['cleaned'].str.slice(0, MAX_CHARS)

# Reset index. drop=True discards the old labels instead of inserting them as
# an extra 'index' column, so re-running this cell leaves the frame unchanged
# rather than piling up 'index' / 'level_0' columns.
df = df.reset_index(drop=True)

# Get a list of all the posts/messages.
corpus = df['cleaned'].tolist()
len(corpus)

CPU times: user 249 ms, sys: 0 ns, total: 249 ms
Wall time: 248 ms


426855

In [287]:
# df.to_csv('cleaned.csv')

In [None]:
# # bert_m_ids = df[['message_id']]
# bert_m_ids.to_csv('bert_m_ids.csv')

In [265]:
# %%time
# tfidf = TfidfVectorizer(stop_words='english')
# vecs = tfidf.fit_transform(df.cleaned)
# vecs.shape

CPU times: user 5.6 s, sys: 23.7 ms, total: 5.62 s
Wall time: 5.64 s


(426855, 88434)

In [266]:
# %%time
# svd = TruncatedSVD(n_components=100)
# reduced = svd.fit_transform(vecs)
# reduced.shape

CPU times: user 29.9 s, sys: 34.5 s, total: 1min 4s
Wall time: 19.2 s


(426855, 100)

In [7]:
# %%time
# url = 'https://model-2.s3.us-east-2.amazonaws.com/distil-bert-SO.tar.gz'

# def download_pretrained_model():
#     """ Download and extract finetuned model from S3 """
#     # most of this func from https://github.com/huggingface/transfer-learning-conv-ai/blob/master/utils.py
#     resolved_archive_file = cached_path(url)
#     tempdir = tempfile.mkdtemp()
#     with tarfile.open(resolved_archive_file, 'r:gz') as archive:
#         archive.extractall(tempdir)
#     return os.path.join(tempdir, 'model')

# embedder = SentenceTransformer(download_pretrained_model())
# embedder.to("cuda");

In [8]:
# Directory holding the saved sentence-transformers model.
model_save_path = 'test1'
embedder = SentenceTransformer(model_save_path)

# Run on the GPU when one is available; otherwise stay on the CPU instead of
# failing on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder.to(device);

In [9]:
%%time
# Embed every message in `corpus` with the loaded model. This is the slow
# step of the notebook: the recorded run took roughly ten minutes of wall time.
corpus_embeddings = embedder.encode(corpus)
# Convert the encoder output to a NumPy array; in the recorded run its shape
# was (426855, 768), i.e. one 768-dimensional row per message.
embs = np.asarray(corpus_embeddings)
embs.shape

CPU times: user 8min 46s, sys: 54 s, total: 9min 40s
Wall time: 9min 40s


(426855, 768)

In [271]:
# combined = np.concatenate((embs, reduced),axis=1)

In [272]:
# combined.shape

(426855, 868)

In [10]:
%%time
# Build an Annoy index holding one item per row of the embedding matrix.
# num_docs, vec_dim = combined.shape
num_docs, vec_dim = embs.shape
distance_measure_type = 'angular'

indx = AnnoyIndex(vec_dim, distance_measure_type)
for row_id, row_vector in enumerate(embs):
    # The item id is the row position, so ids line up with rows of `embs`.
    indx.add_item(row_id, row_vector)

# Tree count: natural log of the number of documents, rounded to an integer.
# num_trees = 100
num_trees = int(np.round(np.log(num_docs)))
print(num_trees)
indx.build(num_trees)

# Save the index under a file name that records its dimension and tree count.
index_name = f'dim{vec_dim}-trees{num_trees}'
index_file = f'{index_name}.ann'
indx.save(index_file)

13
CPU times: user 1min 32s, sys: 1.94 s, total: 1min 34s
Wall time: 1min 34s


True

In [None]:
# # compress index
# compressed_name = f'{index_name}.tar.gz'
# command = f'tar czvf {compressed_name} {index_file}'
# subprocess.check_output(command.split())

In [None]:
# !tar -xzvf dim768-trees13.tar.gz

In [11]:
%%time
# Reload the saved index from disk and print the ten nearest neighbours of
# item 0 together with their cleaned text.
index = AnnoyIndex(vec_dim, distance_measure_type)
index.load(index_file)

neighbour_ids = index.get_nns_by_item(0, 10)
for neighbour_id in neighbour_ids:
    print(neighbour_id, df.cleaned[neighbour_id])

0 Will do, Ill post a link for all of us as soon as the BW meeting ends
102751 Will do. Welcome to the gang, Mayankho!
414652 will do. The VA certainly doesn't care
180632 will do! I am about half finished with the styling
87218 will do thanx, once i figure out how LOL
79647 I will, at least, try. :fist_bump:
152115 I will push the code up to the master branch.
89289 I will. I will start as soon as lecture is over.
275686 Will do! NY is definitely on my East Coast hit list :fire:
316628 That will definitely do it! Lol
CPU times: user 742 µs, sys: 34.9 ms, total: 35.7 ms
Wall time: 108 ms


In [20]:
# Free-text query to look up in the index (a one-element list of sentences).
example = ['issue deploying to heroku']
# example = ['how do I deploy a tensorflow model in production?']
# vec = tfidf.transform(example)
# vec = svd.transform(vec)

# Embed the query with the same model that embedded the corpus and keep the
# result as a NumPy array.
emb = np.asarray(embedder.encode(example))

# combined_example = np.concatenate((emb, vec), axis=1)
# combined_example.shape

In [21]:
%%time
# Number of nearest neighbours to retrieve for the query.
top_k = 20

# Print the top_k indexed messages most similar to the unseen example
# embedding; emb is flattened to the 1-D vector that get_nns_by_vector expects.
for i in index.get_nns_by_vector(emb.ravel(), top_k):
#     print('\n')
    print(i, df.cleaned[i])

# Search query: 'tensorflow in production'
# Cherry-picked result from the above query w/ full sized embeddings:
# '119220 Deploy machine learning models as web servers. <URL>'
# None of the words are from the query, but the result has a similar meaning.

17846 Error with useEffect spamming API.
224706 Error with useEffect spamming API.
76875 conflict in index.css body.less variables.less and login.html
9952 having an issue trying to branch
208726 having an issue trying to branch
266668 uncomment the switch case and default case
43030 having issues with login right now
27512 its in regard to responsive css
159159 Getting fatal error: Permission denied on Git Bash
426419 Getting fatal error: Permission denied on Git Bash
282449 Reusable Error Validation Pop-Up - ISAAC DONE
53042 Having an issue with deploying getting help from a pm
375291 Incognito mode is a helluva tool
36626 oh.... wow. yeah that's an adjustment
273924 Oh No hang in there <@UFGFB8LM9>!!
278541 Oh - it's a cirlcle ci yaml file
183067 getting errors when trying to run the backend server
345055 React Nested Functional Components
109249 uh oh <@U9RFV4CBF> has become self aware
337909 uh oh <@U9RFV4CBF> has become self aware
CPU times: user 9.02 ms, sys: 12.4 ms, total: 21.