In [None]:
%%capture
!pip install scikit-learn transformers annoy
!pip install ipywidgets
# !pip install --upgrade jupyter_client

# from ipywidgets import IntProgress

import psycopg2
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from multiprocessing import Pool
from sklearn.decomposition import PCA
from annoy import AnnoyIndex
from tqdm import tqdm
import pickle

# SQLAlchemy connection URL: user "postgres", password "postgres", host
# "postgres", database "postgres".
# NOTE(review): credentials are hard-coded in the notebook — presumably a local
# development container; confirm before reusing this anywhere else.
db_string = "postgresql://postgres:postgres@postgres/postgres"
# Engine used as the default connection by the SQL helpers in this notebook.
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result as a pandas DataFrame.

    The two query parameters mirror the ``(line, cell)`` calling convention
    of an IPython line/cell magic: ``cell_query`` is executed when it is
    supplied, otherwise ``line_query`` is.

    Args:
        line_query: SQL text to run when no ``cell_query`` is given.
        cell_query: Optional SQL text that takes precedence over
            ``line_query``.
        conn: Connection or engine handed to ``pandas.read_sql``; defaults
            to the module-level ``db`` engine.

    Returns:
        The DataFrame produced by ``pandas.read_sql``.
    """
    # Identity check: ``None`` is the "not supplied" marker, an empty string
    # is still a (cell) query.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

# Custom notebook magic commands for loading sql.
from IPython.core.magic import register_line_cell_magic
def create_df_sql_magic(magic_name, conn):
    """Register an IPython line/cell magic that runs SQL and returns a DataFrame.

    Args:
        magic_name: Name under which the magic is registered (used as
            ``%magic_name`` / ``%%magic_name``).
        conn: Connection or engine the magic passes to ``pandas.read_sql``.
    """
    def sql_df(line_query, cell_query=None):
        # ``conn`` is taken from the enclosing call, so each registered magic
        # queries the connection it was created with. (Declaring a
        # ``conn=db`` default here would shadow that argument and always use
        # the global engine instead.)
        query = line_query if cell_query is None else cell_query
        return pd.read_sql(query, conn)

    # The magic is registered under the function's ``__name__``.
    sql_df.__name__ = magic_name
    register_line_cell_magic(sql_df)


create_df_sql_magic('sql_df', db)

parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of both tables are kept and collide with each other.
# .copy() detaches the two-column selection from the concatenated frame so that
# adding columns to `df` later does not raise SettingWithCopyWarning.
df = pd.concat([parents, replies], ignore_index=True)[['message_id', 'text']].copy()
# The cleaning helpers call str methods on every text value, so nulls must be
# ruled out up front.
assert df.isna().sum().sum() == 0
print(df.shape)

# Translation table: tab and newline each become one space, double quotes are
# removed entirely.
_NO_WHITESPACE_TABLE = str.maketrans({"\t": " ", "\n": " ", '"': None})


def no_whitespace(text):
    """Return *text* with tabs/newlines replaced by spaces and double quotes dropped."""
    return text.translate(_NO_WHITESPACE_TABLE)

def no_short_reply(text, min_length=10):
    """Return *text* unchanged, or ``None`` when it is shorter than *min_length*.

    Args:
        text: The string to check.
        min_length: Minimum number of characters a text must have to be
            kept. Defaults to 10.

    Returns:
        ``text`` itself when ``len(text) >= min_length``, otherwise ``None``.
    """
    return text if len(text) >= min_length else None

def cleaner(series):
    """Clean every entry of *series* and return the resulting Series.

    Each value is first passed through ``no_whitespace`` and the result is
    then passed through ``no_short_reply``, so entries that are too short
    after normalisation come back as ``None``.
    """
    return series.apply(no_whitespace).apply(no_short_reply)

def fast_clean(df, processes=16):
    """Clean ``df.text`` in parallel and return the cleaned Series.

    The text column is cut into contiguous slices, one per worker, and each
    slice is run through ``cleaner`` in its own process. Handing the pool a
    single-element list would dispatch the whole column to one worker and
    leave the rest idle.

    Args:
        df: DataFrame with a ``text`` column of strings.
        processes: Maximum number of worker processes. Defaults to 16.

    Returns:
        A Series with the same index and order as ``df.text`` holding the
        cleaned strings, with ``None`` where ``cleaner`` rejected an entry.
    """
    text = df.text
    if text.empty:
        # Nothing to distribute; skip the cost of starting a pool.
        return cleaner(text)

    # Ceiling division so at most `processes` slices are produced.
    chunk_size = -(-len(text) // processes)
    chunks = [
        text.iloc[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

    with Pool(min(processes, len(chunks))) as pool:
        cleaned_chunks = pool.map(cleaner, chunks)

    # Pool.map preserves input order, so concatenation restores the original
    # row order and index.
    return pd.concat(cleaned_chunks)

%%time
df['cleaned'] = fast_clean(df)
df = df.dropna()
df = df.reset_index(drop=True)

import torch
from transformers import BertModel, BertTokenizer

# Load the pretrained 'bert-base-uncased' encoder and its matching tokenizer.
_BERT_CHECKPOINT = 'bert-base-uncased'
model = BertModel.from_pretrained(_BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)

# NOTE: model.eval() is not called explicitly in this notebook.
# Move the model to the GPU; this requires a CUDA device to be available.
model.to('cuda');
sample = df.sample(10000)

embeddings = []
for i in sample.cleaned:
    input_ids = torch.tensor(tokenizer.encode([i])).unsqueeze(0)
    input_ids = input_ids.to('cuda')
    outputs = model(input_ids)
    emb = outputs[0]
    np_emb = emb.cpu().detach().numpy()
    embeddings.append(np_emb.flatten())

%%time
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300)
reduced = svd.fit_transform(np.array(embeddings))
reduced.shape

%%time
num_docs, vec_dim = reduced.shape

indx = AnnoyIndex(vec_dim, 'angular')
for i in range(num_docs):
    indx.add_item(i, reduced[i])

trees = int(np.log(num_docs).round(0))
print(trees)
indx.build(trees)
indx.save('berty.ann')

%%time
index = AnnoyIndex(300, 'angular')
index.load('berty.ann')
for i in index.get_nns_by_item(0,10):
    print(i, sample.cleaned.iloc[i])
    
%%time
example = ['how do I deploy to heroku']

input_ids = torch.tensor(tokenizer.encode(example)).unsqueeze(0)
input_ids = input_ids.to('cuda')
outputs = model(input_ids)
emb = outputs[0]
np_emb = emb.cpu().detach().numpy()
vec = np_emb.flatten()
vec = svd.transform([vec])
vec.shape

%%time
for i in index.get_nns_by_vector(vec.ravel(), 10):
    print(i, df.cleaned.iloc[i])