In [None]:
%%capture
!pip install scikit-learn
!pip install gensim
!pip install annoy

In [None]:
import os
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import psycopg2
from annoy import AnnoyIndex
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine

In [None]:
# The connection URL is taken from the DATABASE_URL environment variable so
# that real credentials never have to be committed with the notebook. The
# fallback is the local development database this notebook was written for.
# Alternative development database:
#   "postgresql://postgres:postgres@postgres/dev4slack"
db_string = os.environ.get(
    "DATABASE_URL", "postgresql://postgres:postgres@postgres/postgres"
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    line_query : str
        Query that is executed when ``cell_query`` is not given.
    cell_query : str, optional
        When provided, this query is executed instead of ``line_query``.
    conn : connectable, optional
        Handed to ``pd.read_sql``; defaults to the module-level ``db`` engine.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_sql`` returns for the selected query.
    """
    # Identity test for the sentinel: ``== None`` would go through the
    # argument's ``__eq__`` instead of checking for the None object itself.
    sql = line_query if cell_query is None else cell_query
    return pd.read_sql(sql, conn)

# Custom notebook magic commands for loading sql.
from IPython.core.magic import register_line_cell_magic
def create_df_sql_magic(magic_name, conn):
    """Register a line/cell magic that runs SQL and returns a DataFrame.

    Parameters
    ----------
    magic_name : str
        Name under which the magic is registered (``%name`` / ``%%name``).
    conn : connectable
        Connection or engine that the registered magic passes to
        ``pd.read_sql``.
    """
    def sql_df(line_query, cell_query=None):
        # Close over the ``conn`` given to the factory. The previous inner
        # default ``conn=db`` shadowed it, so the argument was ignored and
        # every magic queried the global ``db`` engine.
        sql = line_query if cell_query is None else cell_query
        return pd.read_sql(sql, conn)
    # The magic is registered under the function's __name__.
    sql_df.__name__ = magic_name
    register_line_cell_magic(sql_df)
create_df_sql_magic('sql_df', db)

In [None]:
# Threaded messages of channel CFBBHV7AT joined to their replies: one row per
# (parent message, reply) pair, ordered by parent and then reply timestamp.
query = '''
SELECT 
    message.text AS p, message.reply_count, message.user_id as p_id, message.ts,
    reply.text AS c, reply.user_id as c_id
FROM message
LEFT JOIN reply on reply.thread_ts=message.ts
WHERE message.channel_id='CFBBHV7AT' AND message.reply_count > 0
ORDER BY message.ts, reply.ts;
'''
df = query_df(query)
df.shape

In [None]:
# Stack parent and reply texts into a single 'text' column, then drop repeated
# and missing texts. Concatenating the two columns repeats every row label, so
# the index is rebuilt at the end to keep label-based lookups unambiguous.
df = (
    pd.concat([df.p, df.c])
    .to_frame('text')
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
assert df.isna().sum().sum() == 0
assert df.index.is_unique
df.shape

In [None]:
def no_whitespace(text):
    """Turn tabs and newlines into single spaces and strip double quotes."""
    return text.replace("\t", " ").replace("\n", " ").replace('"', '')

def no_url(text):
    """Mask links and user mentions in whitespace-separated text.

    Any token containing 'http' becomes '<URL>'; otherwise any token
    containing '<@' becomes '<USER>'. Tokens are re-joined with single spaces.
    """
    def mask(token):
        if 'http' in token:
            return '<URL>'
        if '<@' in token:
            return '<USER>'
        return token

    return ' '.join(mask(token) for token in text.split())

def no_short_reply(text):
    """Return ``text`` unchanged, or None when it is shorter than 10 characters."""
    return None if len(text) < 10 else text

def cleaner(series):
    """Apply the clean-up steps to each element of ``series``, in order.

    Steps: ``no_whitespace``, then ``no_url``, then ``no_short_reply``.
    """
    for step in (no_whitespace, no_url, no_short_reply):
        series = series.apply(step)
    return series

def fast_clean(df, processes=None):
    """Clean every column of ``df`` in parallel and drop incomplete rows.

    Each column is sent to ``cleaner`` in a worker process. The cleaned
    columns are put back side by side, and rows with a missing value in any
    column are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold text.
    processes : int, optional
        Number of worker processes. Defaults to one per column, capped at
        the machine's CPU count; more workers than columns would sit idle.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the same column names as ``df``.
    """
    columns = [df[col] for col in list(df)]
    if not columns:
        # Nothing to clean; Pool(0) and pd.concat([]) would both raise.
        return df.dropna()
    if processes is None:
        processes = min(cpu_count(), len(columns))
    with Pool(processes) as pool:
        cleaned = pool.map(cleaner, columns)
    return pd.concat(cleaned, axis=1).dropna()

In [None]:
# Run the cleaning pipeline over the frame, then show how many rows/columns
# survived and preview the first few rows.
df = fast_clean(df)
print(df.shape)
df.head()

In [None]:
%%time
# Document retrieval w/ Tfidf vecs and cosine similarity

v = TfidfVectorizer(stop_words='english')
vecs = v.fit_transform(df.text)

# Densify once, as a plain ndarray: ``todense()`` returns ``np.matrix``, which
# recent scikit-learn estimators reject, and it was previously built twice.
dense_vecs = vecs.toarray()

# ``get_feature_names`` was removed from scikit-learn; only fall back to it
# on old releases that do not have ``get_feature_names_out`` yet.
if hasattr(v, 'get_feature_names_out'):
    feature_names = v.get_feature_names_out()
else:
    feature_names = v.get_feature_names()
X = pd.DataFrame(dense_vecs, columns=feature_names)

# Fixed seed so a randomized SVD solver gives the same projection on re-runs.
pca = PCA(n_components=100, random_state=0)
reduced = pca.fit_transform(dense_vecs)

In [None]:
%%time
example = ['what is the best way to deploy on heroku']
ex_vec = v.transform(example)
# NOTE(review): this PCA projection is not used by the search below, which
# compares full TF-IDF vectors; kept so the ``ex_vex`` name stays defined.
# ``toarray()`` replaces ``todense()`` because PCA rejects ``np.matrix`` input
# on recent scikit-learn releases.
ex_vex = pca.transform(ex_vec.toarray())
# Dot product of the query row with every document row; the best match is the
# column with the largest score.
cosine_similarities = ex_vec.dot(X.T)
found = df.iloc[cosine_similarities.argmax()]
# ``found`` is a row labelled by column name, so read it positionally with
# ``iloc``; plain ``found[0]`` on a label index is deprecated in pandas.
print(found.iloc[0])

In [None]:
%%time
# Doc retrieval w/ gensim doc2vec * tfidf. Approximate nearest neighbors w/ annoy

num_cores = cpu_count()
corpus = list(df.text)
# TaggedDocument expects a list of word tokens. Passing the raw string makes
# gensim iterate it character by character, so the model would learn a
# vocabulary of single characters instead of words.
documents = [
    TaggedDocument(doc.lower().split(), [i]) for i, doc in enumerate(corpus)
]
model = Doc2Vec(vector_size=100, workers=num_cores, epochs=10)

model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
import math

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x) for a single number.

    Only non-positive exponents are ever passed to ``math.exp``, so large
    negative inputs return 0.0 instead of raising ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)

def sigmoid_v(x):
    """Element-wise logistic function for array-likes of any shape.

    Returns a float ndarray with the same shape as ``x``. Uses the same
    overflow-safe formulation as ``sigmoid`` but runs as array operations
    rather than a per-element Python loop, and accepts empty input.
    """
    values = np.asarray(x, dtype=float)
    z = np.exp(-np.abs(values))  # always in (0, 1], cannot overflow
    return np.where(values >= 0, 1 / (1 + z), z / (1 + z))

In [None]:
%%time
# gensim >= 4 exposes the trained document vectors as ``model.dv.vectors``;
# older releases keep them under ``model.docvecs.vectors_docs``.
if hasattr(model, 'dv'):
    embeds = model.dv.vectors
else:
    embeds = model.docvecs.vectors_docs

# Squash both representations through the sigmoid before combining them.
r = sigmoid_v(reduced)
e = sigmoid_v(embeds)
# The element-wise product only makes sense for one row per document in both.
assert r.shape == e.shape, (r.shape, e.shape)

combined = r * e
num_docs, vec_dim = combined.shape

indx = AnnoyIndex(vec_dim, 'angular')  # Length of item vector that will be indexed
for i, doc_vector in enumerate(combined):
    # Index the combined vectors (this used to reference the undefined name
    # ``tfidf_times_embeds`` and failed with NameError on a fresh kernel).
    indx.add_item(i, doc_vector)

trees = int(np.log(num_docs).round(0)) # just a rule of thumb
print(trees)
indx.build(trees)
indx.save('a.ann')

In [None]:
# Smallest and largest entry of the combined document vectors.
combined.min(), combined.max()

In [None]:
%%time
# Reload the saved index. It is sized with ``vec_dim`` (the width of the
# vectors that were indexed) rather than a literal 100, so a change to the
# embedding size cannot leave the two out of step.
index = AnnoyIndex(vec_dim, 'angular')
index.load('a.ann')
for i in index.get_nns_by_item(0, 10): # Gets the top 10 similar to embedding @ index 0, including 0
    print(df.text.iloc[i])

In [None]:
%%time
example = ['what is the best way to deploy on heroku']
# ``infer_vector`` expects the word tokens of one document. Passing
# ``example`` itself would present the whole sentence as a single token.
embedding_vec = sigmoid_v(model.infer_vector(example[0].lower().split()))

tfidf_vec = v.transform(example)
# ``toarray()`` instead of ``todense()``: PCA rejects ``np.matrix`` input on
# recent scikit-learn releases.
reduced_vec = sigmoid_v(pca.transform(tfidf_vec.toarray()))

# Combine exactly as the indexed documents were combined, flattened to 1-D
# for the nearest-neighbour query.
ex_vec = (reduced_vec * embedding_vec).ravel()
ex_vec.shape

In [None]:
%%time
# Gets the top 5 similar to unseen example embedding
nearest_ids = index.get_nns_by_vector(ex_vec, 5)
for i in nearest_ids:
    print('\n')
    print(i, df.text.iloc[i])