In [None]:
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from annoy import AnnoyIndex
import pickle

# SQLAlchemy URL for the Postgres instance: user, password, host and database
# name are all the literal "postgres".
# NOTE(review): credentials are hard-coded here (and again in insert.py) —
# consider supplying the URL from outside the notebook before sharing it.
db_string = "postgresql://postgres:postgres@postgres/postgres"
# Notebook-wide engine; it is the default `conn` of the query helpers below.
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result set as a DataFrame.

    Takes the same (line, cell) argument pair as the `sql_df` magic in this
    notebook, so it can be called either way.

    Parameters
    ----------
    line_query : str
        Query to run when `cell_query` is not given.
    cell_query : str, optional
        When provided, this query is run instead of `line_query`.
    conn : SQLAlchemy connectable, optional
        Connection handed to `pd.read_sql`; defaults to the notebook-wide `db`.

    Returns
    -------
    pandas.DataFrame
        Whatever `pd.read_sql` returns for the selected query.
    """
    # Identity test rather than `== None`: it cannot be hijacked by query
    # objects that overload `==`.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

# Custom notebook magic commands for loading sql.
from IPython.core.magic import register_line_cell_magic
def create_df_sql_magic(magic_name, conn):
    """Register a line/cell magic that runs SQL and returns a DataFrame.

    Parameters
    ----------
    magic_name : str
        Name the magic is registered under (assigned to the function's
        `__name__` before registration).
    conn : SQLAlchemy connectable
        Connection the registered magic passes to `pd.read_sql`.
    """
    # Default to the connection given to create_df_sql_magic rather than the
    # global `db`, so each registered magic queries the engine it was created for.
    def sql_df(line_query, cell_query=None, conn=conn):
        # With only a line the line is the query; with a cell body, the cell
        # body is the query.
        query = line_query if cell_query is None else cell_query
        return pd.read_sql(query, conn)
    custom_func = sql_df
    # register_line_cell_magic is given only the function, so the magic's name
    # comes from `__name__`.
    custom_func.__name__ = magic_name
    register_line_cell_magic(custom_func)
create_df_sql_magic('sql_df', db)

parent_query = 'SELECT * FROM message;'
reply_query = 'SELECT * FROM reply;'

parents = query_df(parent_query)
replies = query_df(reply_query)

# Stack messages and replies into one corpus. ignore_index=True gives every row
# a unique 0..n-1 label; without it each frame keeps its own labels and they
# repeat across the two halves.
df = pd.concat([parents, replies], ignore_index=True)
df = df[['message_id', 'text']]
# Later cells call string methods on every `text` value, so fail fast on nulls.
assert df.isna().sum().sum() == 0, "message_id/text contain missing values"
print(df.shape)

In [None]:
def no_whitespace(text):
    """Flatten a string onto one line and strip double quotes.

    Tabs and newlines each become a single space; every `"` is removed.
    Other whitespace (including runs of spaces) is left untouched.
    """
    # Single-pass character table: map to " " to substitute, None to delete.
    table = str.maketrans({"\t": " ", "\n": " ", '"': None})
    return text.translate(table)

def no_short_reply(text, min_length=10):
    """Return `text` unchanged, or None when it is shorter than `min_length`.

    Parameters
    ----------
    text : str
        Candidate message text.
    min_length : int, optional
        Minimum number of characters a message must have to be kept
        (default 10, the threshold this notebook has always used).

    Returns
    -------
    str or None
        `text` itself when `len(text) >= min_length`, otherwise None so the
        row can be removed later with `dropna()`.
    """
    return text if len(text) >= min_length else None

def cleaner(series):
    """Clean every string in `series` and blank out the short ones.

    Applies `no_whitespace` to each element, then `no_short_reply`, and
    returns the resulting Series (short entries become None).
    """
    return series.apply(no_whitespace).apply(no_short_reply)

def fast_clean(df, n_workers=16):
    """Clean `df.text` in parallel and return the cleaned Series.

    The text column is cut into contiguous chunks, each chunk is passed to
    `cleaner` in a worker process, and the results are concatenated back in
    the original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `text` column of strings.
    n_workers : int, optional
        Upper bound on the number of worker processes / chunks (default 16).

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as `df.text`; entries rejected by
        `cleaner` are None.
    """
    text = df.text
    if text.empty:
        # Nothing to distribute; avoid starting worker processes at all.
        return cleaner(text)

    # Mapping over a one-element list would hand the whole column to a single
    # worker, so split it into real chunks. Ceiling division keeps the chunk
    # count at or below n_workers.
    n_workers = max(1, min(n_workers, len(text)))
    chunk_size = -(-len(text) // n_workers)
    chunks = [text.iloc[start:start + chunk_size]
              for start in range(0, len(text), chunk_size)]

    with Pool(len(chunks)) as p:
        cleaned_chunks = p.map(cleaner, chunks)

    # Pool.map returns results in submission order, so concatenation restores
    # the original row order and index.
    return pd.concat(cleaned_chunks)

In [None]:
%%time
df['cleaned'] = fast_clean(df)
df = df.dropna()
df = df.reset_index(drop=True)

In [None]:
%%time
tfidf = TfidfVectorizer(stop_words='english')
vecs = tfidf.fit_transform(df.cleaned)
vecs.shape

In [None]:
%%time
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
reduced = svd.fit_transform(vecs)
reduced.shape

In [None]:
%%time
num_docs, vec_dim = reduced.shape

indx = AnnoyIndex(vec_dim, 'angular')
for i in range(num_docs):
    indx.add_item(i, reduced[i])

trees = int(np.log(num_docs).round(0))
print(trees)
indx.build(trees)
indx.save('annoy.ann')

In [None]:
%%time
index = AnnoyIndex(100, 'angular')
index.load('annoy.ann')
for i in index.get_nns_by_item(0,10):
    print(i, df.cleaned[i])

In [None]:
%%time
example = ['heroku']
vec = tfidf.transform(example)
vec = svd.transform(vec)
vec.shape

In [None]:
%%time
for i in index.get_nns_by_vector(vec.ravel(), 10): # Gets the top 5 similar to unseen example embedding
#     print('\n')
    print(i, df.cleaned[i])

In [None]:
# Write the message ids (plus the row index, which to_csv includes by default)
# to message_ids.csv.
# NOTE(review): this rebinds `df` to the single `message_id` column, so the
# `text` and `cleaned` columns are no longer reachable through `df` in any
# later cell — confirm that is intended.
df = df.loc[:, ['message_id']]
df.to_csv('message_ids.csv')

In [None]:
%%writefile insert.py
import pandas as pd
from tqdm import tqdm
import psycopg2
from sqlalchemy import create_engine

db_string = "postgresql://postgres:postgres@postgres/postgres"
db = create_engine(db_string)

df = pd.read_csv('cleaned.csv')

conn = db.raw_connection()
cur = conn.cursor()

for _, row in tqdm(df.iterrows()):
    cur.execute('INSERT INTO cleaned VALUES(%s, %s)', (row['message_id'], row['cleaned']))
conn.commit()
conn.close()

In [None]:
!ls -ahl a.ann

In [None]:
# Persist the fitted vectorizer and SVD model. The context managers make sure
# each file is flushed and closed before the next cell inspects it.
with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)
with open("svd.pkl", "wb") as f:
    pickle.dump(svd, f)

In [None]:
import os

target = 'tfidf.pkl'

# Round-trip check: reload the pickled vectorizer into `test`, but only when
# the file is non-empty. If the file is empty, `test` is left undefined.
if os.path.getsize(target) > 0:
    with open(target, "rb") as f:
        test = pickle.load(f)

In [None]:
# Smoke-test the reloaded vectorizer on a throwaway document. This rebinds
# `vec`, which an earlier cell set to the SVD-reduced example; here it holds
# the vectorizer output only (no SVD step is applied).
vec = test.transform(['blah blah'])

In [None]:
# Display the dimensions of the vector produced by the reloaded vectorizer.
vec.shape

In [None]:
# Dump every row of `df` to cleaned.txt as plain text, one row per line.
# NOTE(review): an earlier cell narrowed `df` to the single `message_id`
# column, so despite the file name this writes ids rather than cleaned text —
# confirm which was intended.
np.savetxt('cleaned.txt', df.to_numpy(), fmt='%s')