In [None]:
#autoreload mode 2: imported modules are reloaded before each cell runs, so edits to them are picked up live
%load_ext autoreload
%autoreload 2
#render matplotlib figures inside the notebook
%matplotlib inline
#local checkout of the ASMAT toolkit; it is appended to sys.path in the imports cell
ASMAT_PATH="/Users/samir/Dev/projects/ASMAT2"

In [None]:
#show the working directory: the relative sys.path entry ".." added in the imports cell is resolved against it
pwd

In [None]:
from collections import Counter
import lda
from math import log
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import os
import pandas as pd
import pickle
import sys

#add ASMAT toolkit
sys.path.append(ASMAT_PATH)
sys.path.append("..")
from ASMAT import vectorizer, embeddings, features
from ASMAT.toolkit import gensimer

In [None]:
#paths
#project root; every path below is built by plain string concatenation, so the trailing "/" matters
HOME="/Users/samir/Dev/projects/comment_feedback_aligner/"
#raw inputs: feedback requests (read with pd.read_csv) and public comments (read with pd.read_json)
FEEDBACK_REQUESTS_PATH = HOME+"DATA/raw/regulations_proposed_rules_feedback.csv"
COMMENTS_PATH=HOME+"DATA/raw/filtered_final_dockets_ecig.obj"
#pre-trained word embeddings handed to embeddings.extract_embeddings
WORD2VEC=HOME+"DATA/embeddings/skip_50.txt"

#output folders (each ends with "/"); they are created right after these definitions
OUTPUT_TXT = HOME+"DATA/processed/txt/"
OUTPUT_PKL = HOME+"DATA/processed/pkl/"
OUTPUT_VECTORS = HOME+"DATA/processed/vectors/"

#background corpus: the preprocessed texts, one per line
CORPUS=OUTPUT_TXT+"all_text.txt"
#pickled vocabulary
VOCABULARY_PATH=OUTPUT_PKL+"vocabulary.pkl"
#pickled IDF vector
IDF_ESTIMATE_PATH=OUTPUT_PKL+"IDF.pkl"


#create the output folders; exist_ok=True replaces the check-then-create pattern (no race between the
#test and the creation) and keeps the cell safe to re-run
#NOTE: unlike the os.path.exists test, this raises FileExistsError if one of the paths exists as a regular file
for folder in (OUTPUT_TXT, OUTPUT_PKL, OUTPUT_VECTORS):
    os.makedirs(folder, exist_ok=True)

## Generate Background Corpus

In [None]:
#NOTE(review): MIN_Q_LEN is not referenced in any of the visible cells
MIN_Q_LEN = 100

#English stop words from NLTK, kept in a set for fast membership tests in preprocess()
stop_wordz = set(stopwords.words('english'))
import string

#translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', string.punctuation)

def preprocess(d):
    """Normalise a piece of text: lower-case, drop stop words and punctuation.

    Newlines and underscores are turned into spaces and the text is split on
    whitespace. Relies on the notebook-level ``stop_wordz`` (set of stop words)
    and ``translator`` (table that deletes punctuation).

    Args:
        d: the text to clean (str).

    Returns:
        The surviving tokens joined by single spaces (may be the empty string).
    """
    d = d.lower().replace("\n"," ").replace("_"," ")
    kept = []
    for raw in d.split():
        #test the raw token first so that stop words containing an apostrophe ("don't") still match
        if raw in stop_wordz:
            continue
        word = raw.translate(translator)
        #test again after stripping: "the," or "(and" are stop words too, and tokens made only of
        #punctuation become empty and would otherwise leave double spaces in the output
        if word and word not in stop_wordz:
            kept.append(word)
    return " ".join(kept)
     
def get_queries(path, docket_id=None):
    """Load the feedback requests and return one row per sentence.

    The CSV at ``path`` is read with pandas and its "Unnamed: 0" column is
    renamed to "ID". Each row's "feedback_asked" text is split into sentences
    with ``sent_tokenize``; rows for which this raises AttributeError or
    TypeError are skipped.

    Args:
        path: location of the CSV file.
        docket_id: when truthy, only sentences whose ID equals it are kept.

    Returns:
        DataFrame with columns "docketID", "requestID", "text" and "len".
        "requestID" is the ID followed by "#" and the sentence's position
        among all sentences read (counted before any docket filtering),
        "len" is the character length of the raw sentence and "text" is the
        sentence after ``preprocess``.
    """
    requests = pd.read_csv(path).rename(columns={"Unnamed: 0":"ID"})
    sentences = []
    for _, row in requests.iterrows():
        try:
            pieces = sent_tokenize(row["feedback_asked"])
            docid = row["ID"]
            pairs = [[docid, piece] for piece in pieces]
        except (AttributeError, TypeError):
            #row without usable feedback text
            continue
        sentences.extend(pairs)

    #the running index spans every sentence, so IDs are the same with and without a docket filter
    records = [
        [d, d+"#"+str(i), s, len(s)]
        for i, (d, s) in enumerate(sentences)
        if not docket_id or d == docket_id
    ]
    out = pd.DataFrame(records, columns=["docketID", "requestID","text","len"])
    #preprocess text
    out["text"] = out["text"].map(preprocess)
    return out

def get_comments(path, docket_id=None):
    """Load the public comments and return one row per sentence.

    The JSON file at ``path`` is read with pandas. Only entries whose
    "documentType" is "Public Submission" and whose "attachmentCount" is 0 are
    kept, optionally restricted to a single docket.

    Args:
        path: location of the JSON file.
        docket_id: when truthy, only comments with this "docketId" are kept.

    Returns:
        DataFrame with columns "docket_id", "documentId", "sentenceId" and
        "text". "sentenceId" is the documentId followed by "#" and the
        sentence's position inside its comment; "text" is the sentence after
        ``preprocess``.
    """
    df = pd.read_json(path)
    #filter for comments
    df = df[df["documentType"] == "Public Submission"]
    #remove entries with attachments
    df = df[df["attachmentCount"] == 0]
    if docket_id:
        df = df[df["docketId"] == docket_id]
    comments = []
    for _, comment in df.iterrows():
        body = comment["commentText"]
        #a missing or non-text body (e.g. NaN) would crash sent_tokenize; skip it,
        #in the same spirit as get_queries skipping rows it cannot tokenize
        if not isinstance(body, str):
            continue
        doc_id = comment["documentId"]
        #segment comment into sentences
        for i, s in enumerate(sent_tokenize(body)):
            comments.append([comment["docketId"], doc_id, doc_id+"#"+str(i), s])
    df = pd.DataFrame(comments,columns=["docket_id", "documentId", "sentenceId","text"])
    #preprocess text
    df["text"] = df["text"].map(preprocess)
    return df

In [None]:
#read queries
#no docket filter here: all requests and comments are loaded; their text forms the background corpus below
df_queries = get_queries(FEEDBACK_REQUESTS_PATH)
print("queries: {}".format(len(df_queries)))
df_comments = get_comments(COMMENTS_PATH)
print("comments: {}".format(len(df_comments)))

In [None]:
#extract all the text
#TODO: get another background corpus?
all_text = list(df_queries["text"]) + list(df_comments["text"])
#one document per line; the encoding is fixed to UTF-8 so that writing non-ASCII comment text
#does not depend on (or fail under) the platform's default encoding
with open(CORPUS, "w", encoding="utf-8") as f:
    f.write("\n".join(all_text))

In [None]:
#get vocabulary
#capped at 50000 entries via max_words
#NOTE(review): presumably the most frequent words are the ones kept - confirm in ASMAT.vectorizer
vocab = vectorizer.build_vocabulary(all_text, max_words=50000)
print("vocabulary size: {}".format(len(vocab)))
#save vocabulary
with open(VOCABULARY_PATH,"wb") as f:
    pickle.dump(vocab,f)

In [None]:
#peek at a few entries instead of dumping the whole vocabulary (up to 50000 entries) into the cell output
#NOTE(review): assumes vocab is an iterable container such as a word->index dict - confirm
print(list(vocab)[:20])

## Prepare Embeddings

In [None]:
#inverse document frequency
def getIDF(N, t):
    """Return log(N / t), the natural logarithm of the ratio of the two counts.

    Both arguments are converted with float() before dividing; t == 0 raises
    ZeroDivisionError.
    """
    total = float(N)
    containing = float(t)
    ratio = total / containing
    return log(ratio)

In [None]:
#load the vocabulary saved by the background-corpus section
with open(VOCABULARY_PATH,"rb") as f:
    vocab = pickle.load(f)
#read the corpus back as UTF-8, one document per line; the line terminator is removed so the
#documents equal the strings that were written (readlines() would keep a trailing "\n" on each)
with open(CORPUS, "r", encoding="utf-8") as f:
    all_text = [line.rstrip("\n") for line in f]

In [None]:
#compute document frequencies
all_idxs, _ = vectorizer.docs2idx(all_text, vocab)
ndocs = len(all_idxs)
#set(xs) makes every index count at most once per document
docfreq = Counter(str(x) for xs in all_idxs for x in set(xs))
#inverse document frequencies
idfs = {w: getIDF(ndocs, docfreq[w]) for w in docfreq}
#get an IDF vector
#the vector is indexed by word index, so it has to span the whole index space and not just the
#number of distinct indices that were observed: sizing it with len(idfs) raises IndexError as soon
#as some index is >= len(idfs), and yields a vector narrower than the vocabulary
#indices that never occur keep an IDF of 0
idf_size = max([len(vocab)] + [int(w) + 1 for w in idfs])
idfvec = np.zeros(idf_size)
for w, v in idfs.items():
    idfvec[int(w)] = v
#IDF_ESTIMATE_PATH is the path declared for this file in the paths cell
with open(IDF_ESTIMATE_PATH,"wb") as f:
    pickle.dump(idfvec,f)


In [None]:
#extract word2vec embeddings
#input: the pre-trained WORD2VEC file; output: word2vec.txt in the pkl folder
#NOTE(review): presumably only the vectors of words in vocab are written out - confirm in ASMAT.embeddings
embeddings.extract_embeddings(WORD2VEC, OUTPUT_PKL+"word2vec.txt", vocab)


In [None]:
#update word2vec embeddings 
#the training sequence is read from the background corpus file
#NOTE(review): max_sent=20000 presumably caps the number of sentences read - confirm in ASMAT.toolkit.gensimer
train_seq = gensimer.Word2VecReader([CORPUS], max_sent=20000)
#NOTE(review): dim=50 presumably has to match the dimension of the extracted word2vec.txt vectors - confirm
w2v = gensimer.get_skipgram(dim=50,negative_samples=5)
#start from the extracted vectors (pretrained_weights_path) and write the result to word2vec_tuned.txt
w2v_trained = gensimer.train_skipgram(w2v, train_seq, path_out=OUTPUT_PKL+"word2vec_tuned.txt",
                                      pretrained_weights_path=OUTPUT_PKL+"word2vec.txt")


In [None]:
#train topic model
#TODO:use all data
all_idxs, _ = vectorizer.docs2idx(all_text, vocab)
n_topics=50
n_iter=3
#only the first lda_max_docs documents are used for now (see the TODO above)
lda_max_docs=100
#fixed seed: LDA sampling is stochastic, so without it every run produces different topics
lda_seed=1234
X = features.BOW_freq(all_idxs[:lda_max_docs], vocab,sparse=True)
#the counts are cast to int32 before they are handed to the sampler
X = X.astype('int32')
topic_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=lda_seed)
topic_model.fit(X)
#save model
with open(OUTPUT_PKL+"/lda.pkl","wb") as f:
    pickle.dump([topic_model, vocab], f)

## Get Vectors

In [None]:
def docidxs(df):
    """Parse the whitespace-separated index strings in df["idxs"].

    Returns a list with one list of ints per entry of df["idxs"].
    """
    parsed = []
    for line in df["idxs"]:
        parsed.append(list(map(int, line.split())))
    return parsed

def get_BOW(docs, idfs):
    """Return features.BOW(docs, idfs, sparse=True).

    NOTE(review): idfs is passed where other calls pass the vocabulary;
    presumably only its length is used - confirm in ASMAT.features.
    """
    return features.BOW(docs, idfs, sparse=True)

def get_TFIDF(docs, idfs):
    """Return the TF-IDF matrix of ``docs``: term counts scaled per column by ``idfs``.

    Args:
        docs: documents as lists of word indices.
        idfs: 1-D array with one IDF weight per word index. It is also passed
            to features.BOW_freq where other calls pass the vocabulary
            (NOTE(review): presumably only its length is used - confirm).

    Returns:
        A matrix with the same shape as the count matrix; CSR when the counts
        are a scipy sparse matrix.
    """
    X = features.BOW_freq(docs, idfs, sparse=True)
    if hasattr(X, "multiply"):
        #scipy sparse matrix: `X *= idfs` is not element-wise there, it falls back to a
        #matrix-vector product and collapses every document to a single number;
        #multiply() broadcasts the weights over the columns instead
        return X.multiply(idfs).tocsr()
    #dense array: plain broadcasting scales each column by its IDF
    return X * idfs

def get_BOE(docs, E, agg):
    """Return features.BOE(docs, E, agg).

    NOTE(review): presumably a bag-of-embeddings representation in which the
    vectors of E are combined per document according to agg (the notebook
    uses "sum") - confirm in ASMAT.features.
    """
    return features.BOE(docs, E, agg)

def get_topics(docs, vocab, topic_model):
    """Return topic_model.transform applied to the int32 term counts of ``docs``.

    The counts come from features.BOW_freq(docs, vocab, sparse=True).
    """
    counts = features.BOW_freq(docs, vocab, sparse=True).astype('int32')
    return topic_model.transform(counts)

In [None]:
#load the vocabulary that the background-corpus section pickled to VOCABULARY_PATH
with open(VOCABULARY_PATH,"rb") as f:
    vocab = pickle.load(f)


In [None]:
#read comments
#from here on only one docket is used
#NOTE: this rebinds df_queries and df_comments, which held the unfiltered data in the background-corpus section
target_docket = "FDA-2014-N-0189"
# target_docket = "NPS-2017-0001"
df_queries = get_queries(FEEDBACK_REQUESTS_PATH,target_docket)
df_comments = get_comments(COMMENTS_PATH,target_docket)
#map the preprocessed text to vocabulary indices; the second return value of docs2idx is not needed
qidxs, _  = vectorizer.docs2idx(df_queries["text"], vocab)
cidxs, _  = vectorizer.docs2idx(df_comments["text"], vocab)

In [None]:
#tf-idf
#IDF_ESTIMATE_PATH is the path declared for the pickled IDF vector in the paths cell
with open(IDF_ESTIMATE_PATH,"rb") as f:
    idfvec = pickle.load(f)
queries_tfidf = get_TFIDF(qidxs, idfvec)
print(queries_tfidf.shape)
comments_tfidf = get_TFIDF(cidxs, idfvec)
print(comments_tfidf.shape)
#store the pair as an explicit 2-element object array: queries and comments generally differ in
#shape, and handing np.save a ragged tuple raises ValueError on NumPy >= 1.24
#the file still loads with np.load(..., allow_pickle=True) and unpacks into (queries, comments)
tfidf_pair = np.empty(2, dtype=object)
tfidf_pair[0] = queries_tfidf
tfidf_pair[1] = comments_tfidf
with open(OUTPUT_VECTORS+"vectors_tfidf.pkl", "wb") as f:
    np.save(f, tfidf_pair)

In [None]:
#word2vec
agg="sum"
E, _ = embeddings.read_embeddings(OUTPUT_PKL+"word2vec.txt", vocab)
queries_boe = get_BOE(qidxs, E, agg)
print(queries_boe.shape)
comments_boe = get_BOE(cidxs, E, agg)
print(comments_boe.shape)
#store the pair as an explicit 2-element object array: queries and comments generally differ in
#shape, and handing np.save a ragged tuple raises ValueError on NumPy >= 1.24
#the file still loads with np.load(..., allow_pickle=True) and unpacks into (queries, comments)
boe_pair = np.empty(2, dtype=object)
boe_pair[0] = queries_boe
boe_pair[1] = comments_boe
with open(OUTPUT_VECTORS+"vectors_boe.pkl", "wb") as f:
    np.save(f, boe_pair)

In [None]:
#word2vec tuned
agg="sum"
Et, _ = embeddings.read_embeddings(OUTPUT_PKL+"word2vec_tuned.txt", vocab)
#NOTE: queries_boe and comments_boe are rebound here; the untuned vectors only survive on disk
queries_boe = get_BOE(qidxs, Et, agg)
print(queries_boe.shape)
comments_boe = get_BOE(cidxs, Et, agg)
print(comments_boe.shape)
#store the pair as an explicit 2-element object array: queries and comments generally differ in
#shape, and handing np.save a ragged tuple raises ValueError on NumPy >= 1.24
#the file still loads with np.load(..., allow_pickle=True) and unpacks into (queries, comments)
boe_tuned_pair = np.empty(2, dtype=object)
boe_tuned_pair[0] = queries_boe
boe_tuned_pair[1] = comments_boe
with open(OUTPUT_VECTORS+"vectors_boe_tuned.pkl", "wb") as f:
    np.save(f, boe_tuned_pair)

In [None]:
#topics
#the pickle holds [topic_model, vocab]; the vocabulary already loaded in this section is used
with open(OUTPUT_PKL+"/lda.pkl","rb") as f:
    topic_model, _ = pickle.load(f)
queries_lda = get_topics(qidxs, vocab, topic_model)
print(queries_lda.shape)
comments_lda = get_topics(cidxs, vocab, topic_model)
print(comments_lda.shape)
#store the pair as an explicit 2-element object array: queries and comments generally differ in
#shape, and handing np.save a ragged tuple raises ValueError on NumPy >= 1.24
#the file still loads with np.load(..., allow_pickle=True) and unpacks into (queries, comments)
lda_pair = np.empty(2, dtype=object)
lda_pair[0] = queries_lda
lda_pair[1] = comments_lda
#the lowercase name output_pkl is never defined (NameError); the vectors go to OUTPUT_VECTORS
#like the other vector files
with open(OUTPUT_VECTORS+"vectors_lda.pkl", "wb") as f:
    np.save(f, lda_pair)