In [None]:
# Notebook-wide setup: reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Local checkout of the ASMAT toolkit; it is appended to sys.path in the imports cell.
# NOTE(review): absolute, user-specific path -- must be edited on any other machine.
ASMAT_PATH="/Users/samir/Dev/projects/ASMAT2"

In [None]:
pwd

In [None]:
from collections import Counter
import lda
from math import log
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import os
import pandas as pd
import pickle
import sys

#add ASMAT toolkit
sys.path.append(ASMAT_PATH)
sys.path.append("..")
from ASMAT import vectorizer, embeddings, features
from ASMAT.toolkit import gensimer

In [None]:
#paths
# Project root; every input and output location below hangs off it.
HOME="/Users/samir/Dev/projects/comment_feedback_aligner/fra/"

# raw inputs
FEEDBACK_REQUESTS_PATH = HOME+"DATA/raw/regulations_proposed_rules_feedback.csv"
CIGARRETES_COMMENTS_PATH=HOME+"DATA/raw/cigarettes_regulations.obj"
TOBACCO_COMMENTS_PATH=HOME+"DATA/raw/tobacco_regulations.obj"

# pre-trained embedding files
WORD2VEC=HOME+"DATA/embeddings/skip_50.txt"
GLOVE=HOME+"DATA/embeddings/glove.42B.300d.txt"

# output folders (plain text, pickles, feature vectors)
OUTPUT_TXT = HOME+"DATA/processed/txt/"
OUTPUT_PKL = HOME+"DATA/processed/pkl/"
OUTPUT_VECTORS = HOME+"DATA/processed/vectors/"

# derived artefacts written by the cells below
COMMENTS_PATH=OUTPUT_TXT+"/all_comments.txt"
CORPUS=OUTPUT_TXT+"all_text.txt"
VOCABULARY_PATH=OUTPUT_PKL+"vocabulary.pkl"
IDF_ESTIMATE_PATH=OUTPUT_PKL+"IDF.pkl"

# create any output folder that does not exist yet
for out_dir in (OUTPUT_TXT, OUTPUT_PKL, OUTPUT_VECTORS):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

## Generate Background Corpus

In [None]:
# Minimum query length; not referenced by any other cell in the visible notebook.
MIN_Q_LEN = 100

import string

# English stop words from NLTK, kept in a set for constant-time membership tests.
stop_wordz = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans(dict.fromkeys(string.punctuation))

def preprocess(d):
    """Normalise a document: lowercase, strip punctuation, drop stop words.

    The text is lowercased and split on any whitespace (newlines included).
    Each token has ASCII punctuation removed via the module-level
    ``translator`` table. A token is discarded when

    * it is a stop word before stripping (e.g. ``"don't"``),
    * it is a stop word after stripping (e.g. ``"the,"`` -> ``"the"``), or
    * nothing is left after stripping (punctuation-only tokens).

    Args:
        d: the document as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (possibly ``""``).
    """
    kept = []
    for raw in d.lower().split():
        token = raw.translate(translator)
        # Checking only the raw token lets "the," slip through, and keeping
        # empty tokens leaves runs of spaces in the output.
        if not token or raw in stop_wordz or token in stop_wordz:
            continue
        kept.append(token)
    return " ".join(kept)
     
def extract_comments(path):
    """Load a regulations JSON dump and keep only plain-text public comments.

    A record is kept when its ``documentType`` is ``"Public Submission"``,
    its ``commentText`` is not missing and its ``attachmentCount`` is 0.
    The comment text is returned untouched (embedded newlines are preserved).

    Args:
        path: location of a JSON file readable by ``pandas.read_json``.

    Returns:
        A new DataFrame with the columns ``docketId``, ``documentId`` and
        ``commentText`` (original row index preserved).
    """
    df = pd.read_json(path)
    # One boolean mask instead of successive filtered slices: mutating a
    # slice in place (dropna(inplace=True), column re-assignment) triggers
    # pandas' SettingWithCopyWarning.
    keep = (
        (df["documentType"] == "Public Submission")
        & df["commentText"].notna()
        & (df["attachmentCount"] == 0)
    )
    return df.loc[keep, ["docketId", "documentId", "commentText"]].copy()

In [None]:
#extract comments
# Filter both raw dumps, then write them one after the other into a single CSV:
# the first frame creates the file with a header row, the second is appended
# without one.
comment_frames = [extract_comments(raw_path)
                  for raw_path in (CIGARRETES_COMMENTS_PATH, TOBACCO_COMMENTS_PATH)]
df_cigs, df_tob = comment_frames
for position, frame in enumerate(comment_frames):
    is_first = position == 0
    frame.to_csv(COMMENTS_PATH, header=is_first,
                 mode="w" if is_first else "a", index=False)



In [None]:
#read queries
df = pd.read_csv(FEEDBACK_REQUESTS_PATH)
# Pull the three free-text columns of the feedback requests out as lists.
titles, summaries, requests = (
    df[column].values.tolist()
    for column in ("docket_title", "summary", "feedback_asked")
)
# All comment texts: cigarettes first, then tobacco.
comments = [*df_cigs["commentText"].values.tolist(),
            *df_tob["commentText"].values.tolist()]
all_data = titles + summaries + requests + comments
# Every entry is coerced to str before preprocessing.
all_text = list(map(preprocess, map(str, all_data)))
# One preprocessed document per line.
with open(CORPUS,"w") as f:
    f.write("\n".join(all_text))


In [None]:
#get vocabulary
# Frequency threshold handed to the vocabulary builder (reused by the skip-gram cell).
MIN_WORD_FREQ = 10
vocab = vectorizer.build_vocabulary(all_text, min_freq=MIN_WORD_FREQ)
vocab_size = len(vocab)
print("vocabulary size: {}".format(vocab_size))
#save vocabulary
with open(VOCABULARY_PATH, "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

## Prepare Embeddings

In [None]:
#inverse document frequency
def getIDF(N, t):
    """Return the inverse document frequency ``ln(N / t)``.

    Args:
        N: total number of documents.
        t: number of documents containing the term.

    Both arguments are converted to ``float`` before dividing.
    """
    total_docs = float(N)
    docs_with_term = float(t)
    return log(total_docs / docs_with_term)

In [None]:
# Restore the vocabulary pickled by the vocabulary cell.
with open(VOCABULARY_PATH, "rb") as vocab_file:
    vocab = pickle.load(vocab_file)
# Re-read the corpus, one document per line (line terminators are kept).
with open(CORPUS, "r") as corpus_file:
    all_text = list(corpus_file)

In [None]:
#compute document frequencies
all_idxs, _ = vectorizer.docs2idx(all_text, vocab)
ndocs = len(all_idxs)
# Each term index is counted at most once per document (hence the set()).
docfreq = Counter(str(x) for xs in all_idxs for x in set(xs))
#inverse document frequencies
idfs = {w: getIDF(ndocs, n_docs_with_w) for w, n_docs_with_w in docfreq.items()}
#get an IDF vector
# Size the vector by the vocabulary (or the largest index seen, whichever is
# bigger) rather than by the number of observed terms: otherwise any index
# that never occurs in the corpus makes the assignment below run out of bounds.
# Unobserved terms keep an IDF of 0.
n_terms = max([len(vocab)] + [int(w) + 1 for w in idfs])
idfvec = np.zeros(n_terms)
for w, v in idfs.items():
    idfvec[int(w)] = v
# Same file as OUTPUT_PKL+"/IDF.pkl", via the constant defined for it.
with open(IDF_ESTIMATE_PATH,"wb") as f:
    pickle.dump(idfvec,f)


In [None]:
# Extract embeddings from the pre-trained GloVe file (GLOVE) into
# OUTPUT_PKL+"glove.txt", passing the corpus vocabulary.
# NOTE(review): presumably only vectors for words in `vocab` are kept --
# confirm against embeddings.extract_embeddings.
embeddings.extract_embeddings(GLOVE, OUTPUT_PKL+"glove.txt", vocab)

In [None]:
# Train a skip-gram model on the background corpus, passing the extracted
# GloVe vectors (OUTPUT_PKL+"glove.txt") as pretrained weights; the result
# is written to OUTPUT_PKL+"glove_tuned.txt".
# NOTE(review): VECTOR_DIM presumably has to match the dimensionality of the
# pretrained vectors (the GloVe file name says 300d) -- confirm in gensimer.
VECTOR_DIM=300
NEGATIVE_SAMPLES=10
EPOCHS=5
# The training sequence is read from the corpus file written earlier.
train_seq = gensimer.Word2VecReader([CORPUS])
# MIN_WORD_FREQ is the same threshold used when the vocabulary was built.
w2v = gensimer.get_skipgram(dim=VECTOR_DIM,negative_samples=NEGATIVE_SAMPLES, min_freq=MIN_WORD_FREQ)
w2v_trained = gensimer.train_skipgram(w2v, train_seq, epochs=EPOCHS,
                                      path_out=OUTPUT_PKL+"glove_tuned.txt",
                                      pretrained_weights_path=OUTPUT_PKL+"glove.txt")


In [None]:
#train topic model
#TODO:use all data
all_idxs, _ = vectorizer.docs2idx(all_text, vocab)
n_topics=100
n_iter=100
# Fixed seed: the sampler is stochastic, so without it every re-run of the
# notebook produces a different topic model.
LDA_SEED=1
# Term-count matrix over the corpus; cast to int32 before fitting.
X = features.BOW_freq(all_idxs, vocab, sparse=True)
X = X.astype('int32')
topic_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=LDA_SEED)
topic_model.fit(X)
#save model
# The vocabulary is stored next to the model in the same pickle.
with open(OUTPUT_PKL+"/lda.pkl","wb") as f:
    pickle.dump([topic_model, vocab], f)

##  Get Vectors

In [None]:
def read_comments(path, docket_id=None):
    """Load public comments from a JSON dump and split them into sentences.

    Records are kept when ``documentType`` is ``"Public Submission"``,
    ``commentText`` is present and ``attachmentCount`` is 0. When
    ``docket_id`` is truthy, only that docket's comments are kept.
    Each comment is segmented with ``sent_tokenize`` and each sentence is
    then passed through ``preprocess``.

    Args:
        path: JSON file readable by ``pandas.read_json``.
        docket_id: optional docket identifier to restrict the output to.

    Returns:
        DataFrame with one row per sentence and the columns ``docketID``,
        ``documentID``, ``sentenceID`` (``<documentId>#C<position>``) and
        ``text`` (the preprocessed sentence).
    """
    df = pd.read_json(path)
    # Single boolean mask instead of chained slices: calling
    # dropna(inplace=True) on a filtered slice triggers SettingWithCopyWarning.
    keep = (
        (df["documentType"] == "Public Submission")
        & df["commentText"].notna()
        & (df["attachmentCount"] == 0)
    )
    if docket_id:
        keep = keep & (df["docketId"] == docket_id)
    df = df.loc[keep]
    rows = []
    # Iterate over the three needed columns directly; iterrows() would build
    # a Series per record for no benefit.
    for docket, document, txt in zip(df["docketId"], df["documentId"], df["commentText"]):
        #segment comment into sentences
        for i, s in enumerate(sent_tokenize(txt)):
            rows.append([docket, document, document+"#C"+str(i), s])
    out = pd.DataFrame(rows, columns=["docketID", "documentID", "sentenceID","text"])
    #preprocess text
    out["text"] = out["text"].map(preprocess)
    return out

def get_BOW(docs, idfs):
    """Thin wrapper: ``features.BOW`` over ``docs`` with ``sparse=True``.

    ``idfs`` is forwarded as the second positional argument of ``features.BOW``.
    """
    return features.BOW(docs, idfs, sparse=True)

def get_TFIDF(docs, idfs):
    """Build a TF-IDF matrix: term counts weighted column-wise by ``idfs``.

    Args:
        docs: documents as lists of term indices.
        idfs: 1-D array of IDF weights, one per term index. It is also
            forwarded to ``features.BOW_freq`` in the vocabulary position
            (presumably only its length is used there -- TODO confirm).

    Returns:
        A matrix with one row per document. If ``features.BOW_freq`` returns
        a scipy sparse matrix the result is a CSR matrix, otherwise a dense
        array.
    """
    X = features.BOW_freq(docs, idfs, sparse=True)
    if hasattr(X, "multiply"):
        # scipy sparse matrices have no in-place element-wise product:
        # `X *= idfs` falls back to `X * idfs`, a matrix-vector product that
        # collapses each document to one number. `multiply` broadcasts the
        # weights across rows; CSR keeps row indexing (Q[i]) available.
        return X.multiply(idfs).tocsr()
    return np.multiply(X, idfs)

def get_BOE(docs, E, agg):
    """Thin wrapper: forwards ``docs``, ``E`` and ``agg`` to ``features.BOE``."""
    return features.BOE(docs, E, agg)

def get_topics(docs, vocab, topic_model):
    """Project documents through a fitted topic model.

    Builds the ``features.BOW_freq`` matrix for ``docs`` (``sparse=True``),
    casts it to ``int32`` and returns ``topic_model.transform`` of it.
    """
    counts = features.BOW_freq(docs, vocab, sparse=True).astype('int32')
    return topic_model.transform(counts)

In [None]:
# Reload the vocabulary, then build the query and comment frames for one
# docket, index their text and save both frames as CSV.
with open(VOCABULARY_PATH,"rb") as f:
    vocab = pickle.load(f)
#read comments
target_docket = "FDA-2014-N-0189"
# target_docket = "NPS-2017-0001"
# NOTE(review): get_queries is not defined anywhere in the visible notebook;
# the ranking code later reads a "requestID" column from its result.
df_queries = get_queries(FEEDBACK_REQUESTS_PATH,target_docket)
# NOTE(review): get_comments is not defined either. The similarly named
# read_comments() above parses JSON, whereas COMMENTS_PATH is the CSV written
# by the export cell -- confirm which reader is intended.
df_comments = get_comments(COMMENTS_PATH,target_docket)
# Convert the "text" column of each frame to term indices.
qidxs, _  = vectorizer.docs2idx(df_queries["text"], vocab)
cidxs, _  = vectorizer.docs2idx(df_comments["text"], vocab)
#save queries and comments
df_queries.to_csv(OUTPUT_TXT+"/queries.csv", header=True, index=False)
df_comments.to_csv(OUTPUT_TXT+"/comments.csv", header=True, index=False)

In [None]:
#tf-idf
with open(OUTPUT_PKL+"/IDF.pkl","rb") as f:
    idfvec = pickle.load(f)
queries_tfidf = get_TFIDF(qidxs, idfvec)
print(queries_tfidf.shape)
comments_tfidf = get_TFIDF(cidxs, idfvec)
print(comments_tfidf.shape)
# Store the pair in an explicit 2-element object array. Handing np.save a
# tuple makes numpy stack the two matrices itself, which is ragged (and an
# error on recent numpy) whenever the row counts differ.
tfidf_pair = np.empty(2, dtype=object)
tfidf_pair[0], tfidf_pair[1] = queries_tfidf, comments_tfidf
with open(OUTPUT_VECTORS+"vectors_tfidf.pkl", "wb") as f:
    np.save(f, tfidf_pair, allow_pickle=True)

In [None]:
#pre-trained embeddings (GloVe subset)
agg="sum"
# Read the file the extraction cell actually writes (OUTPUT_PKL+"glove.txt");
# no cell in this notebook produces "word2vec.txt".
E, _ = embeddings.read_embeddings(OUTPUT_PKL+"glove.txt", vocab)
queries_boe = get_BOE(qidxs, E, agg)
print(queries_boe.shape)
comments_boe = get_BOE(cidxs, E, agg)
print(comments_boe.shape)
# Explicit 2-element object array: np.save on a tuple of matrices with
# different row counts is ragged and fails on recent numpy.
boe_pair = np.empty(2, dtype=object)
boe_pair[0], boe_pair[1] = queries_boe, comments_boe
with open(OUTPUT_VECTORS+"vectors_boe.pkl", "wb") as f:
    np.save(f, boe_pair, allow_pickle=True)

In [None]:
#tuned embeddings
agg="sum"
# Read the file the skip-gram cell actually writes
# (OUTPUT_PKL+"glove_tuned.txt"); nothing in this notebook produces
# "word2vec_tuned.txt".
Et, _ = embeddings.read_embeddings(OUTPUT_PKL+"glove_tuned.txt", vocab)
queries_boe = get_BOE(qidxs, Et, agg)
print(queries_boe.shape)
comments_boe = get_BOE(cidxs, Et, agg)
print(comments_boe.shape)
# Explicit 2-element object array: np.save on a tuple of matrices with
# different row counts is ragged and fails on recent numpy.
boe_tuned_pair = np.empty(2, dtype=object)
boe_tuned_pair[0], boe_tuned_pair[1] = queries_boe, comments_boe
with open(OUTPUT_VECTORS+"vectors_boe_tuned.pkl", "wb") as f:
    np.save(f, boe_tuned_pair, allow_pickle=True)

In [None]:
#topics
# The pickle holds [topic_model, vocab]; only the model is needed here.
with open(OUTPUT_PKL+"/lda.pkl","rb") as f:
    topic_model, _ = pickle.load(f)
queries_lda = get_topics(qidxs, vocab, topic_model)
print(queries_lda.shape)
comments_lda = get_topics(cidxs, vocab, topic_model)
print(comments_lda.shape)
# Explicit 2-element object array: np.save on a tuple of matrices with
# different row counts is ragged and fails on recent numpy.
lda_pair = np.empty(2, dtype=object)
lda_pair[0], lda_pair[1] = queries_lda, comments_lda
# `output_pkl` was never defined (NameError); the other vector files are
# written to OUTPUT_VECTORS, so this one goes there too.
with open(OUTPUT_VECTORS+"vectors_lda.pkl", "wb") as f:
    np.save(f, lda_pair, allow_pickle=True)

## Ranking 

In [None]:
def similarity_rank(q, D):
    """Rank the rows of ``D`` by cosine similarity to the query ``q``.

    Args:
        q: query vector of length ``d`` (1-D array, or a 1 x d scipy sparse row).
        D: document matrix of shape ``(n, d)``, dense or scipy sparse.

    Returns:
        ``(rank, ranked_simz)``: ``rank`` holds the row indices of ``D``
        ordered from most to least similar, and ``ranked_simz`` the matching
        cosine similarities. Rows (or a query) with zero norm get similarity 0
        instead of NaN.
    """
    if hasattr(q, "toarray"):
        q = q.toarray()
    q = np.asarray(q, dtype=float).ravel()
    if hasattr(D, "toarray"):
        # Sparse documents: keep D sparse for the product and the norms.
        dots = np.asarray(D.dot(q), dtype=float).ravel()
        row_norms = np.sqrt(np.asarray(D.multiply(D).sum(axis=1), dtype=float).ravel())
    else:
        D = np.atleast_2d(np.asarray(D, dtype=float))
        dots = D.dot(q)
        # Per-row norms: np.linalg.norm(D) without an axis is the Frobenius
        # norm of the whole matrix, which only rescales the dot products and
        # lets long documents dominate the ranking.
        row_norms = np.linalg.norm(D, axis=1)
    denom = row_norms * np.linalg.norm(q)
    simz = np.zeros(dots.shape, dtype=float)
    np.divide(dots, denom, out=simz, where=denom > 0)
    rank = np.argsort(simz)[::-1]
    ranked_simz = simz[rank]
    return rank, ranked_simz

def similarity_ranks(Q, D, queries, comments, top_k = 5):
    """Collect the ``top_k`` best-matching comment sentences for every query.

    Args:
        Q: query matrix, one row per query (row order matches ``queries``).
        D: comment matrix, one row per sentence (row order matches ``comments``).
        queries: DataFrame with a ``requestID`` column.
        comments: DataFrame with a ``sentenceID`` column.
        top_k: number of sentences to keep per query.

    Returns:
        One list per query: the request id, then the ``top_k`` sentence ids,
        then the ``top_k`` similarities as strings rounded to 5 decimals.
    """
    results = []
    n_queries = Q.shape[0]
    for row_idx in range(n_queries):
        request_id = queries.iloc[row_idx]["requestID"]
        order, scores = similarity_rank(Q[row_idx], D)
        best_rows = comments.iloc[order[:top_k]]
        best_sentences = best_rows["sentenceID"].values.tolist()
        best_scores = list(map(str, scores[:top_k].round(5).tolist()))
        results.append([request_id, *best_sentences, *best_scores])

    return results

# Reload the query and comment frames from the CSV files saved by the
# "Get Vectors" section, so the ranking cells use the on-disk copies.
df_queries = pd.read_csv(OUTPUT_TXT+"/queries.csv")
df_comments = pd.read_csv(OUTPUT_TXT+"/comments.csv")


In [None]:
#TF-IDF
with open(OUTPUT_VECTORS+"vectors_tfidf.pkl", "rb") as f:
    # The file stores a pickled pair of matrices, which numpy only reads
    # with allow_pickle=True. Unpickling can execute arbitrary code: only
    # load files produced by this notebook.
    queries_tfidf, comments_tfidf = np.load(f, allow_pickle=True)
results = similarity_ranks(queries_tfidf, comments_tfidf, df_queries, df_comments)
with open(OUTPUT_VECTORS+"rank_tfidf.csv","w") as fo:
    for r in results:
        # str() every field: the request id is not guaranteed to be a string.
        fo.write(",".join(str(x) for x in r)+"\n")

In [None]:
#BOE
with open(OUTPUT_VECTORS+"vectors_boe.pkl", "rb") as f:
    # The file stores a pickled pair of matrices, which numpy only reads
    # with allow_pickle=True. Unpickling can execute arbitrary code: only
    # load files produced by this notebook.
    queries_boe, comments_boe = np.load(f, allow_pickle=True)
results = similarity_ranks(queries_boe, comments_boe, df_queries, df_comments)
with open(OUTPUT_VECTORS+"rank_boe.csv","w") as fo:
    for r in results:
        # str() every field: the request id is not guaranteed to be a string.
        fo.write(",".join(str(x) for x in r)+"\n")

In [None]:
#BOE tuned
with open(OUTPUT_VECTORS+"vectors_boe_tuned.pkl", "rb") as f:
    # The file stores a pickled pair of matrices, which numpy only reads
    # with allow_pickle=True. Unpickling can execute arbitrary code: only
    # load files produced by this notebook.
    queries_boet, comments_boet = np.load(f, allow_pickle=True)
results = similarity_ranks(queries_boet, comments_boet, df_queries, df_comments)
with open(OUTPUT_VECTORS+"rank_boe_tuned.csv","w") as fo:
    for r in results:
        # str() every field: the request id is not guaranteed to be a string.
        fo.write(",".join(str(x) for x in r)+"\n")

In [None]:
ls

In [None]:
pwd