### Imports and initial config

In [None]:
import numpy as np
from nltk.corpus import wordnet as wn
from stanfordcorenlp import StanfordCoreNLP
import re
import bisect
from collections import defaultdict
import ast
import os
from gutenberg.cleanup import strip_headers
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import math
import gensim
import pickle
from scipy import spatial
from nltk.tree import *
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
import string

In [None]:
# Filesystem locations of the Stanford CoreNLP 2018-02-27 distribution.
# `public` is the path handed to StanfordCoreNLP below; `personal` is an
# alternative location that is not referenced in the cells visible here.
public='/home/users2/mehrotsh/scripts/packages/stanford-corenlp-full-2018-02-27/'
personal='/home/samarth/stanford-corenlp-full-2018-02-27/'

In [52]:
# Module-level CoreNLP client built from the `public` path; getNLPToks calls
# nlp.annotate(...) on it for every sentence that gets parsed.
nlp = StanfordCoreNLP(public)

### Useful Functions

Tree builder

In [None]:
def tree():
    """Return an empty autovivifying mapping.

    Looking up a missing key creates (and stores) another mapping of the same
    kind, so arbitrarily deep paths can be assigned without preparing the
    intermediate levels.
    """
    nested = defaultdict(tree)
    return nested


def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())

def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
        tmpid = treeRef[tmpid]['parid']
    return tmpid


def generateTree(rawTokens, treeRef):
    """Fill `treeRef` with nodes read from a bracketed parse given one line at a time.

    A ROOT node is stored first (under key len(treeRef), with 'curid' 0); the
    first element of `rawTokens` is otherwise ignored. Every following line is
    attached below the node chosen by _findParent_ from its leading whitespace
    and is handled by the first rule that matches:

      1. "(TAG (t1 w1) (t2 w2) ...": one node for TAG, then a tag node and a
         word node (child of the tag node) per inline pair. Pairs that consist
         of two punctuation marks are removed before the pairs are extracted.
      2. "(tag word)": a tag node and a word node below it.
      3. "(punct punct)": ignored.
      4. "(TAG" alone on the line: one node for TAG.

    Lines matching none of these rules are skipped without creating a node.

    Each node is a dict with keys 'curid', 'parid', 'posOrTok', 'indent',
    'children' and 'childrenTok'; the last two are left empty here (flipTree
    fills them). The function mutates `treeRef` and returns None.
    """
    # "(token" with nothing else on the line
    openPat = re.compile(r"^\s*\(([a-zA-Z0-9_']*)\s*$")
    # "(token (tok1 tok2) (tok3 tok4) .... (tokx toky))"
    compositePat = re.compile(r"^\s*\(([a-zA-Z0-9_']+)\s*((?:[(]([a-zA-Z0-9_;.,?'!]+)\s*([a-zA-Z0-9_;\.,?!']+)[)]\s*)+)")
    # "(, ,)" standing alone at the start of the line
    puncLinePat = re.compile(r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)")
    # "(tok1 tok2)" standing alone at the start of the line
    soloPairPat = re.compile(r"^\s*\(([a-zA-Z0-9_']+)\s*([a-zA-Z0-9_']+)\)")
    # "(tok1 tok2)" anywhere inside a composite line
    inlinePairPat = re.compile(r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)")
    # "(punc punc)" anywhere inside a composite line
    inlinePuncPat = re.compile(r"\([,!?.'\"]\s*[,!?.'\"]\)")

    def addNode(nodeId, parentId, label, indent):
        # Every node gets its own fresh child lists.
        treeRef[nodeId] = {'curid': nodeId,
                           'parid': parentId,
                           'posOrTok': label,
                           'indent': indent,
                           'children': [],
                           'childrenTok': []}

    treeRef[len(treeRef)] = {'curid': 0,
                             'parid': -1,
                             'posOrTok': 'ROOT',
                             'indent': 0,
                             'children': [],
                             'childrenTok': []}
    nextId = 1

    for line in rawTokens[1:]:
        indent = _leadingSpaces_(line)
        parentId = _findParent_(indent, nextId - 1, treeRef)

        # Rule 1: constituent followed by inline (tag word) pairs.
        composite = compositePat.match(line)
        if composite is not None:
            headId = nextId
            addNode(headId, parentId, composite.group(1), indent)
            nextId += 1
            pairsText = inlinePuncPat.sub('', composite.group(2))
            for tag, word in inlinePairPat.findall(pairsText):
                addNode(nextId, headId, tag, indent + 2)
                addNode(nextId + 1, nextId, word, indent + 2)
                nextId += 2
            continue

        # Rule 2: a single (tag word) pair.
        solo = soloPairPat.match(line)
        if solo is not None:
            addNode(nextId, parentId, solo.group(1), indent + 2)
            addNode(nextId + 1, nextId, solo.group(2), indent + 2)
            nextId += 2
            continue

        # Rule 3: stand-alone punctuation is dropped.
        if puncLinePat.match(line):
            continue

        # Rule 4: a constituent opened on its own line.
        opener = openPat.match(line)
        if opener is not None:
            addNode(nextId, parentId, opener.group(1), indent)
            nextId += 1
            

def flipTree(treeRef):
    """Derive the downward links of a tree built by generateTree, in place.

    After the call every node's 'children' holds the ids of its children in
    ascending order, and every node with a positive id has 'childrenTok' set
    to the list of its children's 'posOrTok' values. Node 0 is treated
    differently: its 'childrenTok' becomes the 'posOrTok' of node 1 itself
    (a single value, not a list).
    """
    # First pass: register each non-root node with its parent.
    for nodeId, node in treeRef.items():
        if nodeId <= 0:
            continue
        siblings = treeRef[node['parid']]['children']
        bisect.insort(siblings, nodeId)

    # Second pass: translate child ids into their labels.
    for nodeId in treeRef:
        if nodeId <= 0:
            continue
        node = treeRef[nodeId]
        node['childrenTok'] = [treeRef[kid]['posOrTok'] for kid in node['children']]

    treeRef[0]['childrenTok'] = treeRef[1]['posOrTok']


Kernel methods

In [None]:
def _isLeaf_(tree, parentNode):
    return (len(tree[parentNode]['children']) == 0)

def _isPreterminal_(tree, parentNode):
    """Return True when every child of `parentNode` is a leaf.

    A node without children also yields True, because there is no child that
    fails the check.
    """
    return all(_isLeaf_(tree, childId) for childId in tree[parentNode]['children'])

'''
Implementation of the Collins-Duffy or Subset-Tree (SST) Kernel
'''

def _cdHelper_(tree1, tree2, node1, node2, store, lam, SST_ON):
    """Compute the kernel contribution of one node pair into store[node1, node2].

    `store` doubles as a memo table: a cell that is already >= 0 is left
    untouched. Otherwise the cell is set to
      * 0    if either node has no children, or the two nodes differ in
             'posOrTok' or in 'childrenTok';
      * lam  if both nodes are pre-terminals with identical labels/children;
      * lam * product over aligned child pairs of (SST_ON + child value)
             in the remaining case, after recursing into each child pair.

    Nothing is returned; the result is communicated through `store`.
    """
    # Memoised already.
    if store[node1, node2] >= 0:
        return

    # A pair involving a childless node contributes nothing.
    if _isLeaf_(tree1, node1) or _isLeaf_(tree2, node2):
        store[node1, node2] = 0
        return

    left = tree1[node1]
    right = tree2[node2]
    sameProduction = (left['posOrTok'] == right['posOrTok']
                      and left['childrenTok'] == right['childrenTok'])
    if not sameProduction:
        store[node1, node2] = 0
        return

    if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
        store[node1, node2] = lam
        return

    # Same production, not pre-terminal: combine the aligned children.
    product = None
    for position, child1 in enumerate(left['children']):
        child2 = right['children'][position]
        _cdHelper_(tree1, tree2, child1, child2, store, lam, SST_ON)
        factor = SST_ON + store[child1, child2]
        product = factor if product is None else product * factor

    store[node1, node2] = lam * product


def _cdKernel_(tree1, tree2, lam, SST_ON):
    """Return the sum of _cdHelper_ values over all node pairs of the two trees.

    A len(tree1) x len(tree2) table initialised to -1 (meaning "not computed")
    is filled by calling _cdHelper_ for every (i, j) with i, j running over
    0..len-1, and its total is returned.
    """
    rows, cols = len(tree1), len(tree2)
    store = np.full((rows, cols), -1.0)

    for i in range(rows):
        for j in range(cols):
            _cdHelper_(tree1, tree2, i, j, store, lam, SST_ON)

    total = store.sum()
    return total

'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''
def CollinsDuffy(tree1, tree2, lam, NORMALIZE_FLAG, SST_ON):
    """Score two trees with the kernel computed by _cdKernel_.

    Parameters:
        tree1, tree2:   trees as produced by generateTree + flipTree.
        lam:            value passed through to _cdKernel_ as `lam`.
        NORMALIZE_FLAG: truthy to also compute the normalised score.
        SST_ON:         value passed through to _cdKernel_ as `SST_ON`.

    Returns:
        (raw, normalized). Without normalisation the second element is -1.
        With normalisation it is raw / sqrt(K(t1,t1) * K(t2,t2)); when that
        denominator is 0 (a tree with zero self-similarity, e.g. one where no
        line could be turned into a node) the normalised score is reported as
        0.0 instead of dividing by zero, which previously put nan/inf values
        into the score lists that are later passed to max() and sort().
    """
    raw_score = _cdKernel_(tree1, tree2, lam, SST_ON)
    if not NORMALIZE_FLAG:
        return (raw_score, -1)

    t1_score = _cdKernel_(tree1, tree1, lam, SST_ON)
    t2_score = _cdKernel_(tree2, tree2, lam, SST_ON)
    denominator = math.sqrt(t1_score * t2_score)
    if denominator == 0:
        return (raw_score, 0.0)
    return (raw_score, (raw_score / denominator))



'''
Implementation of the Partial Tree (PT) Kernel from:
"Efficient Convolution Kernels for Dependency and Constituent Syntactic Trees"
by Alessandro Moschitti
'''

'''
The delta function is stolen from the Collins-Duffy kernel
'''

def _deltaP_(tree1, tree2, seq1, seq2, store, lam, mu, p):
    """Recursive helper that _delta_ sums over p = 1..min(#children) for a node pair.

    seq1 / seq2 are sequences of node ids (the two children lists, or prefixes
    of them in recursive calls). For p > 0 the function first makes sure
    `store` holds the _delta_ value for the last node of each sequence,
    returns 0 if that value is 0, and otherwise adds up lam-weighted results
    of recursive calls with p-1 on prefixes of both sequences.

    NOTE(review): as written the result is 0 for every input. The p == 0 base
    case returns 0, and every other return value is either 0 or a sum of
    scaled recursive results, which are therefore all 0 as well; the stored
    value of the last pair is only compared against 0, never multiplied in.
    Consequently _delta_ always receives deltaTot == 0. Presumably the base
    case was meant to be non-zero -- confirm against the paper cited above
    before changing it, because every downstream score depends on the current
    behaviour.
    """

    # Base case of the recursion on the subsequence length p.
    if p == 0:
        return 0
    else:
        # generate delta(a,b); side effect: fills store[seq1[-1], seq2[-1]] if unset
        _delta_(tree1, tree2, seq1[-1], seq2[-1], store, lam, mu)
        if store[seq1[-1], seq2[-1]] == 0:
            return 0
        else:
            runningTot = 0
            # i and r are the lengths of the prefixes handed to the recursive call
            for i in range(p-1, len(seq1)-1):
                for r in range(p-1, len(seq2)-1):
                    scaleFactor = pow(lam, len(seq1[:-1])-i+len(seq2[:-1])-r)
                    dp = _deltaP_(tree1, tree2, seq1[:i], seq2[:r], store, lam, mu, p-1)
                    runningTot += (scaleFactor * dp)
            return runningTot

def _delta_(tree1, tree2, node1, node2, store, lam, mu):
    """Compute the value for one node pair into store[node1, node2].

    `store` is used as a memo table: a cell that is already >= 0 is kept.
    Otherwise the cell becomes
      * 0 if either node has no children or the 'posOrTok' labels differ;
      * for two pre-terminals: lam when their 'childrenTok' agree, else 0;
      * mu * (lam**2 + sum of _deltaP_ over p = 1..min(child counts)) in the
        remaining case.

    Nothing is returned; the result is communicated through `store`.
    """
    # Memoised already.
    if store[node1, node2] >= 0:
        return

    # A pair involving a childless node contributes nothing.
    if _isLeaf_(tree1, node1) or _isLeaf_(tree2, node2):
        store[node1, node2] = 0
        return

    left, right = tree1[node1], tree2[node2]

    # Different labels never match.
    if left['posOrTok'] != right['posOrTok']:
        store[node1, node2] = 0
        return

    if _isPreterminal_(tree1, node1) and _isPreterminal_(tree2, node2):
        store[node1, node2] = lam if left['childrenTok'] == right['childrenTok'] else 0
        return

    kids1 = left['children']
    kids2 = right['children']
    longest = min(len(kids1), len(kids2))  # largest subsequence length considered
    deltaTot = sum(_deltaP_(tree1, tree2, kids1, kids2, store, lam, mu, p)
                   for p in range(1, longest + 1))

    store[node1, node2] = mu * (lam ** 2 + deltaTot)

def _ptKernel_(tree1, tree2, lam, mu):
    """Return the sum of _delta_ values over all node pairs of the two trees.

    A len(tree1) x len(tree2) table initialised to -1 (meaning "not computed")
    is filled by calling _delta_ for every (i, j) with i, j running over
    0..len-1, and its total is returned.
    """
    rows, cols = len(tree1), len(tree2)
    store = np.full((rows, cols), -1.0)

    for i in range(rows):
        for j in range(cols):
            _delta_(tree1, tree2, i, j, store, lam, mu)

    total = store.sum()
    return total

'''
Returns a tuple w/ format: (raw, normalized)
If NORMALIZE_FLAG set to False, tuple[1] = -1
'''
def MoschittiPT(tree1, tree2, lam, mu, NORMALIZE_FLAG):
    """Score two trees with the kernel computed by _ptKernel_.

    Parameters:
        tree1, tree2:   trees as produced by generateTree + flipTree.
        lam, mu:        values passed through to _ptKernel_.
        NORMALIZE_FLAG: truthy to also compute the normalised score.

    Returns:
        (raw, normalized). Without normalisation the second element is -1.
        With normalisation it is raw / sqrt(K(t1,t1) * K(t2,t2)); when that
        denominator is 0 (a tree with zero self-similarity, e.g. one where no
        line could be turned into a node) the normalised score is reported as
        0.0 instead of dividing by zero, which previously put nan/inf values
        into the score lists that are later passed to max() and sort().
    """
    raw_score = _ptKernel_(tree1, tree2, lam, mu)
    if not NORMALIZE_FLAG:
        return (raw_score, -1)

    t1_score = _ptKernel_(tree1, tree1, lam, mu)
    t2_score = _ptKernel_(tree2, tree2, lam, mu)
    denominator = math.sqrt(t1_score * t2_score)
    if denominator == 0:
        return (raw_score, 0.0)
    return (raw_score, (raw_score / denominator))

In [None]:
def getNLPToks(rawSentence):
    """Parse `rawSentence` with the module-level CoreNLP client `nlp`.

    The server is asked for JSON output of the tokenize, ssplit, pos and parse
    annotators. Only the FIRST sentence of the response is used, so if the
    input is split into several sentences the rest is ignored.

    Returns:
        {'toks': token list of the first sentence,
         'parse': its bracketed parse split into lines}

    Raises:
        ValueError (json.JSONDecodeError) if the response is not valid JSON;
        IndexError if the response contains no sentence.
    """
    # Local import keeps this cell self-contained. The response is JSON, so it
    # is decoded with the JSON parser: ast.literal_eval only understands
    # Python literals and fails on JSON's true / false / null.
    import json

    output = nlp.annotate(rawSentence, properties={'annotators': 'tokenize,ssplit,pos,parse','outputFormat': 'json','timeout':'50000'})
    output = json.loads(output)
    firstSentence = output['sentences'][0]
    return {
        'toks': firstSentence['tokens'],
        'parse': firstSentence['parse'].split("\n"),
    }

In [None]:
def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Average the vectors of the known words of a whitespace-split sentence.

    Parameters:
        sentence:       text; split on whitespace, no further normalisation.
        model:          object indexable by word, returning that word's vector.
        num_features:   length of the vectors.
        index2word_set: container of the words for which `model` has a vector.

    Returns:
        The mean of the vectors of all words found in `index2word_set`
        (repeated words count each time), or a float32 zero vector of length
        `num_features` when no word is known.
    """
    knownWords = [word for word in sentence.split() if word in index2word_set]

    feature_vec = np.zeros((num_features, ), dtype='float32')
    for word in knownWords:
        feature_vec = np.add(feature_vec, model[word])

    if knownWords:
        feature_vec = np.divide(feature_vec, len(knownWords))
    return feature_vec

In [None]:
def getDuffyScore(sent1,sent2):
    """Parse both sentences and return CollinsDuffy's (raw, normalized) scores.

    CollinsDuffy is called with lam=0.8, normalisation switched on and
    SST_ON=1.
    """
    builtTrees = []
    for sentence in (sent1, sent2):
        parsed = getNLPToks(sentence)
        sentenceTree = tree()
        generateTree(parsed['parse'], sentenceTree)
        flipTree(sentenceTree)
        builtTrees.append(sentenceTree)

    rawScore, normScore = CollinsDuffy(builtTrees[0], builtTrees[1], 0.8, 1, 1)
    return rawScore, normScore

In [None]:
def getMoschittiScore(sent1,sent2):
    """Parse both sentences and return MoschittiPT's normalized score only.

    MoschittiPT is called with lam=0.8, mu=1 and normalisation switched on;
    the raw score it also returns is discarded.
    """
    builtTrees = []
    for sentence in (sent1, sent2):
        parsed = getNLPToks(sentence)
        sentenceTree = tree()
        generateTree(parsed['parse'], sentenceTree)
        flipTree(sentenceTree)
        builtTrees.append(sentenceTree)

    scores = MoschittiPT(builtTrees[0], builtTrees[1], 0.8, 1, 1)
    return scores[1]

### Testing on Project Gutenberg samples

Creating parse trees for the new text

In [None]:
# Load the text under test, drop the Gutenberg header/footer, flatten it to a
# single line and split it into sentences longer than one character.
test="./new/pierre.txt"
# The context manager closes the handle, which the original left open.
with open(test) as testB:
    raw=testB.read()
text = strip_headers(raw).strip()
text=text.replace('\n',' ')
text=sent_tokenize(text)
text = [sentence for sentence in text if len(sentence)>1]

In [None]:
len(text)

In [None]:
# Build one flipped parse tree per sentence of `text`, printing the sentence
# index as progress. enumerate replaces the hand-maintained counter.
parseTrees=list()
for i, sent in enumerate(text):
    print(i)
    sentParse=getNLPToks(sent)
    tempTree=tree()
    generateTree(sentParse['parse'],tempTree)
    flipTree(tempTree)
    parseTrees.append(tempTree)

In [None]:
len(parseTrees)

Loading candidates and creating parse trees

In [None]:
potential="./potential/"

In [None]:
potentialParseTrees=dict()

In [None]:
# For every candidate file: read it, strip the Gutenberg boilerplate, split it
# into sentences and store one flipped parse tree per sentence under the file name.
for file in os.listdir(potential):
    print(file)
    # Close the handle once read; the original reused `candidate` for both the
    # open file and the text and never closed the file.
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    candidate=rawtext.replace('\n',' ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>1]
    pTrees=list()
    for sent in candidate:
        sentParse=getNLPToks(sent)
        tempTree=tree()
        generateTree(sentParse['parse'],tempTree)
        flipTree(tempTree)
        pTrees.append(tempTree)
    potentialParseTrees[file]=pTrees
    
    

In [None]:
# allScores[i][file][j] = normalised MoschittiPT score between sentence i of the
# text and sentence j of candidate `file`. Progress is printed every 10 sentences.
allScores=list()
# The directory is listed once instead of once per sentence.
candidateFiles=os.listdir(potential)
for i, tr in enumerate(parseTrees):
    if i%10==0:
        print(i)
    sentScoreDict=dict()
    for file in candidateFiles:
        sentScoreDict[file]=[MoschittiPT(tr, bTree, 0.8, 1, 1)[1]
                             for bTree in potentialParseTrees[file]]
    allScores.append(sentScoreDict)
            

In [None]:
len(allScores)

In [None]:
text[174]

In [None]:
allScores=allScores[:-1]

In [None]:
# books[file] = sentence list of each candidate, produced with the same steps
# as the parse-tree cell above so that sentence indices line up.
books=dict()
for file in os.listdir(potential):
    print(file)
    # Close the handle once read (the original never closed it).
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    candidate=rawtext.replace('\n',' ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>1]
    books[file]=candidate

In [None]:
# For sentences 60..69 of the text, show the best-scoring sentence of every candidate.
for i in range(60,70):
    print('Sentence',i)
    print('Original Sent',text[i])
    for book in os.listdir(potential):
        print(book)
        bookScores=allScores[i][book]
        maxIndex=bookScores.index(max(bookScores))
        print('Score',bookScores[maxIndex])
        print('Similar sentence:',books[book][maxIndex])
    print('\n\n')

In [None]:
allScores[600]['2.txt'].index(max(allScores[600]['2.txt']))

In [None]:
len(allScores)

In [None]:
len(allScores[0]['5.txt'])

In [None]:
scoreTuples=list()

In [None]:
# Flatten allScores into (text sentence index, file, candidate sentence index, score)
# tuples appended to scoreTuples. The unused `scoreTuple` placeholder is gone
# and the directory is listed once rather than once per sentence.
candidateFiles=os.listdir(potential)
for i, sentScores in enumerate(allScores):
    for fl in candidateFiles:
        scoreTuples.extend((i,fl,j,score) for j, score in enumerate(sentScores[fl]))

In [None]:
len(scoreTuples)

In [None]:
scoreTuples.sort(key=lambda tup: tup[3],reverse=True)

In [None]:
print(scoreTuples[0:100])

### Testing on Bible sentences

Two related sentences - high score

In [None]:
sent1='Behold, a virgin shall conceive and bear a son, and his name shall be called Emmanuel'

In [None]:
sent2='behold, a virgin shall conceive in the womb, and shall bring forth a son, and thou shalt call his name Emmanuel.'

In [None]:
getMoschittiScore(sent1,sent2)

Two related sentences - high score

In [None]:
sent3='And thou, Bethlehem, in the land of Juda, art not the least among the princes of Juda: for out of thee shall come a Governor, that shall rule my people Israel'

In [None]:
sent4='And thou, Bethleem, house of Ephratha, art few in number to be reckoned among the thousands of Juda; yet out of thee shall one come forth to me, to be a ruler of Israel'

In [None]:
getMoschittiScore(sent3,sent4)

Two sentences that are not highly related, not such a high score

In [None]:
getMoschittiScore(sent1,sent3)

Similar sentences

In [None]:
sent5='In Rama was there a voice heard, lamentation, and weeping, and great mourning, Rachel weeping for her children, and would not be comforted because they are not.'

In [None]:
sent6='A voice was heard in Rama, of lamentation, and of weeping, and wailing; Rachel would not cease weeping for her children, because they are not.'

In [None]:
getMoschittiScore(sent5,sent6)

Not very similar

In [None]:
getMoschittiScore(sent5,sent3)

Similar Sentences

In [None]:
sent7=' Then saith Jesus unto them, All ye shall be offended because of me this night: for it is written, I will smite the shepherd, and the sheep of the flock shall be scattered abroad.'

In [None]:
sent8='Awake, O sword, against my shepherds, and against the man who is my citizen, saith the Lord Almighty: smite the shepherds, and draw out the sheep: and I will bring mine hand upon the little ones'

In [None]:
getMoschittiScore(sent7,sent8)

Not very similar but still a reasonably high score (False positive), might be a parsing error

In [None]:
getMoschittiScore(sent7,sent3)

Similar sentences

In [None]:
sent9='Jesus said unto him, Thou shalt love the Lord thy God with all thy heart, and with all thy soul, and with all thy mind.'

In [None]:
sent10='And thou shalt love the Lord thy God with all thy mind, and with all thy soul, and all thy strength'

In [None]:
getMoschittiScore(sent9,sent10)

Not similar

In [None]:
getMoschittiScore(sent9,sent1)

### Testing on chunks of the bible 

In [None]:
# Directory holding the candidate texts and the names of the files in it.
# NOTE(review): os.listdir returns names in arbitrary order; later cells assume
# every listing of this directory yields the same order.
potential="./potential/"
booksList=os.listdir(potential)

In [None]:
# Load the text under test, flatten newlines, turn ':' into sentence breaks and
# keep sentences longer than five characters.
test="./new/matthew"
# The context manager closes the handle, which the original left open.
with open(test) as testB:
    raw=testB.read()
text = strip_headers(raw).strip()
text=text.replace('\n',' ')
text=text.replace(':','. ')
text=sent_tokenize(text)
text = [sentence for sentence in text if len(sentence)>5]

In [None]:
# Build one flipped parse tree per sentence of `text`, printing the sentence
# index as progress. enumerate replaces the hand-maintained counter.
parseTrees=list()
for i, sent in enumerate(text):
    print(i)
    sentParse=getNLPToks(sent)
    tempTree=tree()
    generateTree(sentParse['parse'],tempTree)
    flipTree(tempTree)
    parseTrees.append(tempTree)

In [None]:
# Persist the parse trees. The with-block flushes and closes the file; the
# original never closed it, so the pickle could be left incomplete on disk.
with open("./tempOutput/parseTrees.pickle","wb") as pickling_on:
    pickle.dump(parseTrees, pickling_on)

In [None]:
potential="./potential/"

In [None]:
potentialParseTrees=dict()

In [None]:
# For every candidate file: read it, split it into sentences longer than five
# characters and store one flipped parse tree per sentence under the file name.
for file in os.listdir(potential):
    print(file)
    # Close the handle once read (the original never closed it).
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    # NOTE(review): the original replaced newlines first and then overwrote
    # that result by calling replace(':', '. ') on `rawtext` again, so newlines
    # are NOT collapsed for candidates (unlike for `text`). That behaviour is
    # kept here on purpose: the cell that builds `books` must tokenise
    # identically for sentence indices to line up, so both cells have to be
    # changed together.
    candidate=rawtext.replace(':','. ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>5]
    pTrees=list()
    for sent in candidate:
        sentParse=getNLPToks(sent)
        tempTree=tree()
        generateTree(sentParse['parse'],tempTree)
        flipTree(tempTree)
        pTrees.append(tempTree)
    potentialParseTrees[file]=pTrees
    

In [None]:
# Persist the candidate parse trees. The with-block flushes and closes the
# file; the original never closed it.
with open("./tempOutput/potentialParseTrees.pickle","wb") as pickling_on:
    pickle.dump(potentialParseTrees, pickling_on)

In [None]:
# allScores[i][file][j] = normalised MoschittiPT score between sentence i of the
# text and sentence j of candidate `file`. Progress is printed every 10 sentences.
allScores=list()
# The directory is listed once instead of once per sentence.
candidateFiles=os.listdir(potential)
for i, tr in enumerate(parseTrees):
    if i%10==0:
        print(i)
    sentScoreDict=dict()
    for file in candidateFiles:
        sentScoreDict[file]=[MoschittiPT(tr, bTree, 0.8, 1, 1)[1]
                             for bTree in potentialParseTrees[file]]
    allScores.append(sentScoreDict)
            

In [None]:
# Persist the score table. The with-block flushes and closes the file; the
# original never closed it.
with open("./tempOutput/allScores.pickle","wb") as pickling_on:
    pickle.dump(allScores, pickling_on)

In [None]:
# Reload the score table written by this notebook, closing the file afterwards
# (the original left it open). Only unpickle files you produced yourself.
with open("./tempOutput/allScores.pickle","rb") as pickle_off:
    allScores = pickle.load(pickle_off)

In [None]:
# books[file] = sentence list of each candidate (sentences longer than five characters).
books=dict()
for file in os.listdir(potential):
    print(file)
    # Close the handle once read (the original never closed it).
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    # NOTE(review): the original replaced newlines first and then overwrote
    # that result by calling replace(':', '. ') on `rawtext` again, so newlines
    # are NOT collapsed for candidates (unlike for `text`). That behaviour is
    # kept here on purpose: the cell that builds the candidate parse trees must
    # tokenise identically for sentence indices to line up, so both cells have
    # to be changed together.
    candidate=rawtext.replace(':','. ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>5]
    books[file]=candidate

In [None]:
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) 

In [None]:
index2word_set = set(model.wv.index2word)

In [None]:
scoreTuples=list()

In [None]:
# Append (text idx, file, candidate idx, syntactic score, semantic score, mean of both)
# for every scored sentence pair. The semantic score is 1 - cosine distance of
# the averaged word vectors.
# Each candidate sentence is embedded once and reused; the original recomputed
# its vector for every sentence of the text. The unused `scoreTuple` is dropped.
candidateVecs={}
for i, sentScores in enumerate(allScores):
    s1v=avg_feature_vector(text[i],model,300,index2word_set)
    for fl in os.listdir(potential):
        for j, score in enumerate(sentScores[fl]):
            if (fl,j) not in candidateVecs:
                candidateVecs[(fl,j)]=avg_feature_vector(books[fl][j],model,300,index2word_set)
            s2v=candidateVecs[(fl,j)]
            semanticScore=1 - spatial.distance.cosine(s1v, s2v)
            scoreTuples.append((i,fl,j,score,semanticScore,(score+semanticScore)/2))

In [None]:
len(scoreTuples)

In [None]:
scoreTuples.sort(key=lambda tup: tup[5],reverse=True)

In [None]:
scoreTuples[0:10]

In [None]:
# Show the ten best-ranked pairs: the text sentence, the source file, the
# syntactic score and the matching candidate sentence.
for t in scoreTuples[0:10]:
    sentIdx, bookName, candIdx, syntacticScore = t[:4]
    print('Original Sentence: ',text[sentIdx])
    print('Similar Sentence is from: ',bookName)
    print('Score: ',syntacticScore)
    print(books[bookName][candIdx])
    print('\n\n')

### New approach: Semantic filtering using TFIDF before parsing and final semantic filtering

In [None]:
# Directory holding the candidate texts and the names of the files in it.
# NOTE(review): os.listdir returns names in arbitrary order; later cells assume
# every listing of this directory yields the same order.
potential="./potential/"
booksList=os.listdir(potential)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the text under test, flatten newlines, turn ':' into sentence breaks and
# keep sentences longer than five characters.
test="./new/matthew"
# The context manager closes the handle, which the original left open.
with open(test) as testB:
    raw=testB.read()
text = strip_headers(raw).strip()
text=text.replace('\n',' ')
text=text.replace(':','. ')
text=sent_tokenize(text)
text = [sentence for sentence in text if len(sentence)>5]

In [None]:
# books[file] = sentence list of each candidate, prepared exactly like `text`:
# newlines flattened, ':' turned into sentence breaks, short sentences dropped.
books=dict()
for file in booksList:
    print(file)
    # Close the handle once read (the original never closed it).
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    candidate=rawtext.replace('\n',' ')
    # Chain on `candidate`: the original called replace on `rawtext` again,
    # which threw away the newline replacement of the previous line.
    candidate=candidate.replace(':','. ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>5]
    books[file]=candidate

TF-IDF based filtering

In [None]:
# Corpus for TF-IDF: the sentences of the text first, then every candidate's
# sentences. Candidates are appended in `booksList` order -- the order the
# scoring loop uses to map matrix rows back to sentences -- instead of a fresh
# directory listing.
corpus=list(text)
for fl in booksList:
    corpus.extend(books[fl])

In [None]:
vectorizer = TfidfVectorizer(min_df=1)

In [None]:
X = vectorizer.fit_transform(corpus)

In [None]:
X.shape

In [None]:
# tfIDFScores[i][book] = [(cosine similarity, k), ...] between sentence i of the
# text and sentence k of `book`, computed on the TF-IDF rows of X.
#
# Rows of X follow the corpus layout: the sentences of `text`, then each book's
# sentences one book after another (assumed to be in `booksList` order). The
# original used len(text)+k as the row for every book, so all books after the
# first were scored against the first book's rows; the per-book offset fixes that.
tfIDFScores=[]
bookOffsets={}
offset=len(text)
for fl in booksList:
    bookOffsets[fl]=offset
    offset+=len(books[fl])

for i in range(len(text)):
    # Densify the query row once per sentence; ravel() yields the 1-D vector
    # that scipy's cosine distance is documented to take.
    queryVec=X[i].toarray().ravel()
    scoresDict={}
    for fl in booksList:
        bookScore=[]
        for k in range(len(books[fl])):
            j=bookOffsets[fl]+k
            simScore=1-spatial.distance.cosine(queryVec, X[j].toarray().ravel())
            bookScore.append((simScore,k))
        scoresDict[fl]=bookScore
    tfIDFScores.append(scoresDict)
        


In [None]:
# Keep only the candidate sentences whose TF-IDF similarity exceeds 0.2.
for sent in tfIDFScores:
    for book in booksList:
        sent[book]=[tup for tup in sent[book] if tup[0]>0.2]

In [None]:
# One empty index list per candidate book.
reducedSentences={book: list() for book in booksList}

In [None]:
# Gather, per book, the indices of all candidate sentences that survived the filter.
for sent in tfIDFScores:
    for book in booksList:
        reducedSentences[book].extend(x[1] for x in sent[book])

In [None]:
# Remove duplicate indices (a candidate may match several text sentences).
for book in booksList:
    uniqueIndices=set(reducedSentences[book])
    reducedSentences[book]=list(uniqueIndices)

In [None]:
len(reducedSentences['isaiah.txt'])

In [None]:
# One empty sentence list per candidate book.
reducedBooks={book: list() for book in booksList}

In [None]:
# Translate the surviving sentence indices back into the sentences themselves.
for book in booksList:
    reducedBooks[book].extend(books[book][sent] for sent in reducedSentences[book])

In [None]:
# Load the text under test, flatten newlines, turn ':' into sentence breaks and
# keep sentences longer than five characters.
test="./new/matthew"
# The context manager closes the handle, which the original left open.
with open(test) as testB:
    raw=testB.read()
text = strip_headers(raw).strip()
text=text.replace('\n',' ')
text=text.replace(':','. ')
text=sent_tokenize(text)
text = [sentence for sentence in text if len(sentence)>5]

In [None]:
# Build one flipped parse tree per sentence of `text`, printing the sentence
# index as progress. enumerate replaces the hand-maintained counter.
parseTrees=list()
for i, sent in enumerate(text):
    print(i)
    sentParse=getNLPToks(sent)
    tempTree=tree()
    generateTree(sentParse['parse'],tempTree)
    flipTree(tempTree)
    parseTrees.append(tempTree)

In [None]:
# Reload the parse trees written by this notebook, closing the file afterwards
# (the original left it open). Only unpickle files you produced yourself.
with open("./tempOutput/parseTrees.pickle","rb") as pickle_off:
    parseTrees = pickle.load(pickle_off)

In [None]:
potentialParseTrees=dict()

In [None]:
len(books['isaiah.txt'])

In [None]:
len(reducedBooks['isaiah.txt'])

In [None]:
# Parse only the pre-filtered candidate sentences: one flipped tree per
# sentence of reducedBooks[book], stored under the book name.
for book in booksList:
    print(book)
    candidate=reducedBooks[book]
    pTrees=[]
    for sent in candidate:
        parsedSentence=getNLPToks(sent)
        sentenceTree=tree()
        generateTree(parsedSentence['parse'],sentenceTree)
        flipTree(sentenceTree)
        pTrees.append(sentenceTree)
    potentialParseTrees[book]=pTrees
    

In [None]:
# allScores[i][book][j] = normalised MoschittiPT score between sentence i of the
# text and the j-th pre-filtered sentence of `book`. Progress is printed every
# 10 sentences; enumerate replaces the hand-maintained counter.
allScores=list()
for i, tr in enumerate(parseTrees):
    if i%10==0:
        print(i)
    sentScoreDict=dict()
    for book in booksList:
        sentScoreDict[book]=[MoschittiPT(tr, bTree, 0.8, 1, 1)[1]
                             for bTree in potentialParseTrees[book]]
    allScores.append(sentScoreDict)
            

In [None]:
# Load the word2vec vectors from the local GoogleNews binary and keep the
# vocabulary as a set for the membership tests done in avg_feature_vector.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) 
index2word_set = set(model.wv.index2word)

In [None]:
scoreTuples=list()

In [None]:
# Append (text idx, book, candidate idx, syntactic score, semantic score, mean of both)
# for every scored sentence pair. The semantic score is 1 - cosine distance of
# the averaged word vectors.
# Each candidate sentence is embedded once and reused; the original recomputed
# its vector for every sentence of the text.
candidateVecs={}
for i, sentScores in enumerate(allScores):
    s1v=avg_feature_vector(text[i],model,300,index2word_set)
    for fl in booksList:
        for j, score in enumerate(sentScores[fl]):
            if (fl,j) not in candidateVecs:
                candidateVecs[(fl,j)]=avg_feature_vector(reducedBooks[fl][j],model,300,index2word_set)
            s2v=candidateVecs[(fl,j)]
            semanticScore=1 - spatial.distance.cosine(s1v, s2v)
            scoreTuples.append((i,fl,j,score,semanticScore,(score+semanticScore)/2))

In [None]:
scoreTuples.sort(key=lambda tup: tup[5],reverse=True)

In [None]:
# Show the ten best-ranked pairs with both of their scores.
for t in scoreTuples[0:10]:
    sentIdx, bookName, candIdx, syntacticScore, semanticScore = t[:5]
    print('Original Sentence: ',text[sentIdx])
    print('Similar Sentence is from: ',bookName)
    print('Syntactic Score: ',syntacticScore)
    print('Semantic Score: ',semanticScore)
    print(reducedBooks[bookName][candIdx])
    print('\n\n')

In [None]:
# Reload the parse trees written by this notebook, closing the file afterwards
# (the original left it open). Only unpickle files you produced yourself.
with open("./tempOutput/parseTrees.pickle","rb") as pickle_off:
    parseTrees = pickle.load(pickle_off)

In [None]:
sent1='23 Behold, a virgin shall be with child, and shall bring forth a son, and they shall call his name Emmanuel, which being interpreted is, God with us.'

In [None]:
out1=getNLPToks(sent1)

In [None]:
x=out1['parse']

In [None]:
# Glue the parse lines back into one bracketed string (no separator is added,
# as before); join replaces the character-by-character concatenation loop.
s=''.join(x)

In [None]:
y=Tree.fromstring(s)

In [None]:
# TreeView is only imported by a later cell of this notebook, so running the
# cells top to bottom raised NameError here; import it where it is used.
from nltk.draw.tree import TreeView
TreeView(y)._cframe.print_to_file('output.ps')

In [None]:
from nltk.draw.tree import TreeView

In [None]:
os.system('convert output.ps output.png')

In [None]:
from IPython.display import Image
Image("output.png")

### Using the Jaccard index for initial filtering

In [None]:
# Tokens ignored by jacardScore: NLTK's English stop words, every punctuation
# character, the empty string and the archaic pronoun 'thou'. Held in a set
# because it is only used for membership tests, one per token.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(string.punctuation)
stopwords.add('')
stopwords.add('thou')

In [None]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [None]:
def jacardScore(a, b):
    """Return the Jaccard index of the content-word sets of two sentences.

    Each sentence is tokenised with the module-level `tokenizer`; tokens are
    lower-cased, stripped of surrounding punctuation and dropped when they are
    in the module-level `stopwords`. The score is
    |A intersect B| / |A union B|.

    Returns 0.0 when neither sentence has any token left (previously that
    case raised ZeroDivisionError and aborted the whole scoring loop).
    """
    def contentTokens(sentence):
        # Normalise every token once, then drop the stop words.
        normalised = (token.lower().strip(string.punctuation)
                      for token in tokenizer.tokenize(sentence))
        return {token for token in normalised if token not in stopwords}

    tokens_a = contentTokens(a)
    tokens_b = contentTokens(b)
    union = tokens_a | tokens_b
    if not union:
        return 0.0
    ratio = len(tokens_a & tokens_b) / float(len(union))
    return ratio

In [None]:
# Directory holding the candidate texts and the names of the files in it.
# NOTE(review): os.listdir returns names in arbitrary order; later cells assume
# every listing of this directory yields the same order.
potential="./potential/"
booksList=os.listdir(potential)

In [None]:
# Load the text under test, flatten newlines, turn ':' into sentence breaks and
# keep sentences longer than five characters.
test="./new/matthew"
# The context manager closes the handle, which the original left open.
with open(test) as testB:
    raw=testB.read()
text = strip_headers(raw).strip()
text=text.replace('\n',' ')
text=text.replace(':','. ')
text=sent_tokenize(text)
text = [sentence for sentence in text if len(sentence)>5]

In [None]:
# books[file] = sentence list of each candidate, prepared exactly like `text`:
# newlines flattened, ':' turned into sentence breaks, short sentences dropped.
books=dict()
for file in booksList:
    print(file)
    # Close the handle once read (the original never closed it).
    with open(potential+file) as candidateFile:
        rawtext=candidateFile.read()
    rawtext = strip_headers(rawtext).strip()
    candidate=rawtext.replace('\n',' ')
    # Chain on `candidate`: the original called replace on `rawtext` again,
    # which threw away the newline replacement of the previous line.
    candidate=candidate.replace(':','. ')
    candidate=sent_tokenize(candidate)
    candidate = [sentence for sentence in candidate if len(sentence)>5]
    books[file]=candidate

In [None]:
# jacardScores[i][book] = [(Jaccard score, k), ...] between sentence i of the
# text and sentence k of `book`. Direct iteration replaces range(len(...)) indexing.
jacardScores=[]
for sentence in text:
    scoresDict={}
    for book in booksList:
        scoresDict[book]=[(jacardScore(sentence, candidateSentence),k)
                          for k, candidateSentence in enumerate(books[book])]
    jacardScores.append(scoresDict)
        

In [None]:
# Keep only the candidate sentences whose Jaccard score exceeds 0.15.
for sent in jacardScores:
    for book in booksList:
        sent[book]=[tup for tup in sent[book] if tup[0]>0.15]

In [None]:
# One empty index list per candidate book.
reducedSentences={book: list() for book in booksList}

In [None]:
# Gather, per book, the indices of all candidate sentences that survived the filter.
for sent in jacardScores:
    for book in booksList:
        reducedSentences[book].extend(x[1] for x in sent[book])

In [None]:
# Remove duplicate indices (a candidate may match several text sentences).
for book in booksList:
    uniqueIndices=set(reducedSentences[book])
    reducedSentences[book]=list(uniqueIndices)

In [None]:
len(reducedSentences['isaiah.txt'])

In [None]:
# One empty sentence list per candidate book.
reducedBooks={book: list() for book in booksList}

In [None]:
# Translate the surviving sentence indices back into the sentences themselves.
for book in booksList:
    reducedBooks[book].extend(books[book][sent] for sent in reducedSentences[book])

In [None]:
# Build one flipped parse tree per sentence of `text`, printing the sentence
# index as progress. enumerate replaces the hand-maintained counter.
parseTrees=list()
for i, sent in enumerate(text):
    print(i)
    sentParse=getNLPToks(sent)
    tempTree=tree()
    generateTree(sentParse['parse'],tempTree)
    flipTree(tempTree)
    parseTrees.append(tempTree)

In [49]:
# Reload the parse trees written by this notebook, closing the file afterwards
# (the original left it open). Only unpickle files you produced yourself.
with open("./tempOutput/parseTrees.pickle","rb") as pickle_off:
    parseTrees = pickle.load(pickle_off)

In [50]:
potentialParseTrees=dict()

In [53]:
# Parse only the pre-filtered candidate sentences: one flipped tree per
# sentence of reducedBooks[book], stored under the book name.
for book in booksList:
    print(book)
    candidate=reducedBooks[book]
    pTrees=[]
    for sent in candidate:
        parsedSentence=getNLPToks(sent)
        sentenceTree=tree()
        generateTree(parsedSentence['parse'],sentenceTree)
        flipTree(sentenceTree)
        pTrees.append(sentenceTree)
    potentialParseTrees[book]=pTrees

isaiah.txt
micah.txt


In [54]:
# allScores[i][book][j] = normalised MoschittiPT score between sentence i of the
# text and the j-th pre-filtered sentence of `book`. Progress is printed every
# 10 sentences; enumerate replaces the hand-maintained counter.
allScores=list()
for i, tr in enumerate(parseTrees):
    if i%10==0:
        print(i)
    sentScoreDict=dict()
    for book in booksList:
        sentScoreDict[book]=[MoschittiPT(tr, bTree, 0.8, 1, 1)[1]
                             for bTree in potentialParseTrees[book]]
    allScores.append(sentScoreDict)
            

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [55]:
# Load the word2vec vectors from the local GoogleNews binary and keep the
# vocabulary as a set for the membership tests done in avg_feature_vector.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) 
index2word_set = set(model.wv.index2word)

  


In [56]:
scoreTuples=list()

In [57]:
# Append (text idx, book, candidate idx, syntactic score, semantic score, mean of both)
# for every scored sentence pair. The semantic score is 1 - cosine distance of
# the averaged word vectors.
# Each candidate sentence is embedded once and reused; the original recomputed
# its vector for every sentence of the text.
candidateVecs={}
for i, sentScores in enumerate(allScores):
    s1v=avg_feature_vector(text[i],model,300,index2word_set)
    for fl in booksList:
        for j, score in enumerate(sentScores[fl]):
            if (fl,j) not in candidateVecs:
                candidateVecs[(fl,j)]=avg_feature_vector(reducedBooks[fl][j],model,300,index2word_set)
            s2v=candidateVecs[(fl,j)]
            semanticScore=1 - spatial.distance.cosine(s1v, s2v)
            scoreTuples.append((i,fl,j,score,semanticScore,(score+semanticScore)/2))

In [58]:
scoreTuples.sort(key=lambda tup: tup[5],reverse=True)

In [59]:
# Show the five best-ranked pairs with both of their scores.
for t in scoreTuples[0:5]:
    sentIdx, bookName, candIdx, syntacticScore, semanticScore = t[:5]
    print('Original Sentence: ',text[sentIdx])
    print('Similar Sentence is from: ',bookName)
    print('Syntactic Score: ',syntacticScore)
    print('Semantic Score: ',semanticScore)
    print(reducedBooks[bookName][candIdx])
    print('\n\n')

Original Sentence:  23 Behold, a virgin shall be with child, and shall bring forth a son, and they shall call his name Emmanuel, which being interpreted is, God with us.
Similar Sentence is from:  isaiah.txt
Syntactic Score:  0.911016855099
Semantic Score:  0.88051789999
14 Therefore
the Lord himself shall give you a sign; Behold, a virgin shall
conceive, and bear a son, and shall call his name Immanuel.



Original Sentence:  for it is written, Thou shalt worship the Lord thy God, and him only shalt thou serve.
Similar Sentence is from:  micah.txt
Syntactic Score:  0.883703966328
Semantic Score:  0.885921001434
14 Thou shalt eat, but not be satisfied; and thy casting down shall
be in the midst of thee; and thou shalt take hold, but shalt not
deliver; and that which thou deliverest will I give up to the sword.



Original Sentence:  21 Ye have heard that it was said by them of old time, Thou shalt not kill; and whosoever shall kill shall be in danger of the judgment.
Similar Sentence i

In [60]:
text

['The Gospel According to Saint Matthew   1.',
 '1 The book of the generation of Jesus Christ, the son of David, the son of Abraham.',
 '2 Abraham begat Isaac; and Isaac begat Jacob; and Jacob begat Judas and his brethren; 1.',
 '3 And Judas begat Phares and Zara of Thamar; and Phares begat Esrom; and Esrom begat Aram; 1.',
 '4 And Aram begat Aminadab; and Aminadab begat Naasson; and Naasson begat Salmon; 1.',
 '5 And Salmon begat Booz of Rachab; and Booz begat Obed of Ruth; and Obed begat Jesse; 1.',
 '6 And Jesse begat David the king; and David the king begat Solomon of her that had been the wife of Urias; 1.',
 '7 And Solomon begat Roboam; and Roboam begat Abia; and Abia begat Asa; 1.',
 '8 And Asa begat Josaphat; and Josaphat begat Joram; and Joram begat Ozias; 1.',
 '9 And Ozias begat Joatham; and Joatham begat Achaz; and Achaz begat Ezekias; 1.',
 '10 And Ezekias begat Manasses; and Manasses begat Amon; and Amon begat Josias; 1.',
 '11 And Josias begat Jechonias and his brethren,