In [21]:
class ProcessedClaim():
    
    def createPosTags(self,paragraph,singleLine=0):
        posTagContent1=[]
        if(singleLine==0):
            for line in paragraph:
                line= line.split(' ',2)
                sentenceID=line[1]
                content=line[2]
                posTagContent1.append(self.getChunks(content))
        else:
            content=paragraph
            posTagContent1.append(self.getChunks(content))
        return posTagContent1
    
    def getAntonyms(self,term):
        synsets=wordnet.synsets(term)
        for synset in synsets:
            if(synset.pos()=='a'):
                return synset.lemmas()[0].antonyms()[0].name()
        return term

    def getNegation(self,prev_term,term):
        if prev_term=="not":
            return term
        else:
            return "not "+term
    
    def getChunks(self,content):
        chunks = []
        contentToken = word_tokenize(content)
        nc_pos = pos_tag(contentToken)

        prevPosition = nc_pos[0][1]
        entity = {"pos":prevPosition,"chunk":[]}
        for c_node in nc_pos:
            (token,pos) = c_node
            if pos == prevPosition:
                prevPosition = pos
                entity["chunk"].append(token)
            elif prevPosition in ["DT","JJ"]:
                prevPosition = pos
                entity["pos"] = pos
                entity["chunk"].append(token)
            else:
                if not len(entity["chunk"]) == 0:
                    chunks.append((entity["pos"]," ".join(entity["chunk"])))
                    entity = {"pos":pos,"chunk":[token]}
                    prevPosition = pos
        if not len(entity["chunk"]) == 0:
            chunks.append((entity["pos"]," ".join(entity["chunk"])))
        return chunks
    

In [22]:
#STEP 1 compute file list
import os
dirName="/wiki-pages-text-test/"

# r=root, d=directories, f = files
files=[]
for r, d, f in os.walk(os.getcwd()+dirName):
    for file in f:
        if '.txt' in file:
            files.append(os.path.join(r, file))
print("Successfully added",len(files),"files.")

Successfully added 110 files.


In [23]:
# STEP 2 computeTFIDF for corpus
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

extraStopWords=["@","^","\"","(",")","*","_","\\",":",";","`","!","-RRB-","-LRB-"]

for stopW in extraStopWords:
    stopWords.add(stopW)

#STEP 2a Tokenise the text

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

tfidf_vectorizer = TfidfVectorizer(norm=None, ngram_range=(1,2),stop_words=stopWords,lowercase=True,tokenizer=tokenize)




#STEP 2b Extract Page content   
pageList={}
pageContent=[]
for file in files:
    f=open(file,"r",encoding="utf8")
    for line in f:
        line= line.split(' ',2)
        pageID=line[0]
        sentenceID=line[1]
        content=line[2]
        pageList[pageID,sentenceID]=content
        pageContent.append(content)
    f.close()
       
print("Page content Extracted")

        

Page content Extracted


[nltk_data] Downloading package stopwords to /Users/ola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
#STEP 2c process the lines in the pages to merge them together
prevPageID=""
contentList=[]
pageIdList=[]
newContent=""
for (pageID,sentenceID),content in pageList.items():
    if prevPageID!=pageID:
        prevPageID=pageID
        pageIdList.append(pageID)
        contentList.append(newContent)
        newContent=""
        newContent=newContent+" "+content
    else:
        newContent=newContent+" "+content



print("Corpus lines merged")



Corpus lines merged


In [25]:
#STEP 2d computeTFIDF Matrix
nltk.download('punkt')


tfidf_matrix = tfidf_vectorizer.fit_transform(contentList)
print(len(tfidf_vectorizer.get_feature_names()),"tfidf computed")
#print(tfidf_vectorizer.get_feature_names())

[nltk_data] Downloading package punkt to /Users/ola/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


19536 tfidf computed


In [27]:
#STEP 2e get corpus features
features=tfidf_vectorizer.get_feature_names()
print("Feature extracted")

Feature extracted


In [None]:
#STEP 2f PROCESS THE CLAIMS 
import nltk
nltk.download("punkt")
nltk.download("wordnet")
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag,ne_chunk
import json
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from collections import Counter 

with open('devset.json') as f:
    data = json.load(f)

claims=[]
processedC=ProcessedClaim()
processedClaims={}
for linenum in data:
    claim=data[linenum]["claim"]
    claims.append((linenum,claim))
    #process the claim. Add negation if possible and rephrase it to add antonyms if possible.
    processedClaims[claim]=claim
    posTagsForClaim=processedC.createPosTags(claim,1)
    tempPosTagsForClaim=posTagsForClaim[0]
    for i in range(len(tempPosTagsForClaim)):
        prev_term = tempPosTagsForClaim[i][1]
        if(i>0):
            prev_term = tempPosTagsForClaim[i-1][1]
            prev_posTag=tempPosTagsForClaim[i-1][0]
        term=tempPosTagsForClaim[i][1]
        posTag=tempPosTagsForClaim[i][0]
        if(posTag in ["JJ","JJR","JJS"]):
            new_term=processedC.getAntonyms(term)
            processedClaims[claim]=claim.replace(term, new_term)
        if(posTag in ["VB","VBD","VBG","VBN","VBP","VBZ"]):
            new_term=processedC.getNegation(prev_term,term)
            processedClaims[claim]=claim.replace(term, new_term)

print(len(processedClaims),"processed claims generated")


In [49]:
#STEP 3 GET TOP K PAGES FOR THE QUERY
import json
import numpy as np
from collections import Counter
from nltk.tree import Tree
from nltk import ne_chunk
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en')

def createNERTags1(paragraph):
    doc = nlp(paragraph)
    cleaned = [y for y in doc if not y.is_stop and y.pos_!="PUNCT"  ]
    raw=[(x.lemma_,x.pos_) for x in cleaned]
    return raw

def matchNERTags1(qTags,pTags):
    match=[]
    matchCount=0
    for (qterm,qtag) in qTags:
       for (pterm,ptag) in pTags:
            if qtag==ptag :
                if qterm==pterm:
                    if qtag not in match: 
                        match.append(qtag)
                        matchCount=matchCount+1
                        break
    if(matchCount==len(qTags)):
        label="SUPPORTS"
    else:
        label="REFUTES"
    return label
#STEP 3a Find the top k page ranks



#STEP 3b Get the best matching page using cosine sim
import string
def stemTokens(tokens):
    return [PorterStemmer().stem(item) for item in tokens]
def normalize(text):
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    return stemTokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
def cosineSim(query, pageContent):
    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
    tfidf = vectorizer.fit_transform([query, pageContent])
    return ((tfidf * tfidf.T).A)[0,1]


def createJSON(linenum,claim,label,evidencearray,outer_dict):
    inner_dict={}
    inner_dict["claim"]=claim
    inner_dict["label"]=label
    inner_dict["evidence"]=evidencearray
    outer_dict[linenum]=inner_dict
    with open('predicted.json', 'w',encoding="utf8") as outfile:  
        json.dump(outer_dict, outfile)
    outfile.close()
    #print(linenum,inner_dict)
    return outer_dict


def queryTokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        if item not in stems and item not in stopWords:
            stems.append(PorterStemmer().stem(item))
    return stems



#STEP 3b Get the best matching sentence using ngram sim
def nGramSimilarity(claim, sentence,nGram):
    ps = PorterStemmer()
   
    getNGram = lambda tokens,n:[ " ".join([tokens[index+i] for i in range(0,n)]) for index in range(0,len(tokens)-n+1)]
    qToken = queryTokenize(claim)
    sToken = queryTokenize(sentence)

    if(len(qToken) > nGram):
        q3gram = set(getNGram(qToken,nGram))
        s3gram = set(getNGram(sToken,nGram))
        #print (q3gram)
        #print (s3gram)
        if(len(s3gram) < nGram):
            return 0
        qLen = len(q3gram)
        sLen = len(s3gram)
        sim = len(q3gram.intersection(s3gram)) / len(q3gram.union(s3gram))
        #print(sim)
        return sim
    else:
        return 0

#STEP 3c EXTRACT THE ENTITIES

def extractEntities(posTagContent,pages):
    entities={}
    for i in range(0,len(posTagContent)):
        for posTag,term in posTagContent[i]:
            if(posTag=='NNP'):
                entities[term]=pages[mostSimilarPageID].splitlines()[i]
    return entities

with open('devset.json') as f:
    data = json.load(f)

outer_dict={}
claimCounter=0
for linenum in data:
    claim=data[linenum]["claim"]
    evidence=[]
    evidencearray=[]
    label=""
    query=data[linenum]["claim"]
    claimCounter=claimCounter+1
    print(claimCounter)
    #STEP 3a tokenise the query
    queryUniGram=queryTokenize(query)
    #print("query unigram created")
    #STEP 3b caculate ngram for queryTokens
    queryTokens=[]
    getNGram = lambda tokens,n:[ " ".join([tokens[index+i] for i in range(0,n)]) for index in range(0,len(tokens)-n+1)]
    queryNGram = set(getNGram(queryUniGram,2))
    
    #print("query ngram created")
    queryTokens = list(set(queryUniGram) | set(queryNGram))
    #print("final query token created")
    temptopKRanks=[]
    topKRanks=set()
    for term in queryTokens:
        if term in features:
            indexOfQueryTerm=features.index(term)
            temptopKRanks=tfidf_matrix[:,indexOfQueryTerm].toarray()
            # Find index of maximum value from 2D numpy array
            maxV = np.where(temptopKRanks == np.amax(temptopKRanks))
            listOfCordinates = list(zip(maxV[0], maxV[1]))
            topKRanks.add(pageIdList[maxV[0][0]-1])
            #print("max done")
            #print (term,pageIdList[maxI-1])
    
    pageCosSimilarity={}
    sentenceNGramSim={}
    tempPageList={}
    for (pageID,sentenceID),content in pageList.items():
        if pageID in topKRanks:
            tempPageList[(pageID,sentenceID)]=content
            pageCosSimilarity[pageID,sentenceID]=cosineSim(query,content)
    

    mostSimilarPageID=max(pageCosSimilarity, key=pageCosSimilarity.get)
  
    for (pageID,sentenceID),content in tempPageList.items():
        if pageID==mostSimilarPageID[0]:
            sentenceNGramSim[(pageID,sentenceID)]=nGramSimilarity(query,content,2)
    avg=0.0      
    for (pageID,sentenceID),sim in sentenceNGramSim.items():
        avg=avg+sim
    avg=avg/len(sentenceNGramSim)
    sentenceNGramSim=sorted(sentenceNGramSim.items(), key=lambda kv: kv[1],reverse=True)

    
    
    if(sentenceNGramSim[0][1]<avg):
        label="NOT ENOUGH INFO"
        outer_dict=createJSON(linenum,claim,label,evidencearray,outer_dict)
        continue
        
    
    qNERTags=createNERTags1(query)
    pNERTags=createNERTags1(tempPageList[sentenceNGramSim[0][0]])
    
    #label=matchNERTags(qNERTags,pNERTags)
    label=matchNERTags1(qNERTags,pNERTags)
    for i in range(len(sentenceNGramSim)):
        #print(sentenceNGramSim[i][1],avg)
        if sentenceNGramSim[i][1]>=avg:
            evidence=[]
            evidence.append(sentenceNGramSim[i][0][0])
            evidence.append(int(sentenceNGramSim[i][0][1]))
            evidencearray.append(evidence)
    outer_dict=createJSON(linenum,claim,label,evidencearray,outer_dict)
print(outer_dict)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ola/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/ola/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/ola/nltk_data...
[nltk_data]   Package words is already up-to-date!


1
0 ['List_of_San_Francisco_49ers_seasons', 1]
[['List_of_San_Francisco_49ers_seasons', 1]]
[['List_of_San_Francisco_49ers_seasons', 1]]
2
0 ['Tilda', 12]
[['Tilda', 12]]
[['Tilda', 12]]
3
0 ['Soul_Food_-LRB-film-RRB-', 0]
[['Soul_Food_-LRB-film-RRB-', 0]]
1 ['Soul_Food_-LRB-film-RRB-', 5]
[['Soul_Food_-LRB-film-RRB-', 0], ['Soul_Food_-LRB-film-RRB-', 5]]
[['Soul_Food_-LRB-film-RRB-', 0], ['Soul_Food_-LRB-film-RRB-', 5]]
4
0 ['Bill_Nershi', 0]
[['Bill_Nershi', 0]]
[['Bill_Nershi', 0]]
5


KeyboardInterrupt: 

In [None]:
#STEP 3 GET TOP K PAGES FOR THE QUERY
import json
import numpy as np

#STEP 3a Find the top k page ranks

#STEP 4b get corpus features
features=tfidf_vectorizer.get_feature_names()
print("Feature extracted")
#STEP 5 Get the best matching page using cosine sim
import string
def stemTokens(tokens):
    return [PorterStemmer().stem(item) for item in tokens]
def normalize(text):
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    return stemTokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
def cosineSim(query, pageContent):
    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
    tfidf = vectorizer.fit_transform([query, pageContent])
    return ((tfidf * tfidf.T).A)[0,1]



def queryTokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        if item not in stems and item not in stopWords:
            stems.append(PorterStemmer().stem(item))
    return stems

with open('devset.json') as f:
    data = json.load(f)

for linenum in data:
    query=data[linenum]["claim"]
    print()
    print(query)
    #STEP 3a tokenise the query
    queryUniGram=queryTokenize(query)
    #print("query unigram created")
    #STEP 3b caculate ngram for queryTokens
    queryTokens=[]
    getNGram = lambda tokens,n:[ " ".join([tokens[index+i] for i in range(0,n)]) for index in range(0,len(tokens)-n+1)]
    queryNGram = set(getNGram(queryUniGram,2))
    print(queryNGram)
    #print("query ngram created")
    queryTokens = list(set(queryUniGram) | set(queryNGram))
    #print("final query token created")
    temptopKRanks=[]
    topKRanks=set()
    for term in queryTokens:
        if term in features:
            indexOfQueryTerm=features.index(term)
            temptopKRanks=tfidf_matrix[:,indexOfQueryTerm].toarray()
            i=0
            maxV=0
            for a in temptopKRanks:
                if a[0]>maxV:
                    maxV=a[0]
                    maxI=i
                i=i+1
            topKRanks.add(pageIdList[maxI-1])
            #print (term,pageIdList[maxI-1])
    print(topKRanks)
    pageCosSimilarity={}
    for (pageID,sentenceID),content in pageList.items():
        if pageID in topKRanks:
            pageCosSimilarity[pageID,sentenceID]=cosineSim(query,content)

    mostSimilarPageID=max(pageCosSimilarity, key=pageCosSimilarity.get)
    print ("Most Similar Page is ",mostSimilarPageID,)

