# Automatic Fact Verification system 

Author Name: Saransh Srivastava 

Student ID: 1031073

### Document retrieval
Read all documents into memory and build an inverted index (word → per-file occurrence counts).

In [None]:
#############
##### Read all documents and make an inverted index
#############

from nltk.corpus import stopwords
import nltk
from collections import defaultdict
from collections import Counter
import glob
import time
import unicodedata



# Fetch the NLTK resources this notebook relies on. 'punkt' and 'wordnet'
# are included because word_tokenize and the WordNet corpus reader are used
# further down and fail with LookupError when their data is missing.
for _resource in (
    'stopwords',                   # stopwords.words('english')
    'averaged_perceptron_tagger',  # nltk.pos_tag
    'maxent_ne_chunker',           # nltk.ne_chunk
    'words',                       # word list required by the NE chunker
    'punkt',                       # nltk.word_tokenize
    'wordnet',                     # wn.synsets / path_similarity
):
    nltk.download(_resource)


def build_inverted_index(paths, title_weight=9):
    """Build a ``word -> Counter({file path: count})`` index over text files.

    Every line is split on whitespace. The first field is treated as the
    page id, the second field is skipped, and the remaining fields are the
    sentence tokens. Each token is counted once; every ``_``-separated part
    of the NFD-normalised page id is additionally counted ``title_weight``
    times so that title words dominate body words. Keys are lower-cased.

    Args:
        paths: iterable of file paths to index.
        title_weight: how many times each page-id part is counted per line.

    Returns:
        defaultdict(Counter) mapping lower-cased word to per-file counts.
    """
    index = defaultdict(Counter)
    for fname in paths:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(fname, encoding='utf-8') as f:
            for raw_line in f:
                fields = raw_line.split()
                if not fields:
                    continue  # blank line: there is no page id to read
                tokens = fields[2:]
                page_id = unicodedata.normalize('NFD', fields[0])
                for part in page_id.split('_'):
                    tokens.extend([part] * title_weight)
                for word in tokens:
                    index[word.lower()][fname] += 1
    return index


path = 'wiki-pages-text/*.txt'
files = glob.glob(path)
dict_doc_all = {}
inv_indx = build_inverted_index(files)


#### Helper functions to compute sentence similarity  using Wordnet

In [None]:
import numpy
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
 
def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into a WordNet POS letter.

    Tags starting with N, V, J or R map to 'n', 'v', 'a' and 'r'
    respectively; every other tag yields None.
    """
    prefix_to_wordnet = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_pos
    return None
 
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for a POS-tagged word, or None.

    Args:
        word: the token text.
        tag: its Penn Treebank POS tag.

    Returns:
        The first synset returned by ``wn.synsets`` for the mapped POS, or
        None when the tag has no WordNet equivalent, the word has no synset,
        or the lookup fails.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None

    try:
        candidates = wn.synsets(word, wn_tag)
    except Exception:
        # Best effort: a failed lookup means "no synset". Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return None
    return candidates[0] if candidates else None


def sentence_similarity(sentence1, sentence2):
    """Directed WordNet similarity from ``sentence1`` to ``sentence2``.

    Both sentences are tokenized and POS-tagged, and every token is mapped to
    its first WordNet synset (tokens without one are dropped). For each synset
    of ``sentence1`` the highest path similarity to any synset of
    ``sentence2`` is taken, and the result is the mean of those best scores.

    Returns:
        The mean best-match score as a float, or 0.0 when no synset of
        ``sentence1`` could be matched.
    """
    # WordNetError is not imported at module level; import it here so the
    # handler below can actually resolve the name.
    from nltk.corpus.reader.wordnet import WordNetError

    # Tokenize and tag
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    # Get the synsets for the tagged words, dropping words without one
    synsets1 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged1) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*tw) for tw in tagged2) if ss]

    score, count = 0.0, 0

    for synset in synsets1:
        try:
            similarities = [synset.path_similarity(ss) for ss in synsets2]
        except WordNetError:
            print("WordNetError detected!!")
            continue

        # path_similarity may return None when no path exists
        similarities = [s for s in similarities if s is not None]
        best_score = max(similarities, default=0)

        # Only count synsets for which a non-zero similarity was found
        if best_score:
            score += best_score
            count += 1

    # Average the values
    if count != 0:
        score /= count
    return score

def symmetric_sentence_similarity(sentence1, sentence2):
    """Average the two directed WordNet similarities of a sentence pair."""
    forward = sentence_similarity(sentence1, sentence2)
    backward = sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2


#### Helper function to compute Jaccard similarity
We use this similarity to order the top 10 retrieved sentences.

In [None]:
def sentence_jacob_similarity(sentence1,sentence2):
    """Jaccard similarity of the lower-cased word sets of two sentences.

    Args:
        sentence1: first sentence; split on whitespace.
        sentence2: second sentence; split on whitespace.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. When neither sentence
        contains a token the union is empty and 0.0 is returned instead of
        dividing by zero.
    """
    words1 = {word.lower() for word in sentence1.split()}
    words2 = {word.lower() for word in sentence2.split()}
    union_size = len(words1 | words2)
    if union_size == 0:
        return 0.0
    return len(words1 & words2) / union_size

def symmetric_sentence_jacob_similarity(sentence1,sentence2):
    """Mean of the word-overlap similarity computed in both argument orders."""
    forward = sentence_jacob_similarity(sentence1, sentence2)
    backward = sentence_jacob_similarity(sentence2, sentence1)
    return (forward + backward) / 2


#### Helper functions to get top N sentences from top K documents

In [None]:
###############
#### get top N sentences from the doc PIDS stored in memory
###############

def getSentence(dict_doc,id_list):
    """Look up the sentence text for each (pid, sid) pair in ``id_list``.

    ``dict_doc`` is keyed by ``(pid, str(sid))``, so the second element of
    every id is converted to a string before the lookup. The result keeps
    the order of ``id_list``.
    """
    return [dict_doc[(pair[0], str(pair[1]))] for pair in id_list]

def pairwise(myList):
    """Join every pair of adjacent items with '_' (e.g. [a, b, c] -> [a_b, b_c])."""
    joined = []
    for left, right in zip(myList, myList[1:]):
        joined.append(left + '_' + right)
    return joined

def getPriomaryTopicList(topic):
    """Extract candidate topic words from a claim, most specific first.

    Preference order of the returned list:
      1. adjacent proper-noun pairs joined with '_' (see ``pairwise``),
      2. the individual proper nouns (NNP / NNPS),
      3. the common nouns (NN / NNS),
    each group in order of appearance in the sentence.

    Args:
        topic: the claim text; it is converted with ``str`` before tokenizing.

    Returns:
        A list of strings in the preference order above.
    """
    proper_nouns = []
    common_nouns = []
    for word, pos in nltk.pos_tag(nltk.word_tokenize(str(topic))):
        if pos in ('NNP', 'NNPS'):
            proper_nouns.append(word)
        elif pos in ('NN', 'NNS'):
            # NNPS is a proper-noun tag and is handled above only, so plural
            # proper nouns are no longer listed twice.
            common_nouns.append(word)
    return pairwise(proper_nouns) + proper_nouns + common_nouns

def doesExists(word, word_dict):
    """Tell whether ``word`` is one of the '_'-separated parts of any key.

    Example: 'Bosnan' exists in a dict that has the key 'Paul_Bosnan'.
    Every key is inspected, not just the first one.

    Args:
        word: the word to look for.
        word_dict: mapping whose string keys are split on '_'.

    Returns:
        True when some key contains ``word`` as a whole part, else False.
    """
    return any(word in key.split('_') for key in word_dict)


def getTopNSentIDFromPID(topics,k):
    """Collect up to ``k`` (pid, sid) ids whose page id mentions a topic word.

    The candidate sentences come from ``getTopicEvidenceSentences``. Topic
    words are derived from the stop-word-free claim via
    ``getPriomaryTopicList`` and tried in that order. A topic word that
    ``doesExists`` reports as already covered by an earlier matched topic is
    skipped; otherwise every sentence whose pid contains the word (substring
    match) is collected.

    Returns:
        ``(dict_docL, result)``: the ``(pid, sid) -> sentence`` dict and the
        list of selected ids, cut off as soon as it holds ``k`` entries.
    """
    evidence_by_id = getTopicEvidenceSentences(topics)
    ranked_topics = getPriomaryTopicList(removeStop(topics))
    selected_ids = []
    matched_topics = {}
    for candidate in ranked_topics:
        if doesExists(candidate, matched_topics):
            continue
        for page_id, sent_id in evidence_by_id.keys():
            if candidate not in page_id:
                continue
            # Remember the topic only once it has matched at least one page
            matched_topics[candidate] = 1
            selected_ids.append((page_id, sent_id))
            if len(selected_ids) >= k:
                return evidence_by_id, selected_ids

    return evidence_by_id, selected_ids

#### Helper functions to get top K documents from wikipedia corpus

In [None]:
##################
#### get top K documents for a query
##################

def getTopKDoc(topic , k):
    """Return the ``k`` files that best match the words of ``topic``.

    The per-file counts stored in the global ``inv_indx`` are summed over
    every whitespace-separated, lower-cased word of ``topic``; the files
    with the highest totals are returned, best first.

    Args:
        topic: query string.
        k: maximum number of file paths to return.

    Returns:
        A list of at most ``k`` file paths.
    """
    can_doc = Counter()
    for word in topic.split():
        # .get() instead of [] so that querying an unseen word does not
        # insert an empty Counter into the (defaultdict) index.
        postings = inv_indx.get(word.lower())
        if postings:
            can_doc += postings
    return [doc for doc, _ in can_doc.most_common(k)]

def removeStop(raw_words):
    """Drop English stop words from a whitespace-separated string.

    Matching is case-sensitive against NLTK's English stop-word list; the
    surviving words are re-joined with single spaces.
    """
    stop_set = frozenset(stopwords.words('english'))
    kept = []
    for token in raw_words.split():
        if token in stop_set:
            continue
        kept.append(token)
    return " ".join(kept)
    
def getKeyWords(topic_raw):
    """Return the words of ``topic_raw`` that belong to a named-entity chunk.

    The text is split on whitespace, POS-tagged, run through NLTK's
    named-entity chunker and converted to (word, tag, IOB-chunk) triples;
    words whose chunk label is not 'O' are kept.

    Returns:
        The kept words joined with single spaces ('' when there are none).
    """
    # tree2conlltags is not imported at module level; import it locally so
    # the call below does not raise NameError.
    from nltk.chunk import tree2conlltags

    sentence = nltk.pos_tag(topic_raw.split())
    csent = nltk.ne_chunk(sentence)
    iob_tagged = tree2conlltags(csent)
    return ' '.join(w for w, _tag, chunk in iob_tagged if chunk != 'O')
    
    
def getRealTopic(topic_raw):
    """Return the query text with stop words removed (see ``removeStop``)."""
    return removeStop(topic_raw)


def getTopicEvidenceSentences(topic_raw):
    """Load all sentences of the files most relevant to a claim.

    The files are the top 7 matches for the whole stop-word-free claim plus
    the single best match for each of its words. Every line of those files is
    read as ``<pid> <sid> <token> ...``.

    Returns:
        dict mapping ``(pid, sid)`` to the sentence text, where ``pid`` is
        NFD-normalised and the text is the '_'-split pid parts followed by
        the sentence tokens, joined with spaces.
    """
    topic = getRealTopic(topic_raw)
    topKDoc = getTopKDoc(topic,7)
    for term in topic.split():
        for docum in getTopKDoc(term,1):
            if docum not in topKDoc:
                topKDoc.append(docum)
    dict_doc = {}
    for fname in topKDoc:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(fname, encoding='utf-8') as f:
            for text in f:
                fields = text.split()
                if len(fields) < 2:
                    continue  # blank or truncated line: no pid/sid pair
                pid = unicodedata.normalize('NFD',fields[0])
                sid = fields[1]
                dict_doc[(pid,sid)] = " ".join(pid.split('_') + fields[2:])
    return dict_doc

def getBestDoc(query_text,k):
    """Return ``(dict_doc, ids)`` for up to ``k`` sentences matching the query.

    Thin wrapper that forwards to ``getTopNSentIDFromPID``.
    """
    return getTopNSentIDFromPID(query_text,k)

def tryQuery(query_text):
    """Strip stop words from the query and fetch up to 500 candidate ids.

    Returns the ``(dict_doc, ids)`` pair produced by ``getBestDoc``.
    """
    cleaned_query = getRealTopic(query_text)
    return getBestDoc(cleaned_query, 500)

def storeData(dataObj,filename):
    """Serialise ``dataObj`` as JSON into ``filename``, overwriting the file."""
    with open(filename, mode='w') as out_file:
        json.dump(dataObj, fp=out_file)

#### Helper functions to get the `size` most similar sentences for a query (the claim text)

In [None]:
def getEvidenceID(claim_text,size):
    """Score candidate sentences against the claim and keep the best ones.

    Each candidate returned by ``tryQuery`` gets a WordNet-based score and a
    word-overlap score. Candidates whose WordNet score exceeds 0.4 are kept,
    de-duplicated and sorted in descending tuple order.

    Returns:
        At most ``size`` tuples ``(wordnet_score, overlap_score, id, sentence)``.
    """
    dict_doc, candidate_ids = tryQuery(claim_text)
    candidate_sentences = getSentence(dict_doc, candidate_ids)
    scored = set()
    for sent_id, sentence in zip(candidate_ids, candidate_sentences):
        wordnet_score = symmetric_sentence_similarity(claim_text, sentence)
        overlap_score = symmetric_sentence_jacob_similarity(claim_text, sentence)
        if wordnet_score > 0.4:
            scored.add((wordnet_score, overlap_score, sent_id, sentence))
    return sorted(scored, reverse=True)[:size]

def getEvidenceSent(evidences_list):
    """Re-rank evidence tuples by their second score.

    Entries whose second element is not above 0.09 are dropped; the rest have
    their first two elements swapped and are returned in descending order.
    """
    return sorted(
        [(item[1], item[0], item[2], item[3])
         for item in evidences_list
         if item[1] > 0.09],
        reverse=True,
    )
    
    
def getEvidence(source):
    """Build the output record for one ``(claim_id, {'claim': ...})`` item.

    Up to 10 candidate sentences are retrieved with ``getEvidenceID`` and
    filtered/re-ranked by ``getEvidenceSent``.

    Returns:
        ``{str(claim_id): {'claim': text, 'evidence': [[pid, sid], ...]}}``.
    """
    claim_key = str(source[0])
    claim_text = source[1]['claim']
    record = {'claim': claim_text}
    ranked = getEvidenceSent(getEvidenceID(claim_text, 10))
    # Each ranked tuple carries its (pid, sid) id at position 2
    record["evidence"] = [list(item[2]) for item in ranked]
    return {claim_key: record}

### Read the test data and extract the best-matching sentences for each claim

In [None]:
##################
### Parallelizing: Read test data and get 'best' evidences
##################

import multiprocessing as mp
import time
import pandas as pd
import json
from collections import ChainMap

test_path = 'test-unlabelled.json'

# Leave four cores free, but always keep at least one worker: mp.Pool raises
# ValueError when asked for fewer than one process.
cpu_count = max(1, mp.cpu_count() - 4)
print("CPU count: " + str(cpu_count))

# Load the claims before starting workers so a missing or invalid file does
# not leave an idle pool behind.
with open(test_path, encoding='utf-8') as f:
    jTestdata = json.load(f)

t3 = time.time()

# The context manager terminates the pool on exit, including on errors.
with mp.Pool(cpu_count) as pool:
    # One task per (claim_id, claim_record) item
    result_objects = [pool.apply_async(getEvidence,
                                       args=(row,)) for row in jTestdata.items()]

    # .get() blocks until the task finishes and re-raises worker exceptions
    results = [r.get() for r in result_objects]

    pool.close()
    pool.join()

# Each result is a single-entry dict; merge them into one mapping
final_result = dict(ChainMap(*results))

t4 = time.time()

print(t4-t3)

storeData(final_result,'testoutput-unlabelled.json')
