# Label Prediction
Code that predicts a label for every claim-evidence pair and then assigns each claim the label that was predicted most often across its evidence.

In [1]:
import glob
import json
import pickle
import pandas as pd
import unicodedata

# Glob pattern for the wiki sentence files and path of the unlabelled test set.
wiki_path = 'wiki-pages-text/*.txt'
test_path = 'testoutput-unlabelled.json'

# Pickled artefacts read back with loadModel().
LOGISTIC_MODEL = "logRegModel.sav"
VECTORIZER = "Vectorizer.sav"
ENCODER = "Encoder.sav"
ANN_MODEL = "SequentialANN.sav"

# Output files written with storeDFasJSON(), one per predictor.
LOGIST_PREDICTION = "testoutput_baseline.json"
ANN_PREDICTION = "testoutput.json"


### Helper functions

In [22]:
def getDictionary(wiki_path):
    """Load every wiki sentence matched by *wiki_path* into a lookup dict.

    Each usable line is whitespace separated as
    ``<page id> <sentence id> <sentence tokens...>``. Lines with fewer than
    two tokens (e.g. blank lines) carry no key and are skipped.

    Args:
        wiki_path: Glob pattern selecting the wiki text files.

    Returns:
        dict mapping ``(page_id, sentence_id)`` to the sentence text, where
        ``page_id`` is NFD-normalised and ``sentence_id`` is a string.
        The number of entries is printed before returning.
    """
    dict_doc = {}
    for fname in glob.glob(wiki_path):
        # Explicit encoding so page ids decode identically on every platform.
        # NOTE(review): assumes the wiki dump is UTF-8 encoded - confirm.
        with open(fname, encoding="utf-8") as f:
            for text in f:
                line = text.split()
                if len(line) < 2:
                    # Blank or truncated line: no (page, sentence) key to build.
                    continue
                pid = unicodedata.normalize('NFD', line[0])
                dict_doc[(pid, line[1])] = " ".join(line[2:])

    print(len(dict_doc))
    return dict_doc

    
def loadModel(filename):
    """Unpickle and return the object stored in *filename*.

    Args:
        filename: Path of a pickle file.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as f:
        return pickle.load(f)

############
### Reading test data with evidence
############

def loadTestDataDF(dict_doc,test_path):
    """Build one DataFrame row per (claim, evidence sentence) pair.

    Args:
        dict_doc: Mapping of ``(page_id, sentence_id)`` to sentence text; the
            page id is looked up NFD-normalised and the sentence id as a string.
        test_path: Path of a JSON file mapping each claim id to an object
            with ``'claim'`` and ``'evidence'`` entries, where every evidence
            item starts with a page id followed by a sentence id.

    Returns:
        DataFrame with columns ``id``, ``claim``, ``evidenceID`` and
        ``evidence``. A claim without evidence contributes a single
        placeholder row with evidenceID ``("", 0)`` and empty evidence text.

    Raises:
        KeyError: If an evidence reference is not present in *dict_doc*.
    """
    # JSON text is read as UTF-8 rather than the platform default encoding.
    with open(test_path, encoding="utf-8") as f:
        jTestdata = json.load(f)

    rows = []
    for claim_id, values in jTestdata.items():
        evidence = values['evidence']
        claim = values['claim']
        for evi in evidence:
            pid = unicodedata.normalize('NFD', evi[0])
            sid = str(evi[1])
            rows.append((claim_id, claim, (pid, sid), dict_doc[(pid, sid)]))

        if not evidence:
            # Keep the claim in the frame so it still receives a prediction.
            rows.append((claim_id, claim, ("", 0), ""))

    return pd.DataFrame(rows, columns=['id', 'claim', 'evidenceID', 'evidence'])

    
def mergeDFs(dataDF,modelDF):
    """Reduce per-pair predictions to one label and evidence list per claim.

    Args:
        dataDF: Frame with columns ``id``, ``claim``, ``evidenceID`` and
            ``evidence``.
        modelDF: Frame with columns ``label`` and ``claim_evidence``; it is
            joined to *dataDF* on the index.

    Returns:
        DataFrame indexed by ``(id, claim_x, label)`` with a single column
        ``evidenceID_x`` holding a list of evidence ids.
    """
    # Attach the predictions and keep only id, claim, evidenceID and label.
    combined = dataDF.join(modelDF).drop(columns=['claim_evidence', 'evidence'])

    # Count rows per (id, label) and keep, for every id, the label with the
    # highest count.
    label_counts = combined.groupby(['id', 'label']).count()
    winning_keys = label_counts.groupby(["id"])["claim"].idxmax()
    top_counts = label_counts.loc[winning_keys]

    # The inner merge drops rows whose label lost; columns present on both
    # sides come back with _x (original values) and _y (counts) suffixes.
    matched = combined.merge(top_counts, on=['id', 'label'], how='inner')

    # Keep at most five rows per id before collecting the evidence ids.
    capped = matched.groupby(['id']).head(5)
    evidence_lists = capped.groupby(['id', 'claim_x', 'label'])['evidenceID_x'].apply(list)
    return evidence_lists.to_frame()

def storeDFasJSON(dataDF,filename):
    """Write the per-claim predictions in *dataDF* to *filename* as JSON.

    Args:
        dataDF: Frame whose index entries are ``(id, claim, label)`` and whose
            first column holds the evidence list for that claim.
        filename: Path of the JSON file to create or overwrite.

    The file contains one object keyed by claim id; each value has the keys
    ``claim``, ``label`` and ``evidence``. Claims labelled
    ``'NOT ENOUGH INFO'`` are always written with an empty evidence list.
    """
    document = {}
    for index, row in dataDF.iterrows():
        label = index[2]
        evidence = [] if label == 'NOT ENOUGH INFO' else row.iloc[0]
        document[index[0]] = {
            'claim': index[1],
            'label': label,
            'evidence': evidence,
        }
    # Context manager guarantees the output is flushed and closed.
    with open(filename, 'w') as f:
        json.dump(document, f)

#### Load wiki data into memory

In [23]:
# Build the (page id, sentence id) -> sentence lookup once; both predictor cells reuse it.
dict_doc = getDictionary(wiki_path)

25248313


# Logistic Regression predictor
This section predicts labels for the test data using the stored logistic regression model that was trained on the training set.

In [26]:
############
### Logistic regression prediction on the claim-evidence pair
############

def predictLogisticRegression(vectorizer,encoder,sentences):
    """Predict a label for each sentence with the stored logistic model.

    Args:
        vectorizer: Object whose ``transform`` turns *sentences* into features.
        encoder: Object whose ``inverse_transform`` maps predictions back to
            labels.
        sentences: Texts to classify.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predictions.
        The shape of the feature matrix is printed along the way.
    """
    logreg = loadModel(LOGISTIC_MODEL)
    features = vectorizer.transform(sentences)
    print(features.shape)
    return encoder.inverse_transform(logreg.predict(features))

# Expand the test set to one row per claim-evidence pair and load the
# transformers saved alongside the models.
testdata_df = loadTestDataDF(dict_doc,test_path)
vectorizer = loadModel(VECTORIZER)
encoder = loadModel(ENCODER)
# NOTE(review): claim and evidence text are concatenated with no separator;
# confirm this matches how the vectorizer was fitted.
sentences = (testdata_df['claim'] + testdata_df['evidence']).tolist()
final_test_label = predictLogisticRegression(vectorizer,encoder,sentences)

# Pair each predicted label with the text it was predicted for.
logistic_result = [(final_test_label[idx], sent) for idx, sent in enumerate(sentences)]

logistic_df = pd.DataFrame(logistic_result,columns=['label','claim_evidence'])
final_logistic_df = mergeDFs(testdata_df,logistic_df)
storeDFasJSON(final_logistic_df,LOGIST_PREDICTION)
final_logistic_df.head()


(18780, 46358)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,evidenceID_x
id,claim_x,label,Unnamed: 3_level_1
100038,Ripon College's student number totaled in at around 840.,SUPPORTS,"[(Ripon_College_-LRB-Wisconsin-RRB-, 1), (List..."
100083,"Kesha was baptized on March 1st, 1987.",NOT ENOUGH INFO,"[(March_2007_lunar_eclipse, 7)]"
100169,Birthday Song (2 Chainz song) was banned by Sonny Digital.,NOT ENOUGH INFO,"[(, 0)]"
100234,The University of Illinois at Chicago is a college.,SUPPORTS,"[(The_University_of_Illinois_vs._a_Mummy, 0), ..."
100359,French Indochina was officially known as the Indochinese Union in England after 1887.,SUPPORTS,"[(French_Indochinese_piastre, 0), (French_Indo..."


# Sequential ANN
Prediction on the test data using the trained and stored ANN.

In [33]:
############
### Predicting: Sequential ANN prediction on the evidence-claim pair
############

def predictANNSequential(vectorizer,encoder,sentences):
    """Predict a label for each sentence with the stored sequential ANN.

    Args:
        vectorizer: Object whose ``transform`` turns *sentences* into
            features. If ``None``, the pickled vectorizer is loaded.
        encoder: Object whose ``inverse_transform`` maps predicted classes
            back to labels. If ``None``, the pickled encoder is loaded.
        sentences: Texts to classify.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predicted
        classes.
    """
    model = loadModel(ANN_MODEL)
    # Use the caller's objects instead of unconditionally re-reading them from
    # disk; loading is only a fallback when none was supplied.
    if vectorizer is None:
        vectorizer = loadModel(VECTORIZER)
    if encoder is None:
        encoder = loadModel(ENCODER)
    test_sentences = vectorizer.transform(sentences)
    # NOTE(review): predict_classes is not available on newer Keras Sequential
    # models; if the pickled model lacks it, this call needs replacing - confirm
    # against the library version used for training.
    test_labels = model.predict_classes(test_sentences)
    return encoder.inverse_transform(test_labels)


# Expand the test set to one row per claim-evidence pair.
testdata_df = loadTestDataDF(dict_doc,test_path)
# NOTE(review): claim and evidence text are concatenated with no separator;
# confirm this matches how the vectorizer was fitted.
sentences = (testdata_df['claim'] + testdata_df['evidence']).tolist()
vectorizer = loadModel(VECTORIZER)
encoder = loadModel(ENCODER)
final_test_label = predictANNSequential(vectorizer,encoder,sentences)

# Pair each predicted label with the text it was predicted for.
sequential_result = [(final_test_label[idx], sent) for idx, sent in enumerate(sentences)]

sequential_df = pd.DataFrame(sequential_result,columns=['label','claim_evidence'])
final_ann_df = mergeDFs(testdata_df,sequential_df)
storeDFasJSON(final_ann_df,ANN_PREDICTION)
final_ann_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,evidenceID_x
id,claim_x,label,Unnamed: 3_level_1
100030,Steve Wozniak designed homes.,NOT ENOUGH INFO,"[(, 0)]"
100046,The Guthrie Theater's second building began operating in 1963.,SUPPORTS,"[(Theatre_building,_Zrenjanin, 0), (Guthrie_Th..."
100060,Kareena Kapoor was initially successful.,SUPPORTS,"[(Kareena_Kapoor, 6), (Kareena_Kapoor, 1), (Ka..."
100088,Men in Black II stars Patrick Stewart.,NOT ENOUGH INFO,"[(Patrick_Stewart, 6), (II_-LRB-Boyz_II_Men_al..."
100138,Brazilian jiu-jitsu is a form of armed combat.,SUPPORTS,"[(Blake_canonical_form, 1), (Boyce–Codd_normal..."


In [34]:
# (rows, columns) of the ANN result frame produced by mergeDFs().
final_ann_df.shape

(14997, 1)