**Load Data**

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
import pandas as pd
import json
import string
remove_punc = str.maketrans('', '', string.punctuation)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [2]:
print(pd.__version__)

1.4.0


In [3]:
dev_claim_file = "data/train-claims.json"
data = []
with open(dev_claim_file, "r") as file:
    data = json.load(file)

rows = []
for claim_id, item in data.items():
    row = {'claim_id': claim_id}
    row.update(item)
    rows.append(row)

df = pd.DataFrame(rows)


In [7]:
df.shape

(1228, 4)

In [88]:
df.head()

Unnamed: 0,claim_id,claim_text,claim_label,evidences
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
2,claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
3,claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
4,claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [13]:
df.iloc[0]["claim_text"]

'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

In [4]:
evidence_file_path = "data/filtered_evidence.csv"

evi_df = pd.read_csv(evidence_file_path)

evi_df['evidence'] = evi_df['evidence'].fillna('')

evi_df


Unnamed: 0,evi_id,evidence
0,evidence-0,Bennet entrepreneur and agricultural scientist
1,evidence-1,his professional career at the age of eventual...
2,evidence-2,Boston Ladies of by Vampire Weekend
3,evidence-3,born was a professional ice hockey player who ...
4,evidence-4,He of function in mania psychosis and how oxyt...
...,...,...
1208172,evidence-1208822,Also on the property is a garage apartment
1208173,evidence-1208823,class
1208174,evidence-1208824,Dragon Storm game a game and collectible card ...
1208175,evidence-1208825,It that the which is so great a realm that fro...


In [90]:
evi_df.iloc[1937]["evidence"]

'also a solo version at a Peel session on which surfaced as a free single with the first of his second album This Year Model in the United Kingdom'

In [7]:
tokenized_claims = [word_tokenize(claim.lower().translate(remove_punc)) for claim in df['claim_text']]

In [11]:
tokenized_evidences = [word_tokenize(evidence_text.lower().translate(remove_punc)) for evidence_text in evi_df['evidence']]

In [12]:
def lemma_token(tokens):
    return [lemmatizer.lemmatize(tokens) for token in tokens if token not in stop_words]


**Use BM25**

In [13]:
bm25 = BM25Okapi(tokenized_evidences)
import numpy as np

In [14]:
def choose_n_evidences(bm25, claim_tokens, n):
    scores = bm25.get_scores(claim_tokens)
    percentile_threshold = np.percentile(scores, 99)
    relevant_evidence = [(index, score) for index, score in enumerate(scores) if score >= percentile_threshold]
    relevant_evidence.sort(key=lambda x: x[1], reverse = True)
    return relevant_evidence[:n]

In [15]:
max_num_evidence = 20

relevant_evidences = []
for index, claim_tokens in enumerate(tokenized_claims):
    relevant_evidences.append(choose_n_evidences(bm25, claim_tokens, max_num_evidence))
    

In [16]:
relevant_evidences

[[(526183, 26.206844273820433),
  (788566, 26.160131610586376),
  (584172, 25.963715977382634),
  (670726, 25.712559160225144),
  (1198526, 25.648114107016276),
  (1003150, 25.230638841963756),
  (451863, 25.121788104514557),
  (714276, 24.935191077332263),
  (822714, 24.91611764994702),
  (467295, 24.32346836758795),
  (430936, 24.251235350546864),
  (767938, 23.99808973792848),
  (929751, 23.573446512348518),
  (476445, 23.51033947661372),
  (552698, 23.42358204438583),
  (957943, 23.415778465445264),
  (841259, 23.388939678790283),
  (96522, 23.18861697015283),
  (529275, 23.176268184348714),
  (1184180, 22.40610396176519)],
 [(535648, 19.332682445360295),
  (649842, 19.026697495426962),
  (485097, 18.58086562862452),
  (280172, 18.25280278349097),
  (722104, 17.90462910778604),
  (218469, 17.812148114113437),
  (604139, 17.617236517720094),
  (1191975, 17.259129062961414),
  (605938, 17.086359933753137),
  (720078, 17.08633037643215),
  (1091507, 16.89493313732451),
  (1062291, 16.

In [4]:
import csv
with open('data/relevant_evidences_bm25.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(relevant_evidences)

NameError: name 'relevant_evidences' is not defined

In [5]:
pred_evi_df = pd.read_csv("data/relevant_evidences_bm25.csv", header=None)
pred_evi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,"(526183, 26.206844273820433)","(788566, 26.160131610586376)","(584172, 25.963715977382634)","(670726, 25.712559160225144)","(1198526, 25.648114107016276)","(1003150, 25.230638841963756)","(451863, 25.121788104514557)","(714276, 24.935191077332263)","(822714, 24.91611764994702)","(467295, 24.32346836758795)","(430936, 24.251235350546864)","(767938, 23.99808973792848)","(929751, 23.573446512348518)","(476445, 23.51033947661372)","(552698, 23.42358204438583)","(957943, 23.415778465445264)","(841259, 23.388939678790283)","(96522, 23.18861697015283)","(529275, 23.176268184348714)","(1184180, 22.40610396176519)"
1,"(535648, 19.332682445360295)","(649842, 19.026697495426962)","(485097, 18.58086562862452)","(280172, 18.25280278349097)","(722104, 17.90462910778604)","(218469, 17.812148114113437)","(604139, 17.617236517720094)","(1191975, 17.259129062961414)","(605938, 17.086359933753137)","(720078, 17.08633037643215)","(1091507, 16.89493313732451)","(1062291, 16.54660733219582)","(1304, 16.263198010948127)","(535236, 16.25174471390712)","(87025, 15.999491582529904)","(82455, 15.982621808007664)","(948429, 15.929082237747393)","(255963, 15.892656727154087)","(236743, 15.813910470011685)","(1123839, 15.791691034516607)"
2,"(354406, 18.634963010759726)","(32714, 16.247503078996402)","(420340, 15.852583356499556)","(890448, 14.582380120964848)","(938419, 14.335063971506692)","(30117, 13.400735462393865)","(326168, 12.945648333031855)","(84931, 12.905511429521301)","(667018, 12.905511429521301)","(945995, 12.824541033488142)","(132606, 12.750984304123543)","(1072698, 12.522335597687775)","(298575, 12.516839390743808)","(785930, 12.462659134155064)","(397274, 12.390060980670238)","(172207, 12.349904482650121)","(61934, 12.348885362756645)","(1185892, 12.29587116847373)","(171463, 12.295853463740183)","(586770, 12.136215827141132)"
3,"(954043, 30.51442560963188)","(929472, 29.861829955905037)","(1093889, 27.964347901118657)","(448166, 27.902655664893217)","(782003, 27.87629469524091)","(268594, 27.59982896611441)","(352533, 27.425207174322487)","(1007327, 27.077269998507504)","(768563, 26.93642904131319)","(98065, 26.514421003355825)","(1204969, 26.463937307292802)","(230540, 26.447523521408783)","(877251, 25.987082670006966)","(1037233, 25.65613826496373)","(863017, 25.164593840154325)","(430024, 24.989488887583818)","(352459, 24.933156916005355)","(217615, 24.527047119883317)","(515015, 24.355506973899956)","(264112, 24.079887992562902)"
4,"(343384, 25.86951425469598)","(398073, 24.23868040410204)","(1168278, 22.699153688833572)","(498022, 22.193229219668563)","(898144, 21.96407929600911)","(813869, 21.75176807436558)","(507268, 21.025358343899292)","(70804, 20.655715694599188)","(265950, 19.997891464335254)","(1042093, 19.53196762280783)","(429819, 19.513595989236375)","(1076031, 19.50964400236782)","(473260, 19.487000577880696)","(988380, 19.37713762942768)","(246789, 19.24088987872951)","(402596, 19.186753882372436)","(914114, 18.839949820103868)","(859676, 18.692819969563647)","(828090, 18.680766318619263)","(740292, 18.671073766478408)"


In [83]:
pred_evi_df.shape

(1228, 20)

In [6]:
import ast
pred_evi = []
for item in pred_evi_df.values:
    pred_evi_id = []
    for tup in item:
        pred_evi_id.append("evidence-" + str(ast.literal_eval(tup)[0]))
    pred_evi.append(pred_evi_id)

In [85]:
pred_evi

[['evidence-526183',
  'evidence-788566',
  'evidence-584172',
  'evidence-670726',
  'evidence-1198526',
  'evidence-1003150',
  'evidence-451863',
  'evidence-714276',
  'evidence-822714',
  'evidence-467295',
  'evidence-430936',
  'evidence-767938',
  'evidence-929751',
  'evidence-476445',
  'evidence-552698',
  'evidence-957943',
  'evidence-841259',
  'evidence-96522',
  'evidence-529275',
  'evidence-1184180'],
 ['evidence-535648',
  'evidence-649842',
  'evidence-485097',
  'evidence-280172',
  'evidence-722104',
  'evidence-218469',
  'evidence-604139',
  'evidence-1191975',
  'evidence-605938',
  'evidence-720078',
  'evidence-1091507',
  'evidence-1062291',
  'evidence-1304',
  'evidence-535236',
  'evidence-87025',
  'evidence-82455',
  'evidence-948429',
  'evidence-255963',
  'evidence-236743',
  'evidence-1123839'],
 ['evidence-354406',
  'evidence-32714',
  'evidence-420340',
  'evidence-890448',
  'evidence-938419',
  'evidence-30117',
  'evidence-326168',
  'evidence

In [7]:
# Create DataFrame from the flattened list
pred_evi_df = pd.DataFrame({'pred_evidence': pred_evi})

In [8]:
pred_evi_df.head()

Unnamed: 0,pred_evidence
0,"[evidence-526183, evidence-788566, evidence-58..."
1,"[evidence-535648, evidence-649842, evidence-48..."
2,"[evidence-354406, evidence-32714, evidence-420..."
3,"[evidence-954043, evidence-929472, evidence-10..."
4,"[evidence-343384, evidence-398073, evidence-11..."


In [88]:
pred_evi_df.shape

(1228, 1)

In [16]:
pred_evi_df = pd.concat([df, pred_evi_df], axis = 1)
pred_evi_df.head()

Unnamed: 0,claim_id,claim_text,claim_label,evidences,pred_evidence
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1...","[evidence-526183, evidence-788566, evidence-58..."
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]","[evidence-535648, evidence-649842, evidence-48..."
2,claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]","[evidence-354406, evidence-32714, evidence-420..."
3,claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5...","[evidence-954043, evidence-929472, evidence-10..."
4,claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72...","[evidence-343384, evidence-398073, evidence-11..."


**Use NER**

In [9]:
import spacy
spacy_nlp = spacy.load('en_core_web_sm')

with open('data/final_ner.json', 'r') as file:
    ner_data = json.loads(file.read())


In [20]:
ner_df = []
hurdle = 500

for claim_id, claim_text in df[['claim_id', 'claim_text']].itertuples(index=False):
    doc = spacy_nlp(claim_text)
    for ent in doc.ents:
        key = ent.text.lower().translate(remove_punc)
        if ent.label_ in ner_data and key in ner_data[ent.label_]:
            if len(ner_data[ent.label_][key]) <= hurdle:
                ner_df.extend([{"claim_id": claim_id, "evidence_id": evidence} for evidence in ner_data[ent.label_][key]])

In [22]:
ner_df = pd.DataFrame(ner_df)


In [23]:
ner_df.iloc[:50]

Unnamed: 0,claim_id,evidence_id
0,claim-2021,evidence-230669
1,claim-2021,evidence-954557
2,claim-949,evidence-177
3,claim-949,evidence-408
4,claim-949,evidence-818
5,claim-949,evidence-2137
6,claim-949,evidence-3198
7,claim-949,evidence-6636
8,claim-949,evidence-8015
9,claim-949,evidence-18368


In [14]:
ner_df.shape

(30043, 2)

In [17]:
ner_relevant_df = pd.DataFrame(columns=['claim-id', 'evidence'])
for claim_id, pred_evi in pred_evi_df[['claim_id', 'pred_evidence']].values:
    for evi in pred_evi:
        ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)

ner_relevant_df = pd.concat([ner_relevant_df, ner_df]).reset_index(drop=True).rename({'claim': "claim_id", "evidence": 'evidence_id'}, axis = 1)

  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id': claim_id, 'evidence': evi}, ignore_index = True)
  ner_relevant_df = ner_relevant_df.append({'claim_id':

In [18]:
ner_relevant_df.iloc[:,1:-1].head()

Unnamed: 0,evidence_id,claim_id
0,evidence-526183,claim-1937
1,evidence-788566,claim-1937
2,evidence-584172,claim-1937
3,evidence-670726,claim-1937
4,evidence-1198526,claim-1937


In [104]:
ner_relevant_df.iloc[:,1:-1].shape

(54603, 2)

In [105]:
ner_relevant_df.iloc[:,1:-1].to_csv("data/ner_pred.csv", index= False)

In [106]:
ner_data = pd.read_csv("data/ner_pred.csv")
ner_data.iloc[:50]

Unnamed: 0,evidence_id,claim_id
0,evidence-526183,claim-1937
1,evidence-788566,claim-1937
2,evidence-584172,claim-1937
3,evidence-670726,claim-1937
4,evidence-1198526,claim-1937
5,evidence-1003150,claim-1937
6,evidence-451863,claim-1937
7,evidence-714276,claim-1937
8,evidence-822714,claim-1937
9,evidence-467295,claim-1937
