In [1]:
import pandas as pd
import json
import re

import string
remove_punc = str.maketrans('', '', string.punctuation)

**Create new Data Frame**

In [2]:
dev_claim_file = "data/train-claims.json"
data = []
with open(dev_claim_file, "r") as file:
    data = json.load(file)

rows = []
for claim_id, item in data.items():
    row = {'claim_id': claim_id}
    row.update(item)
    rows.append(row)

df = pd.DataFrame(rows)
df.shape

(1228, 4)

In [3]:
df.head()

Unnamed: 0,claim_id,claim_text,claim_label,evidences
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,"[evidence-442946, evidence-1194317, evidence-1..."
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,"[evidence-338219, evidence-1127398]"
2,claim-2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[evidence-530063, evidence-984887]"
3,claim-2021,Weather Channel co-founder John Coleman provid...,DISPUTED,"[evidence-1177431, evidence-782448, evidence-5..."
4,claim-2449,"""January 2008 capped a 12 month period of glob...",NOT_ENOUGH_INFO,"[evidence-1010750, evidence-91661, evidence-72..."


In [4]:
exploded_df = df.explode('evidences')
exploded_df

Unnamed: 0,claim_id,claim_text,claim_label,evidences
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-442946
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-1194317
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-12171
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-338219
1,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-1127398
...,...,...,...,...
1226,claim-502,But abnormal temperature spikes in February an...,NOT_ENOUGH_INFO,evidence-583187
1227,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-971105
1227,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-457769
1227,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-298971


In [5]:
exploded_df.reset_index(drop=True, inplace=True)
exploded_df

Unnamed: 0,claim_id,claim_text,claim_label,evidences
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-442946
1,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-1194317
2,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-12171
3,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-338219
4,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-1127398
...,...,...,...,...
4117,claim-502,But abnormal temperature spikes in February an...,NOT_ENOUGH_INFO,evidence-583187
4118,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-971105
4119,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-457769
4120,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-298971


In [6]:
exploded_df.shape

(4122, 4)

In [3]:
evidence_file_path = "data/evidence.json"
evidence = []
with open(evidence_file_path, "r") as file:
    evidence = json.load(file)

evidence


{'evidence-0': 'John Bennet Lawes, English entrepreneur and agricultural scientist',
 'evidence-1': 'Lindberg began his professional career at the age of 16, eventually moving to New York City in 1977.',
 'evidence-2': "``Boston (Ladies of Cambridge)'' by Vampire Weekend",
 'evidence-3': 'Gerald Francis Goyer (born October 20, 1936) was a professional ice hockey player who played 40 games in the National Hockey League.',
 'evidence-4': 'He detected abnormalities of oxytocinergic function in schizoaffective mania, post-partum psychosis and how ECT modified oxytocin release.',
 'evidence-5': 'With peak winds of 110 mph (175 km/h) and a minimum pressure of 972 mbar (hPa ; 28.71 inHg), Florence was the strongest storm of the 1994 Atlantic hurricane season.',
 'evidence-6': 'He is currently a professor of piano at the University of Wisconsin -- Madison since August 2000.',
 'evidence-7': 'In addition to known and tangible risks, unforeseeable black swan extinction events may occur, presenti

In [4]:
evidence_rows = []
for evidence_id, item in evidence.items():
    row = {'evidence_id': evidence_id, 'text': item}
    evidence_rows.append(row)

evidence_df = pd.DataFrame(evidence_rows)
evidence_df.head()



Unnamed: 0,evidence_id,text
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
1,evidence-1,Lindberg began his professional career at the ...
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...


In [9]:
print(evidence_df.shape)

(1208827, 2)


In [20]:
import nltk
nltk.download('words')

english_words = set(nltk.corpus.words.words())

def remove_non_english_evidence(text):
    tokens = nltk.word_tokenize(text)
    return any(token.lower() not in english_words for token in tokens)

def remain_english_words(text):
    tokens = nltk.word_tokenize(text)
    return " ".join([token for token in tokens if token.lower() in english_words])



[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rishe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [29]:
evi_df = pd.read_json(evidence_file_path, orient="index").reset_index().rename({0: "evidence", "index": "evi_id"}, axis=1)
evi_df.head()

Unnamed: 0,evi_id,evidence
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
1,evidence-1,Lindberg began his professional career at the ...
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...


In [30]:
evi_df = evi_df[evi_df["evidence"].apply(remove_non_english_evidence)]

In [33]:
evi_df

Unnamed: 0,evi_id,evidence
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag..."
1,evidence-1,Lindberg began his professional career at the ...
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...
...,...,...
1208822,evidence-1208822,Also on the property is a contributing garage ...
1208823,evidence-1208823,| class = ``fn org'' | Fyrde | | | | 6110 | | ...
1208824,evidence-1208824,"Dragon Storm (game), a role-playing game and c..."
1208825,evidence-1208825,It states that the Zeriuani ``which is so grea...


In [38]:
evi_df['evidence'] = evi_df['evidence'].apply(remain_english_words)

In [41]:
evi_df.iloc[100:110]

Unnamed: 0,evi_id,evidence
100,evidence-100,The sauce may be depending on where the Michig...
101,evidence-101,novel a novel by Goodman
102,evidence-102,The increasing light pollution due to the grow...
103,evidence-103,is the third studio album by singer first on b...
104,evidence-104,All the in the are
105,evidence-105,Blue Eyed is a documentary film by Bertram in ...
106,evidence-106,Starring Simple in the lead and music composed by
107,evidence-107,His grandson is now carrying forward the legac...
108,evidence-108,At the time it was known as the Year of the Co...
109,evidence-109,This is a list of for in science and more gene...


In [40]:
evi_df.shape

(1208177, 2)

In [42]:
evi_df.to_csv("data/filtered_evidence.csv", index = False)

In [11]:
evidence_id_list = []
for i in exploded_df['evidences']:
    evidence_id_list.append(evi_df[evi_df['evi_id'] == i]['evidence'].iloc[0])

new_evidence_df = pd.DataFrame(evidence_id_list, columns=['evidence text'])

In [12]:
new_evidence_df.head()

Unnamed: 0,evidence text
0,At very high concentrations (100 times atmosph...
1,Plants can grow as much as 50 percent faster i...
2,Higher carbon dioxide concentrations will favo...
3,While ‘climate change’ can be due to natural f...
4,This acceleration is due mostly to human-cause...


In [13]:
new_evidence_df.shape

(4122, 1)

In [14]:
new_claim_df = pd.concat([exploded_df, new_evidence_df], axis = 1)
new_claim_df

Unnamed: 0,claim_id,claim_text,claim_label,evidences,evidence text
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-442946,At very high concentrations (100 times atmosph...
1,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-1194317,Plants can grow as much as 50 percent faster i...
2,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-12171,Higher carbon dioxide concentrations will favo...
3,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-338219,While ‘climate change’ can be due to natural f...
4,claim-126,El Niño drove record highs in global temperatu...,REFUTES,evidence-1127398,This acceleration is due mostly to human-cause...
...,...,...,...,...,...
4117,claim-502,But abnormal temperature spikes in February an...,NOT_ENOUGH_INFO,evidence-583187,The coastline sees significantly mild temperat...
4118,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-971105,"Dielectric heating, also known as electronic h..."
4119,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-457769,An example is absorption or emission of radio ...
4120,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,evidence-298971,"Water, fat, and other substances in the food a..."


In [15]:
new_claim_df.drop(columns=['evidences'])

Unnamed: 0,claim_id,claim_text,claim_label,evidence text
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,At very high concentrations (100 times atmosph...
1,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,Plants can grow as much as 50 percent faster i...
2,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,Higher carbon dioxide concentrations will favo...
3,claim-126,El Niño drove record highs in global temperatu...,REFUTES,While ‘climate change’ can be due to natural f...
4,claim-126,El Niño drove record highs in global temperatu...,REFUTES,This acceleration is due mostly to human-cause...
...,...,...,...,...
4117,claim-502,But abnormal temperature spikes in February an...,NOT_ENOUGH_INFO,The coastline sees significantly mild temperat...
4118,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,"Dielectric heating, also known as electronic h..."
4119,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,An example is absorption or emission of radio ...
4120,claim-3093,Sending oscillating microwaves from an antenna...,SUPPORTS,"Water, fat, and other substances in the food a..."


In [16]:
new_claim_df.to_csv('new-claim-evidence.csv')

**Spacy**

In [44]:
import spacy
nlp = spacy.load("en_core_web_sm")
from collections import defaultdict
ner_map = defaultdict(lambda: defaultdict(list))

In [45]:
LOWER = 0
UPPER = 1210
file_num = 1
BATCH_SIZE = 1000

for epoch in range(LOWER, UPPER):
    cur_data = evi_df.loc[epoch*BATCH_SIZE:(epoch+1)*BATCH_SIZE].values

    for id, evidence in cur_data:
        doc = nlp(evidence)
        for ent in doc.ents:
            ner_map[ent.label_][ent.text.lower().translate(remove_punc)].append(id)
    
    if epoch%50 == 0:
        file_name = "./data/" + str(file_num) + ".json"
        with open(file_name, 'w') as f:
            json.dump(ner_map, f)
        print(f"File saved: {file_name}, epoch: {epoch}")
        file_num += 1
    
    
file_name = "./data/" + str(file_num) + ".json"
with open(file_name, 'w') as f:
    json.dump(ner_map, f)
print(f"File saved: {file_name}")

File saved: ./data/1.json, epoch: 0
File saved: ./data/2.json, epoch: 50
File saved: ./data/3.json, epoch: 100
File saved: ./data/4.json, epoch: 150
File saved: ./data/5.json, epoch: 200
File saved: ./data/6.json, epoch: 250
File saved: ./data/7.json, epoch: 300
File saved: ./data/8.json, epoch: 350
File saved: ./data/9.json, epoch: 400
File saved: ./data/10.json, epoch: 450
File saved: ./data/11.json, epoch: 500
File saved: ./data/12.json, epoch: 550
File saved: ./data/13.json, epoch: 600
File saved: ./data/14.json, epoch: 650
File saved: ./data/15.json, epoch: 700
File saved: ./data/16.json, epoch: 750
File saved: ./data/17.json, epoch: 800
File saved: ./data/18.json, epoch: 850
File saved: ./data/19.json, epoch: 900
File saved: ./data/20.json, epoch: 950
File saved: ./data/21.json, epoch: 1000
File saved: ./data/22.json, epoch: 1050
File saved: ./data/23.json, epoch: 1100
File saved: ./data/24.json, epoch: 1150
File saved: ./data/25.json, epoch: 1200
File saved: ./data/26.json


In [46]:
path = "./data/"
import os

total_dict = defaultdict(lambda: defaultdict(list))
file_name = 1
while file_name <= 26:
    with open(path + str(file_name)+".json", "r") as current_file:
        current_dict = json.loads(current_file.read())
    
    for ner_key in current_dict.keys():
        for key in current_dict[ner_key].keys():
            total_dict[ner_key][key] += current_dict[ner_key][key]
    print(file_name)
    file_name += 1

final_file_name = "./data/final_ner.json"
with open(final_file_name, 'w') as f:
    json.dump(ner_map, f)
print(f"File saved: {final_file_name}")


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
File saved: ./data/final_ner.json


**Find relevant evidences**

In [43]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [20]:
tokenized_claims = [word_tokenize(claim.lower().translate(remove_punc)) for claim in df['claim_text']]

lemmatized_claims = []
for claim in tokenized_claims:
    lemmatized_claim = [lemmatizer.lemmatize(word) for word in claim if word not in stop_words]
    lemmatized_claims.append(lemmatized_claim)
print(len(tokenized_claims))
print(len(lemmatized_claims))

1228
1228


In [21]:
tokenized_evidences = [word_tokenize(evidence_text.lower().translate(remove_punc)) for evidence_text in evi_df['evidence']]

In [22]:
lemmatized_evidences = []
for evidence in tokenized_evidences:
    lemmatized_evidence = [lemmatizer.lemmatize(word) for word in evidence if word not in stop_words]
    lemmatized_evidences.append(lemmatized_evidence)
print(len(tokenized_evidences))
print(len(lemmatized_evidences))

KeyboardInterrupt: 

In [23]:
len(tokenized_evidences)

1208827

In [24]:
bm25 = BM25Okapi(lemmatized_evidences)

In [25]:
import numpy as np

In [None]:

for query in lemmatized_claims:
    scores = np.array(bm25.get_scores(query))
    percentile_threshold = np.percentile(scores, 99)
    relevant_evidence = [(index, lemmatized_evidences[index], score) for index, score in enumerate(scores) if score >= percentile_threshold]
    
    print(relevant_evidence[0])

(213, ['grassland', 'lower', 'elevation', 'forest', 'higher', 'elevation'], 7.8462932779924275)
(130, ['example', 'temperature', 'specific', 'volume', 'always', 'independent'], 7.730970787468927)


KeyboardInterrupt: 

In [None]:
sample_claim = "Extreme melting and changes to the climate like this has released pressure on to the continent, allowing the ground to rise up."
sample_tokens = [word_tokenize(sample_claim.lower().translate(remove_punc))]
print(sample_tokens)
sample_lemmatized_claim = [lemmatizer.lemmatize(word) for word in sample_claim if word not in stop_words]
bm25 = BM25Okapi(tokenized_evidences)
scores = bm25.get_scores(sample_tokens[0])
percentile_threshold = np.percentile(scores, 99)
relevant_evidence = [(index, tokenized_evidences[index], score) for index, score in enumerate(scores) if score >= percentile_threshold]

[['extreme', 'melting', 'and', 'changes', 'to', 'the', 'climate', 'like', 'this', 'has', 'released', 'pressure', 'on', 'to', 'the', 'continent', 'allowing', 'the', 'ground', 'to', 'rise', 'up']]


In [None]:
# print(len(relevant_evidence))
relevant_evidence = sorted(relevant_evidence, key = lambda x : x[2], reverse=True)
for elem in relevant_evidence[:40]:
    print(elem[0], elem[2])

169769 38.27316838094465
596388 37.75736176359133
56634 37.72956214106594
759563 36.81874701365423
122455 36.71299019746487
434765 36.16956822256668
937636 36.092673761295984
146256 35.614518736162196
107210 35.170497392365796
265135 35.149539251629065
2462 35.081483057823476
18263 34.44234993731739
1103001 34.367225334438785
1204872 34.126147968558534
754332 33.94660440636235
852366 33.94388615440573
343063 33.93022484783556
1147532 33.853272516589506
710636 33.770211987887635
406140 33.70680727162011
57280 33.65111751161397
978946 33.63671136413744
785293 33.60456963713895
665725 33.555923729003
405121 33.543146718590364
1134336 33.508967031188995
741666 33.50630631494984
987017 33.45637840423073
344321 33.39116805087777
363469 33.33248169041475
369342 33.329309756136745
64777 33.31937481080657
100920 33.09485670700442
739793 32.93889375284553
320984 32.85325528246066
706951 32.77657635966524
118923 32.69998994691953
865726 32.59333791911197
115287 32.56897774019351
906956 32.4400606