In [2]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split


In [3]:
# Input files: labelled FEVEROUS statements in JSON Lines form (dev and train).
FEVER_FULL_DIR = '/home/ashrafs/projects/dragon/data/fever/full'
full_fev_dev = f'{FEVER_FULL_DIR}/feverous_dev_labelled_statements.jsonl'
full_fev_train = f'{FEVER_FULL_DIR}/feverous_train_labelled_statements.jsonl'

In [23]:
# Output files for the multiple-choice (question/choices/answerKey) records.
FEVER_MAIN_DIR = '/home/ashrafs/projects/dragon/data/fever/Main'
dev_fever_output = f'{FEVER_MAIN_DIR}/dev.jsonl'
train_fever_output = f'{FEVER_MAIN_DIR}/train.jsonl'
test_fever_output = f'{FEVER_MAIN_DIR}/test.jsonl'



In [4]:
# Output files for the flat {id, label, claim} records.
# The directory name is spelled "orginal_splits" on disk; keep it as is.
FEVER_ORG_DIR = '/home/ashrafs/projects/dragon/data/fever/orginal_splits'
dev_fever_org = f'{FEVER_ORG_DIR}/dev.jsonl'
train_fever_org = f'{FEVER_ORG_DIR}/train.jsonl'
test_fever_org = f'{FEVER_ORG_DIR}/test.jsonl'



In [5]:
def load_data(filename):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Returns:
        list: The decoded objects, in the order they appear in the file.
        Blank (whitespace-only) lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: save_data_as_jsonl writes UTF-8, and the
    # platform default encoding is not guaranteed to match it.
    with open(filename, 'r', encoding='utf-8') as file:
        # A blank line (e.g. a trailing newline at end of file) carries no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in file if line.strip()]


In [17]:
# Parse both labelled input files into lists of records (dev first, then train).
dev_data, train_data = (
    load_data(source_path) for source_path in (full_fev_dev, full_fev_train)
)



In [18]:
# Report how many records were loaded for each split.
claim_counts = ("Dev Claims: ", len(dev_data), "Train Claims: ", len(train_data))
print(*claim_counts)

Dev Claims:  7890 Train Claims:  71291


In [8]:
# Display the first loaded training record to see which fields it carries.
train_data[0]

{'CLAIM': 'Michael Folivi competed with ten teams from 2016 to 2021, appearing in 54 games and making seven goals in total.',
 'ENTITY_STATEMENTS': None,
 'LABEL': 'REFUTES',
 'id': 24435,
 'label': 'REFUTES'}

In [32]:
# Preview the first six training claims together with their label and id.
for record in train_data[:6]:
    print(record.get("CLAIM"))
    print("LABEL: ", record.get("LABEL"), " ID: ", record.get("id"))
    print("----------")
    

Michael Folivi competed with ten teams from 2016 to 2021, appearing in 54 games and making seven goals in total.
LABEL:  REFUTES  ID:  24435
----------
Asiatic Society of Bangladesh(housed in Nimtali) is a non political organization renamed in 1972, Ahmed Hasan Dani played an important role in its founding.
LABEL:  SUPPORTS  ID:  14802
----------
Lindfield railway station has 3 bus routes, in which the first platform services routes to Emu plains via Central and Richmond and Hornbys via Strathfield.
LABEL:  SUPPORTS  ID:  28540
----------
Mukaradeeb('Wolf's Den') is a city in Iraq near the Syrian border, in the district of Al-Qa'im, province of Al-Anbar.
LABEL:  SUPPORTS  ID:  71874
----------
Herbivore men was coined by Maki Fukasawa and was a negative connotation as as young men who had lost their "manliness" and were responsible for Japan's declining birth rate which is actually due to high life expectancy and low fertility rate  among other things.
LABEL:  SUPPORTS  ID:  70296
----

In [33]:
def convert_to_fever_format(data_list):
    """Convert labelled claims into multiple-choice question records.

    Every input record is read with ``.get`` for the keys ``"id"``,
    ``"CLAIM"`` and ``"LABEL"``; other keys are ignored.

    Args:
        data_list: Iterable of dict-like claim records.

    Returns:
        list[dict]: One item per input record, shaped as::

            {"id": ..., "question": {"stem": "<claim>, which:",
                                     "choices": [{"text", "label"} x 3]},
             "answerKey": "A" | "B" | "C"}

        ``answerKey`` is "A" for SUPPORTS, "B" for REFUTES and "C" for
        NOT ENOUGH INFO. A missing or unrecognised label falls back to "C".
    """
    label_to_key = {"SUPPORTS": "A", "REFUTES": "B", "NOT ENOUGH INFO": "C"}
    # Fallback for missing/unknown labels. Previously such records ended up
    # with answerKey None although the code was documented to default to "C".
    default_key = label_to_key["NOT ENOUGH INFO"]
    placeholder_text = "which:"

    fever_format_data = []
    for data in data_list:
        fever_format_data.append({
            "id": data.get("id"),
            "question": {
                "stem": f"{data.get('CLAIM')}, {placeholder_text}",
                # Built from label_to_key so the choices and the answer keys
                # cannot drift apart; a fresh list is created for every item.
                "choices": [
                    {"text": text, "label": key}
                    for text, key in label_to_key.items()
                ],
            },
            "answerKey": label_to_key.get(data.get("LABEL"), default_key),
        })

    return fever_format_data



In [34]:
# Turn both raw splits into multiple-choice records (dev first, then train).
converted_dev, converted_train = map(convert_to_fever_format, (dev_data, train_data))

In [37]:
# Spot-check the first converted training item.
print(converted_train[0])

{'id': 24435, 'question': {'stem': 'Michael Folivi competed with ten teams from 2016 to 2021, appearing in 54 games and making seven goals in total., which:', 'choices': [{'text': 'SUPPORTS', 'label': 'A'}, {'text': 'REFUTES', 'label': 'B'}, {'text': 'NOT ENOUGH INFO', 'label': 'C'}]}, 'answerKey': 'B'}


In [11]:
def save_data_as_jsonl(data_subset, jsonl_file_path):
    """Write every item of ``data_subset`` as one JSON line.

    The target is opened in write mode with UTF-8 encoding, so any existing
    content at ``jsonl_file_path`` is replaced.

    Args:
        data_subset: Iterable of JSON-serialisable objects.
        jsonl_file_path: Destination path of the ``.jsonl`` file.
    """
    with open(jsonl_file_path, 'w', encoding='utf-8') as out_file:
        # One serialised object per line, each terminated by a newline.
        out_file.writelines(json.dumps(record) + '\n' for record in data_subset)


In [None]:
 
# Halve the converted dev set into dev/test splits (fixed seed for repeatability).
# The dev half is bound to `dev_split`, not `dev_data`: `dev_data` holds the raw
# records loaded from `full_fev_dev`, and `convert_orginal_format(dev_data)` later
# in the notebook reads their "CLAIM"/"LABEL" keys, which converted items lack.
dev_split, test_data = train_test_split(converted_dev, test_size=0.5, random_state=42)

# Save each subset to its corresponding jsonl file
save_data_as_jsonl(converted_train, train_fever_output)
save_data_as_jsonl(dev_split, dev_fever_output)
save_data_as_jsonl(test_data, test_fever_output)


In [16]:
def convert_orginal_format(data_list):
    """Reduce claim records to flat ``{"id", "label", "claim"}`` dicts.

    Args:
        data_list: Iterable of dict-like records; the keys ``"id"``,
            ``"LABEL"`` and ``"CLAIM"`` are read with ``.get``, so a missing
            key yields ``None`` in the output.

    Returns:
        list[dict]: One flat record per input record, in input order.
    """
    return [
        {
            "id": record.get("id"),
            "label": record.get("LABEL"),
            "claim": record.get("CLAIM"),
        }
        for record in data_list
    ]
# Flatten the raw claims. `train_data` and `dev_data` are expected to be the
# records returned by load_data (dicts with "CLAIM"/"LABEL"/"id" keys).
ctrain_data = convert_orginal_format(train_data)
cdev_data = convert_orginal_format(dev_data)

# Halve the flattened dev set into dev/test splits with a fixed seed.
dev_set, test_set = train_test_split(cdev_data, random_state=42, test_size=0.5)

# Save each subset to its corresponding jsonl file
for subset, target_path in (
    (ctrain_data, train_fever_org),
    (dev_set, dev_fever_org),
    (test_set, test_fever_org),
):
    save_data_as_jsonl(subset, target_path)
    