In [31]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import pandas as pd

In [32]:
# Input JSONL files holding the dev / train / test splits.
_fever_split_dir = '/home/ashrafs/projects/dragon/data/fever/Main'
dev_fever_data = f'{_fever_split_dir}/dev-split.jsonl'
train_fever_data = f'{_fever_split_dir}/train-split.jsonl'
test_fever_data = f'{_fever_split_dir}/test-split.jsonl'


In [33]:
# Destination JSONL files for the converted dev / train / test splits.
_fever_output_dir = '/home/ashrafs/projects/dragon/data/fever/Main'
dev_fever_output = f'{_fever_output_dir}/dev.jsonl'
train_fever_output = f'{_fever_output_dir}/train.jsonl'
test_fever_output = f'{_fever_output_dir}/test.jsonl'



In [34]:
def load_data(filename):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filename: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        list: The parsed value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly; the platform default encoding is not
    # guaranteed to handle non-ASCII text in the records.
    with open(filename, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. an extra newline at the end of
        # the file) are skipped, since json.loads would reject them.
        return [json.loads(line) for line in file if line.strip()]



In [35]:
# Parse each split file into a list of records (dev, then train, then test).
dev_data, train_data, test_data = (
    load_data(split_path)
    for split_path in (dev_fever_data, train_fever_data, test_fever_data)
)

# Number of records in the dev split; left as the last expression so the
# notebook displays it.
number_of_claims = len(dev_data)
number_of_claims


10

In [36]:
# Print every dev record: the claim, then its label and id, then a separator.
for data in dev_data:
    print(
        data.get("CLAIM"),
        f"LABEL:  {data.get('LABEL')}  ID:  {data.get('id')}",
        "----------",
        sep="\n",
    )
    

Abraham Annan Adjei has never played professional football.
LABEL:  REFUTES  ID:  21641
----------
Rex Jory remained a copy boy until his retirement.
LABEL:  REFUTES  ID:  6293
----------
Italian film actor Peppe Lanzetta has appeared in films since his first, Blues metropolitano, in 1985 to his most recent, Due soldati, in 2017.
LABEL:  SUPPORTS  ID:  9220
----------
Two ships were named after U.S. Navy enlistee, Charles Ausburne, after he stood to duty when the Antilles,a ship that was chartered by the Army for troop transport, was torpedoed amd sunk by a German U-Boat, under him.
LABEL:  SUPPORTS  ID:  2021
----------
UA Valettoise was founded in 1948, is in the CFA 2 Group E league, and is based in the Provence-Alpes-Côte d'Azur region in southeastern France.
LABEL:  SUPPORTS  ID:  2117
----------
Juventus, colloquially known as Juve (pronounced [ˈjuːve), is a professional football club based in Turin, Piedmont, Italy, that competes in the Serie A, the top flight of Italian footbal

In [38]:
def convert_to_fever_format(data_list):
    """Convert claim records into multiple-choice question items.

    Each record is read with ``.get`` for the keys ``"id"``, ``"CLAIM"`` and
    ``"LABEL"``. The claim becomes the question stem (suffixed with
    ``", which:"``) and the three verdicts become the answer choices.

    Args:
        data_list: Iterable of dict-like records.

    Returns:
        list[dict]: One item per input record, in order, with the keys
        ``"id"``, ``"question"`` (holding ``"stem"`` and ``"choices"``) and
        ``"answerKey"``. ``"answerKey"`` is "A", "B" or "C"; records whose
        label is missing or not one of the three known verdicts get "C".
    """
    label_to_key = {"SUPPORTS": "A", "REFUTES": "B", "NOT ENOUGH INFO": "C"}
    # Fallback answer for missing / unrecognised labels.
    default_key = label_to_key["NOT ENOUGH INFO"]
    placeholder_text = "which:"

    fever_format_data = []
    for data in data_list:
        fever_item = {
            "id": data.get("id"),
            "question": {
                "stem": f"{data.get('CLAIM')}, {placeholder_text}",
                # Derived from label_to_key so choices and answer keys cannot
                # drift apart; rebuilt per item so items share no mutable state.
                "choices": [
                    {"text": text, "label": key}
                    for text, key in label_to_key.items()
                ],
            },
            # Default to "C" if the label is missing or not recognised.
            "answerKey": label_to_key.get(data.get("LABEL"), default_key),
        }
        fever_format_data.append(fever_item)

    return fever_format_data



In [39]:
# Convert each loaded split into the multiple-choice item format.
converted_dev, converted_train, converted_test = map(
    convert_to_fever_format,
    (dev_data, train_data, test_data),
)

# Show the first converted dev item as a sanity check.
print(converted_dev[0])


{'id': 21641, 'question': {'stem': 'Abraham Annan Adjei has never played professional football., which:', 'choices': [{'text': 'SUPPORTS', 'label': 'A'}, {'text': 'REFUTES', 'label': 'B'}, {'text': 'NOT ENOUGH INFO', 'label': 'C'}]}, 'answerKey': 'B'}


In [29]:
def save_data_as_jsonl(data_subset, jsonl_file_path):
    """Write an iterable of JSON-serialisable items as a JSON Lines file.

    Args:
        data_subset: Iterable of items; each is serialised with the ``json``
            module's default settings.
        jsonl_file_path: Destination path. The file is opened in ``'w'``
            mode, so any existing content is replaced.

    Returns:
        None. Each item is written on its own line, terminated by a newline.
    """
    with open(jsonl_file_path, 'w') as sink:
        # One serialised record per line, each followed by a newline.
        sink.writelines(json.dumps(record) + '\n' for record in data_subset)




In [30]:
# Write each converted split to its output file (dev, then train, then test).
for converted_split, output_path in (
    (converted_dev, dev_fever_output),
    (converted_train, train_fever_output),
    (converted_test, test_fever_output),
):
    save_data_as_jsonl(converted_split, output_path)