<a href="https://colab.research.google.com/github/rohitdutta2510/Claim-Span-identification-using-LLMs/blob/main/DataProcessing_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the data paths used in this notebook
# all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, get_linear_schedule_with_warmup, pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import ast
from tqdm import tqdm

In [None]:
# Location of the source JSON file under the Drive mount point.
PATH = '/content/drive/MyDrive/MTP/english.json'

In [None]:
# Parse the dataset file into Python objects.
# JSON text is UTF-8 by specification and this data contains non-ASCII characters,
# so the encoding is pinned rather than left to the platform's default text encoding.
with open(PATH, encoding='utf-8') as file:
  data = json.load(file)

In [None]:
# Pull the 'claims' and 'text_tokens' fields out of every loaded entry as two
# parallel lists, then assemble them into a two-column DataFrame.
claims = [entry['claims'] for entry in data]
text_tokens = [entry['text_tokens'] for entry in data]

claim_dict = {'claims': claims, 'tokens': text_tokens}
df = pd.DataFrame(claim_dict)
df.head()

Unnamed: 0,claims,tokens
0,[],"[@Troyen121, @wellcometrust, @BBCHARDtalk, @Je..."
1,"[{'index': 0, 'start': 6, 'end': 17, 'terms': ...","[Vaccine, Myth, Buster, :, Contraindication, :..."
2,"[{'index': 0, 'start': 0, 'end': 26, 'terms': ...","[Turkey, enters, the, history, books, as, one,..."
3,"[{'index': 0, 'start': 5, 'end': 24, 'terms': ...","[It, ’, s, 2022, ., A, cheap, ,, effective, va..."
4,"[{'index': 0, 'start': 8, 'end': 18, 'terms': ...","[@kapekaya, Wow, ,, that, is, some, feat, ,, t..."


In [None]:
# (rows, columns) of the assembled frame.
df.shape

(7999, 2)

In [None]:
# For every row, gather the 'start' and 'end' value of each claim annotation and
# flag whether the row has any claim at all (1) or none (0).
claim_lists = df['claims']

start_indices = [[span['start'] for span in spans] for spans in claim_lists]
end_indices = [[span['end'] for span in spans] for spans in claim_lists]
claim_label = [1 if spans != [] else 0 for spans in claim_lists]

# Attach the derived values as new columns, aligned row by row with the frame.
df['span_start_index'] = start_indices
df['span_end_index'] = end_indices
df['claim_label'] = claim_label

In [None]:
# Preview the frame including the span and label columns.
df.head()

Unnamed: 0,claims,tokens,span_start_index,span_end_index,claim_label
0,[],"[@Troyen121, @wellcometrust, @BBCHARDtalk, @Je...",[],[],0
1,"[{'index': 0, 'start': 6, 'end': 17, 'terms': ...","[Vaccine, Myth, Buster, :, Contraindication, :...","[6, 20]","[17, 32]",1
2,"[{'index': 0, 'start': 0, 'end': 26, 'terms': ...","[Turkey, enters, the, history, books, as, one,...",[0],[26],1
3,"[{'index': 0, 'start': 5, 'end': 24, 'terms': ...","[It, ’, s, 2022, ., A, cheap, ,, effective, va...","[5, 33]","[24, 57]",1
4,"[{'index': 0, 'start': 8, 'end': 18, 'terms': ...","[@kapekaya, Wow, ,, that, is, some, feat, ,, t...",[8],[18],1


In [None]:
# Keep only the rows flagged as containing a claim, then discard the raw
# 'claims' annotation column.
has_claim = df['claim_label'] == 1
df = df.loc[has_claim].drop(columns=['claims'])
df.head()

Unnamed: 0,tokens,span_start_index,span_end_index,claim_label
1,"[Vaccine, Myth, Buster, :, Contraindication, :...","[6, 20]","[17, 32]",1
2,"[Turkey, enters, the, history, books, as, one,...",[0],[26],1
3,"[It, ’, s, 2022, ., A, cheap, ,, effective, va...","[5, 33]","[24, 57]",1
4,"[@kapekaya, Wow, ,, that, is, some, feat, ,, t...",[8],[18],1
5,"[How, so, not, -, fascist, of, Newsome, :, vac...",[32],[45],1


In [None]:
# Destination CSV for the processed frame.
SAVE_PATH = '/content/drive/MyDrive/MTP/claims_processed.csv'

# Written with to_csv defaults.
# NOTE(review): the list-valued columns are presumably stored as their string form,
# so a reader would have to parse them back into lists — confirm against the consumer.
df.to_csv(SAVE_PATH)

In [None]:
# Expand rows that carry several spans into one row per span; rows with at most
# one span are passed through unchanged. Each produced row keeps its source row's
# index label, so labels repeat in the resulting frame.
new_rows = []

for row_label, record in df.iterrows():
    starts = record['span_start_index']
    ends = record['span_end_index']

    # Nothing to split: forward the row as-is.
    if len(starts) <= 1:
        new_rows.append(record)
        continue

    for position in range(len(starts)):
        single_span = record.copy()
        # Spans stay wrapped in one-element lists so the column type is uniform.
        single_span['span_start_index'] = [starts[position]]
        single_span['span_end_index'] = [ends[position]]
        new_rows.append(single_span)

new_df = pd.DataFrame(new_rows)

In [None]:
# Preview the expanded frame (multi-span rows split into single-span rows).
new_df.head()

Unnamed: 0,tokens,span_start_index,span_end_index,claim_label
1,"[Vaccine, Myth, Buster, :, Contraindication, :...",[6],[17],1
1,"[Vaccine, Myth, Buster, :, Contraindication, :...",[20],[32],1
2,"[Turkey, enters, the, history, books, as, one,...",[0],[26],1
3,"[It, ’, s, 2022, ., A, cheap, ,, effective, va...",[5],[24],1
3,"[It, ’, s, 2022, ., A, cheap, ,, effective, va...",[33],[57],1


In [None]:
# Disabled alternative: keep only the rows that carry exactly one claim span.
# df = df[df['span_start_index'].apply(len) == 1]
# df.head()

In [None]:
# Disabled alternative: stack df and new_df into a single frame with a fresh index.
# merged_df = pd.concat([df, new_df], ignore_index=True)
# merged_df.head()

In [None]:
import os

from sklearn.model_selection import train_test_split

# Seed shared by both split calls so the partition is reproducible.
SPLIT_SEED = 42

# Two-stage split: 70% train, then the remaining 30% is halved into test and
# validation (about 15% each). The intermediate 30% gets its own name so that
# `test` only ever refers to the final test split.
# NOTE(review): the split is row-wise, and new_df can hold several rows built from
# the same source row (one per span), so rows with identical tokens may end up in
# different splits — confirm whether a group-aware split is wanted.
train, holdout = train_test_split(new_df, test_size=0.3, random_state=SPLIT_SEED)
test, val = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

TRAIN_PATH = '/content/drive/MyDrive/MTP/Dataset/claims_processed_train.csv'
TEST_PATH = '/content/drive/MyDrive/MTP/Dataset/claims_processed_test.csv'
VAL_PATH = '/content/drive/MyDrive/MTP/Dataset/claims_processed_val.csv'

# to_csv does not create missing folders; make sure the target folder exists
# so the writes do not fail on a fresh Drive.
for split_path in (TRAIN_PATH, TEST_PATH, VAL_PATH):
    os.makedirs(os.path.dirname(split_path), exist_ok=True)

train.to_csv(TRAIN_PATH)
test.to_csv(TEST_PATH)
val.to_csv(VAL_PATH)

In [None]:
# (rows, columns) of the training split.
train.shape

(5973, 4)

In [None]:
# (rows, columns) of the test split.
test.shape

(1280, 4)

In [None]:
# (rows, columns) of the validation split.
val.shape

(1281, 4)