In [13]:
pip install transformers==4.20.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import pandas as pd
import numpy as np
import math 
from sklearn.model_selection import train_test_split
import torch
import torch.nn
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, TensorDataset

import logging
logging.basicConfig(level=logging.ERROR)
from torch.nn.utils.rnn import pad_sequence

In [15]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)# to use gpu if possible

cpu


In [16]:
import json
import networkx as nx

# Read the JSONL file and parse the data
def read_fever_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

# Create the knowledge graph
def create_knowledge_graph(data):
    kg = nx.DiGraph()
    
    for item in data:
        claim_id = item['id']
        claim_text = item['claim']
        label = item['label']
        
        # Add claim node
        kg.add_node(claim_id, label="claim", text=claim_text)
        
        if label != "NOT ENOUGH INFO":
            for evidence_group in item['evidence']:
                for evidence in evidence_group:
                    evidence_id = evidence[1]
                    evidence_title = evidence[2]
                    evidence_sentence_num = evidence[3]
                    
                    # Add evidence node
                    kg.add_node(evidence_id, label="evidence", title=evidence_title, sentence_num=evidence_sentence_num)
                    
                    # Add edge between claim and evidence with the relationship label
                    kg.add_edge(claim_id, evidence_id, label=label)
    
    return kg

file_path = "/content/drive/MyDrive/NAM/train.jsonl"
data = read_fever_jsonl(file_path)
knowledge_graph = create_knowledge_graph(data)

In [17]:
print("Nodes in the knowledge graph:", knowledge_graph.number_of_nodes())
print("Edges in the knowledge graph:", knowledge_graph.number_of_edges())

Nodes in the knowledge graph: 268910
Edges in the knowledge graph: 221476


In [18]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
def find_top_k_evidences(kg, claim_id, k=5):
    edges = [(evidence_id, data['label']) for _, evidence_id, data in kg.out_edges(claim_id, data=True)]
    evidences = sorted(edges, key=lambda x: x[1], reverse=True)[:k]
    return evidences

In [21]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def prepare_dataset(kg):
    dataset = []
    for claim_id, data in kg.nodes(data=True):
        if data['label'] == 'claim':
            claim_text = preprocess_text(data['text'])
            evidences = find_top_k_evidences(kg, claim_id)
            for evidence_id, relationship in evidences:
                evidence_data = kg.nodes[evidence_id]
                evidence_text = preprocess_text(evidence_data['title'])
                dataset.append((claim_text, evidence_text, relationship))
    return dataset

dataset = prepare_dataset(knowledge_graph)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
def extract_features(dataset):
    vectorizer = TfidfVectorizer()
    claims, evidences, labels = zip(*dataset)
    claim_features = vectorizer.fit_transform(claims)
    evidence_features = vectorizer.transform(evidences)
    return claim_features, evidence_features, labels, vectorizer

claim_features, evidence_features, labels, vectorizer = extract_features(dataset)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

X = hstack([claim_features, evidence_features])
y = [1 if label == "SUPPORTS" else 0 for label in labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.8110532407407407
Precision: 0.8108741504569955
Recall: 0.9699484189280108
F1 Score: 0.8833065277884148
