In [None]:
import json
import os

import pandas as pd
import requests
import spacy
import torch
from datasets import Dataset, load_dataset
from spacy import displacy
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification

In [None]:
# 1️⃣ Download Fake News Dataset (LIAR Dataset)
dataset = load_dataset("liar")

# Convert dataset to pandas for easy processing.
# `Dataset.to_pandas()` is the library's own conversion and avoids iterating
# the split row by row the way `pd.DataFrame(dataset[...])` does.
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()

In [None]:
# 2️⃣ Extract text & veracity labels
def preprocess_articles(df):
    """Reduce a LIAR split to the two fields used downstream.

    Args:
        df: DataFrame with at least a ``statement`` and a ``label`` column.

    Returns:
        A list with one ``{"text": <statement>, "label": <label>}`` dict per
        row, in row order.
    """
    # Walk the two needed columns directly instead of `iterrows()`, which
    # builds a Series for every row and does not preserve column dtypes.
    return [
        {"text": statement, "label": label}
        for statement, label in zip(df["statement"], df["label"])
    ]

# Build the record lists for both splits (train first, then test).
train_data, test_data = (
    preprocess_articles(split_df) for split_df in (df_train, df_test)
)

In [None]:
# 3️⃣ Named Entity Recognition (NER) using spaCy
nlp = spacy.load("en_core_web_sm")  # Open-source NER model

def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and map entity text to entity label.

    Entities are keyed by their surface text, so a string that is recognised
    more than once appears a single time, carrying the label of its last
    occurrence.
    """
    label_by_entity = {}
    for span in nlp(text).ents:
        label_by_entity[span.text] = span.label_
    return label_by_entity

# Example NER extraction on the first training statement
first_record = train_data[0]
sample_text = first_record["text"]
sample_entities = extract_entities(sample_text)
print("NER Entities:", sample_entities)

In [None]:
# 4️⃣ Query Diffbot Knowledge Graph for factual validation
# Read the API token from the DIFFBOT_TOKEN environment variable so a real key
# never has to be saved inside the notebook; the placeholder remains the
# fallback when the variable is not set.
DIFFBOT_TOKEN = os.environ.get("DIFFBOT_TOKEN", "YOUR_DIFFBOT_API_KEY")

def query_diffbot(entity_name, timeout=10):
    """Look up ``entity_name`` through the Diffbot Knowledge Graph enhance endpoint.

    Args:
        entity_name: Entity text to look up (e.g. a name found by NER).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body when the service answers with HTTP 200.
        ``None`` for any other status code, for a network error or timeout,
        and for a body that is not valid JSON.
    """
    try:
        # Passing `params` lets requests URL-encode the values, so entity
        # names containing spaces, `&`, `#` or `+` cannot corrupt the query.
        response = requests.get(
            "https://kg.diffbot.com/kg/v3/enhance",
            params={"token": DIFFBOT_TOKEN, "name": entity_name},
            timeout=timeout,  # without a timeout a stalled request blocks forever
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # Treat transport and decoding failures like a non-200 reply: the
        # lookup is best-effort and callers already skip falsy results.
        return None
    return None

# Fetch factual data for each article
def enrich_with_kg(text):
    """Append Knowledge Graph lookups for every entity found in ``text``.

    Each entity returned by ``extract_entities`` is passed to
    ``query_diffbot``; every truthy result is serialised with ``json.dumps``.
    The return value is ``text``, a single space, and the serialised results
    joined by spaces (so the space is still appended when nothing was found).
    """
    kg_payloads = []
    for entity_name in extract_entities(text):
        kg_record = query_diffbot(entity_name)
        if not kg_record:
            continue
        kg_payloads.append(json.dumps(kg_record))
    return " ".join([text, " ".join(kg_payloads)])

In [None]:
# 5️⃣ Generate article + KG knowledge dataset
def _augment_records(records):
    """Return copies of ``records`` whose ``text`` has been KG-enriched."""
    return [
        {"text": enrich_with_kg(record["text"]), "label": record["label"]}
        for record in records
    ]

train_augmented = _augment_records(train_data)
test_augmented = _augment_records(test_data)

# Convert back to DataFrame
df_train_aug = pd.DataFrame(train_augmented)
df_test_aug = pd.DataFrame(test_augmented)

In [None]:
# 6️⃣ Train Deep Learning Model (DistilBERT for Fake News Detection)
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 6  # LIAR dataset has 6 labels

# Tokenizer and classification head are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

def tokenize_data(examples):
    """Tokenize ``examples["text"]`` with the module-level ``tokenizer``.

    Inputs are truncated to, and padded up to, 512 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Tokenize dataset
# `df_train_aug` / `df_test_aug` are pandas DataFrames: they have no
# `set_format`, and `DataFrame.map` (where it exists) applies a function
# element-wise. Wrap them in a Hugging Face `Dataset` first so `map` adds the
# tokenizer outputs as new columns. `batched=True` hands the tokenizer a list
# of texts per call instead of one text at a time.
train_dataset = Dataset.from_pandas(df_train_aug, preserve_index=False).map(
    tokenize_data, batched=True
)
test_dataset = Dataset.from_pandas(df_test_aug, preserve_index=False).map(
    tokenize_data, batched=True
)

# Convert to PyTorch dataset
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Training Arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./fake_news_model",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model
# Starts the training run on the `trainer` object. The call is left as the
# cell's last expression, so the notebook displays whatever it returns.
trainer.train()

In [None]:
# Save the model and its tokenizer side by side in one directory
export_dir = "./fake_news_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

print("Fake News Detection Model Training Complete! 🚀")