In [1]:
# Install the training stack (transformers, datasets, accelerate, evaluate, scikit-learn, pandas, sentencepiece).
!pip install -q "transformers>=4.41" datasets accelerate evaluate scikit-learn pandas sentencepiece
# Second pass with --upgrade for the core Hugging Face packages.
# NOTE(review): no versions are pinned, so a re-run may pull different releases — consider pinning.
!pip install -q --upgrade "transformers>=4.41" accelerate datasets evaluate
# Extras imported by the Gradio demo cell (gradio, wikipedia, vaderSentiment, requests).
!pip install -q gradio wikipedia vaderSentiment requests

from IPython.display import clear_output
# Clears this cell's output, which also removes any pip warnings or errors printed above.
clear_output()
# NOTE(review): printed unconditionally — it does not verify that the installs succeeded.
print("All libraries installed ✅")

All libraries installed ✅


In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used by the data-loading cell live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from datasets import Dataset, ClassLabel, Value

# Source CSVs on the mounted Drive: one file per class.
FAKE_PATH = '/content/drive/MyDrive/NewsFakeCOVID-19_5.csv'
REAL_PATH = '/content/drive/MyDrive/NewsRealCOVID-19_5.csv'

fake_df = pd.read_csv(FAKE_PATH)
real_df = pd.read_csv(REAL_PATH)

fake_df['label'] = 0  # 0 = Fake
real_df['label'] = 1  # 1 = Real

df = pd.concat([fake_df, real_df], ignore_index=True)

# These files store the article body under 'content'; standardise on 'text'.
if 'text' not in df.columns and 'content' in df.columns:
    df = df.rename(columns={'content': 'text'})
# Fail early with a readable message instead of an opaque KeyError from dropna().
if 'text' not in df.columns:
    raise KeyError(
        f"Expected a 'text' or 'content' column in the CSVs, found: {df.columns.tolist()}"
    )

# Basic cleanups: drop rows lacking text/label, then shuffle reproducibly.
# Chained (no inplace=True) so each step yields a new frame.
df = (
    df.dropna(subset=['text', 'label'])
      .sample(frac=1.0, random_state=42)
      .reset_index(drop=True)
)

print("Columns:", df.columns.tolist())
print("Counts -> Fake(0):", (df['label'] == 0).sum(), "Real(1):", (df['label'] == 1).sum())
print("Total rows:", len(df))

# Hugging Face Dataset + split (only the two columns the model needs).
dataset = Dataset.from_pandas(df[['text', 'label']])

# Cast 'label' to ClassLabel: train_test_split can only stratify on a ClassLabel column.
features = dataset.features.copy()
features['label'] = ClassLabel(num_classes=2, names=['Fake', 'Real'])
dataset = dataset.cast(features)

# 80/20 split that keeps the Fake/Real ratio the same in both parts.
train_test = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column='label')
train_ds, eval_ds = train_test['train'], train_test['test']
print(f"Train: {len(train_ds)}  |  Eval: {len(eval_ds)}")

Columns: ['Unnamed: 0', 'type', 'fact_check_url', 'archieve', 'news_url', 'news_url2', 'news_url3', 'news_url4', 'news_url5', 'title', 'newstitle', 'text', 'abstract', 'publish_date', 'meta_keywords', 'label']
Counts -> Fake(0): 125 Real(1): 1397
Total rows: 1522


Casting the dataset:   0%|          | 0/1522 [00:00<?, ? examples/s]

Train: 1217  |  Eval: 305


In [None]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune (DeBERTa-v3 base):
MODEL_NAME = "microsoft/deberta-v3-base"

# If you want to keep DistilBERT for speed, uncomment the next line:
# MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 256


def tok(batch):
    """Tokenize the 'text' field of a batch, truncating and padding every example to MAX_LEN."""
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
    )


# Tokenize both splits the same way: drop the raw text column and expose torch tensors.
tokenized_train, tokenized_eval = (
    split.map(tok, batched=True, remove_columns=['text']).with_format("torch")
    for split in (train_ds, eval_ds)
)

print("Tokenization complete ✅")

In [None]:
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Labels mapping (label2id is derived from id2label so the two cannot drift apart)
id2label = {0: "Fake", 1: "Real"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained checkpoint for 2-way sequence classification.
# MODEL_NAME is intentionally NOT reassigned here: it must be the same checkpoint
# the tokenizer was built from in the tokenization cell. Re-hard-coding it would
# pair a DistilBERT tokenizer with DeBERTa weights if that cell were switched.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Metrics
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and macro-averaged F1.

    The predicted class of each row is the arg-max over axis 1 of the logits.
    Returns a dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    accuracy_result = acc_metric.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=labels, average="macro")

    scores = {}
    scores["accuracy"] = accuracy_result['accuracy']
    scores["f1"] = f1_result['f1']
    return scores

# TrainingArguments
# `eval_strategy` is the current keyword (transformers >= 4.41, which the install
# cell requires); `evaluation_strategy` is the older, deprecated spelling.
import inspect

training_args = TrainingArguments(
    output_dir="/content/results",
    eval_strategy="steps",
    # load_best_model_at_end needs checkpoints saved on the same schedule as the
    # evaluations, hence save_steps == eval_steps.
    save_steps=200,
    eval_steps=200,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",   # the "f1" key returned by compute_metrics
    greater_is_better=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    report_to="none",
    seed=42,
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate `tokenizer=`. The install cell does not pin a version, so pick
# whichever keyword the installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Trainer (tokenized_train / tokenized_eval / tokenizer come from the tokenization cell)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Training
train_result = trainer.train()
print("Training finished ✅")


In [None]:
from sklearn.metrics import classification_report
# Run the trained model over the held-out split and print a per-class report.
preds = trainer.predict(tokenized_eval)
y_true, y_pred = preds.label_ids, np.argmax(preds.predictions, axis=1)
report = classification_report(y_true, y_pred, target_names=["Fake","Real"])
print(report)

In [None]:
import os

# Where the fine-tuned model and its tokenizer are written
SAVE_DIR = "/content/final_model"
os.makedirs(SAVE_DIR, exist_ok=True)

# Persist weights/config via the Trainer, then the tokenizer files next to them
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# Quick sanity check: list the directory once and reuse the listing below
saved_files = os.listdir(SAVE_DIR)
print("Saved files:", saved_files)

# Weights may be a pytorch_model* file or a *.safetensors file
has_weights = any(
    name.startswith("pytorch_model") or name.endswith(".safetensors")
    for name in saved_files
)
assert has_weights, "❌ Model weights missing!"

assert "config.json" in saved_files, "❌ config.json missing!"

# Either a fast-tokenizer JSON or the SentencePiece model counts as "tokenizer present"
has_tokenizer = "tokenizer.json" in saved_files or "spm.model" in saved_files
assert has_tokenizer, "❌ Tokenizer files missing!"

print("✅ Model AND tokenizer saved to:", SAVE_DIR)


In [None]:
import gradio as gr
from transformers import pipeline
import wikipedia
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import requests
import re

# ----------------------------
# 1. NewsAPI Key
# ----------------------------
# Never commit the key to the notebook: this app is launched with a public share
# link. Read it from the environment instead, e.g. run
#   os.environ["NEWSAPI_KEY"] = "<your key>"
# in a private cell before this one. When the variable is unset the key is an
# empty string and the live-news lookup reports "not configured".
import os

NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY", "").strip()

# ----------------------------
# 2. Load your trained model
# ----------------------------
# Same directory the save cell writes the fine-tuned model and tokenizer to.
SAVE_DIR = "/content/final_model"
# Text-classification pipeline backed by the saved model; analyze_news reads
# the first element of its result and uses that element's "label" and "score".
classifier = pipeline(
    "text-classification",
    model=SAVE_DIR,
    tokenizer=SAVE_DIR,
    # NOTE(review): reported as deprecated in recent transformers in favour of
    # top_k — confirm before changing, since analyze_news depends on the return shape.
    return_all_scores=False,
    device=-1  # -1 = CPU; set to 0 if GPU available
)

# ----------------------------
# 3. Sentiment Analyzer
# ----------------------------
# VADER analyzer; analyze_news calls polarity_scores() on it and reads the "compound" score.
analyzer = SentimentIntensityAnalyzer()

# ----------------------------
# 4. Helper: Extract keywords
# ----------------------------
def extract_keywords(text, top_n=3):
    """Build a short search query from the first non-stopword tokens of ``text``.

    Tokens are runs of word characters. A token is skipped when its lowercase
    form is in the small stopword set below. The first ``top_n`` survivors are
    joined with single spaces. If nothing survives, ``text`` is returned as is.
    """
    ignored = {
        "the", "is", "in", "on", "of", "for", "a", "an", "to", "and",
        "with", "show", "study", "results"
    }
    kept = []
    for token in re.findall(r"\w+", text):
        if token.lower() in ignored:
            continue
        kept.append(token)

    if not kept:
        return text
    return " ".join(kept[:top_n])

# ----------------------------
# 5. Wikipedia Snippet
# ----------------------------
def get_wiki_snippet(q):
    """Return a three-sentence Wikipedia summary for ``q``, or a fallback message.

    Queries of six or more whitespace-separated words are cut to their first six
    words; shorter queries are sent unchanged. A disambiguation error yields a
    message listing up to five options; any other failure yields a generic
    "not found" message.
    """
    try:
        tokens = q.split()
        if len(tokens) >= 6:
            lookup = " ".join(tokens[:6])
        else:
            lookup = q
        summary = wikipedia.summary(lookup, sentences=3)
    except wikipedia.exceptions.DisambiguationError as ambiguous:
        candidates = ', '.join(ambiguous.options[:5])
        return f"Ambiguous term: refine your query. Options: {candidates} ..."
    except Exception:
        return "No relevant Wikipedia page found."
    return summary

# ----------------------------
# 6. Live News API Fetch
# ----------------------------
def get_live_api_examples(q):
    """Return up to three recent English headlines matching ``q`` from NewsAPI.

    Args:
        q: Search query sent to the ``/v2/everything`` endpoint.

    Returns:
        A Markdown bullet list ("- title (source)"), or a short status message
        when the key is missing, the API reports an error, nothing is found, or
        the request fails. This function never raises.
    """
    if not NEWSAPI_KEY:
        return "⚠️ Live API not configured."
    try:
        r = requests.get(
            "https://newsapi.org/v2/everything",
            params={
                "q": q,
                "language": "en",
                "pageSize": 3,
                "sortBy": "publishedAt",
            },
            # Send the key as a header rather than a query parameter so it never
            # becomes part of the request URL (and of anything that echoes it).
            headers={"X-Api-Key": NEWSAPI_KEY},
            timeout=10,
        )
        data = r.json()
        if data.get("status") != "ok":
            return f"Live API error: {data.get('message','unknown error')}"
        items = []
        # `or` fallbacks: dict.get() defaults do not apply when the JSON value
        # is an explicit null, which would otherwise break or print "None".
        for art in data.get("articles") or []:
            title = art.get("title") or "(no title)"
            source_name = (art.get("source") or {}).get("name") or ""
            items.append(f"- {title} ({source_name})")
        return "\n".join(items) if items else "No recent related articles found."
    except Exception as e:
        # Show only the exception type: the full message can contain request
        # details and this text is rendered in a publicly shared UI.
        return f"Live API request failed: {type(e).__name__}"

# ----------------------------
# 7. Main Analyzer Function
# ----------------------------
def _sentiment_label(compound):
    """Map a VADER compound score to "Positive", "Negative" or "Neutral" (cut-offs at ±0.05)."""
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"


def analyze_news(headline):
    """Run the full analysis for one headline and return it as Markdown.

    Combines the Fake/Real prediction, the VADER sentiment, a Wikipedia snippet
    and related live headlines into a single report.

    Args:
        headline: Text typed into the UI; ``None`` or blank input is rejected.

    Returns:
        A Markdown string, or "Please enter a headline." for empty input.
    """
    headline = (headline or "").strip()
    if not headline:
        return "Please enter a headline."

    # 1) Fake/Real Prediction
    # Truncate to the same 256-token limit used when tokenizing the training
    # data, so pasted long text is handled the way the model was trained.
    pred = classifier(headline, truncation=True, max_length=256)[0]
    label = pred["label"]
    conf = float(pred["score"])

    # 2) Sentiment
    s = analyzer.polarity_scores(headline)
    sentiment_label = _sentiment_label(s["compound"])

    # 3) Wikipedia snippet
    wiki = get_wiki_snippet(headline)

    # 4) Live API (searched with a short keyword query, not the full headline)
    search_query = extract_keywords(headline, top_n=3)
    live_info = get_live_api_examples(search_query)

    # Build Markdown Output
    md = [
        "### 🔍 Fake News Detection",
        f"**Prediction**: **{label}**  |  **Confidence**: {conf:.2f}",
        "",
        "### 🙂 Sentiment Analysis",
        f"**Sentiment**: **{sentiment_label}**  |  Scores: {s}",
        "",
        "### 📖 Wikipedia Snippet",
        wiki,
        "",
        f"### 📰 Live News (Search: `{search_query}`)",
        live_info,
    ]
    return "\n".join(md)

# ----------------------------
# 8. Gradio UI
# ----------------------------
# Single-input UI: a 3-line text box feeds analyze_news, whose Markdown report is rendered as the output.
iface = gr.Interface(
    fn=analyze_news,
    inputs=gr.Textbox(lines=3, label="Enter headline"),
    outputs=gr.Markdown(),
    title="Real-Time Fake News Detection (DeBERTa-v3)",
    description="Classifies news as Fake/Real, gives sentiment, shows Wikipedia snippet, and fetches related live news."
)

# NOTE(review): share=True asks Gradio for a publicly reachable link, and debug=True
# is expected to keep this cell blocking while the app runs — confirm both are intended.
iface.launch(share=True, debug=True)


In [None]:
from google.colab import drive
# Same Drive mount as the cell near the top of the notebook (identical mount point).
# NOTE(review): looks like a leftover duplicate — confirm whether this cell is still needed.
drive.mount('/content/drive')

In [None]:
import numpy as np
from transformers import (AutoModelForSequenceClassification, TrainingArguments, Trainer)
import evaluate

# NOTE(review): this cell repeats the fine-tuning cell earlier in the notebook.
# Running it reloads the base checkpoint and trains again from scratch.
import inspect

id2label = {0: "Fake", 1: "Real"}
label2id = {"Fake": 0, "Real": 1}

# MODEL_NAME is taken from the tokenization cell rather than re-hard-coded, so
# the model always matches the checkpoint the tokenizer was built from.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Metrics
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a (logits, labels) pair; predictions are the arg-max over axis 1."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": acc_metric.compute(predictions=preds, references=labels)['accuracy'],
        "f1": f1_metric.compute(predictions=preds, references=labels, average="macro")['f1'],
    }


# TrainingArguments (defined before the Trainer that consumes them).
# `eval_strategy` is the keyword for transformers >= 4.41, which the install
# cell requires; the older `evaluation_strategy` spelling is deprecated.
training_args = TrainingArguments(
    output_dir="/content/results",
    eval_strategy="steps",
    # load_best_model_at_end needs saves on the same schedule as evaluations.
    save_steps=200,
    eval_steps=200,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    report_to="none",
    seed=42,
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate `tokenizer=`; use whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Training
train_result = trainer.train()
print("Training finished ✅")