In [None]:
# /content/drive/MyDrive/Capstone/
# ├─ data/
# │  ├─ hindi dataset.xlsx        # your original file (you can move it here)
# │  ├─ dataset.csv               # cleaned CSV (auto)
# │  ├─ balanced.csv              # sampled + balanced CSV (auto)
# │  ├─ train.csv, val.csv, test.csv
# ├─ images/                      # placeholder or downloaded images
# ├─ models/                      # saved model weights
# ├─ outputs/                     # confusion matrices, plots, predictions
# ├─ notebooks/                   # final notebook export (optional)


In [1]:
# Colab cell: CHUNK 0
# Mount Drive, install libs, create folders
from google.colab import drive
drive.mount('/content/drive')

ROOT_DIR = "/content/drive/MyDrive/Capstone"
import os
os.makedirs(ROOT_DIR, exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, "data"), exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, "images"), exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, "models"), exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, "outputs"), exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, "notebooks"), exist_ok=True)

# Install packages
!pip install -q transformers datasets sentencepiece
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install -q tqdm pandas scikit-learn matplotlib nltk pillow

print("Drive mounted and folders created at:", ROOT_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted and folders created at: /content/drive/MyDrive/Capstone


In [3]:
# Colab cell: CHUNK 1 - place the file into Drive data folder (if it's currently in /content)
import shutil, os
#SRC = "/content/capstone/hindi dataset.xlsx"  # comment out if already in Drive
DST = "/content/drive/MyDrive/Capstone/data/hindi dataset.xlsx"
# if os.path.exists(SRC) and not os.path.exists(DST):
#     shutil.copy(SRC, DST)
#     print("Copied to Drive:", DST)
# else:
print("Make sure your excel is at:", DST)
# Inspect
import pandas as pd
df = pd.read_excel(DST, engine="openpyxl")
print("Columns:", df.columns.tolist())
display(df.head(3))


Make sure your excel is at: /content/drive/MyDrive/Capstone/data/hindi dataset.xlsx
Columns: ['Statement', 'Label', 'Link', 'Web', 'Category', 'Date']


Unnamed: 0,Statement,Label,Link,Web,Category,Date
0,Manufacturing PMI: मार्च में देश की विनिर्माण ...,True,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4
1,रिजर्व बैंक की मौद्रिक नीति समिति की बैठक आज स...,True,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4
2,Gold Price Today: सोने के वायदा भाव में गिरावट...,True,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4


In [4]:
# Colab cell: CHUNK 2 - clean and save dataset.csv
import pandas as pd, os
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
xls = os.path.join(DATA_DIR, "hindi dataset.xlsx")
df = pd.read_excel(xls, engine="openpyxl")

df = df.rename(columns={
    "Statement": "text",
    "Link": "image_link",
    "Label": "label"
})

# Normalize label to 0/1: true/false etc.
df['label'] = df['label'].astype(str).str.lower().map({"true":1,"false":0,"1":1,"0":0})
# Drop rows with missing text
df = df.dropna(subset=['text']).reset_index(drop=True)
# Save
csv_out = os.path.join(DATA_DIR, "dataset.csv")
df.to_csv(csv_out, index=False)
print("Saved cleaned dataset to", csv_out)
print(df['label'].value_counts(dropna=False))
df.head(3)


Saved cleaned dataset to /content/drive/MyDrive/Capstone/data/dataset.csv
label
1.0    21389
NaN     9663
Name: count, dtype: int64


Unnamed: 0,text,label,image_link,Web,Category,Date
0,Manufacturing PMI: मार्च में देश की विनिर्माण ...,1.0,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4
1,रिजर्व बैंक की मौद्रिक नीति समिति की बैठक आज स...,1.0,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4
2,Gold Price Today: सोने के वायदा भाव में गिरावट...,1.0,https://www.jagranimages.com/images/newimg/050...,jagran,BUSINESS,2021/4


In [5]:
# Colab cell: CHUNK 3 - sampling & create synthetic fake examples
import pandas as pd, os, random
from sklearn.utils import resample
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
csv_in = os.path.join(DATA_DIR, "dataset.csv")
df = pd.read_csv(csv_in)

# Parameters
TOTAL_SAMPLES = 1000   # total after balancing
HALF = TOTAL_SAMPLES // 2

# keep only rows with valid labels (if some label NaN, treat as real for selection)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# take upto HALF real samples (if not enough, take all)
real_df = df[df['label']==1]
if len(real_df) >= HALF:
    real_sample = real_df.sample(HALF, random_state=42).reset_index(drop=True)
else:
    real_sample = real_df.sample(HALF, replace=True, random_state=42).reset_index(drop=True)

# create synthetic fakes via simple augmentation:
def make_fake_text(s):
    # simple augmentation: shuffle words, append spammy token
    import random
    words = str(s).split()
    random.shuffle(words)
    spam = random.choice(["यह खुलासा", "आपको आज", "Breaking", "Exclusive", "क्लिक करें"])
    return " ".join(words[:max(3,len(words)//2)]) + " " + spam

fake_texts = []
fake_rows = []
# sample random real rows and augment text to create fake
src_for_fake = df.sample(HALF, replace=True, random_state=1)
for i, row in src_for_fake.iterrows():
    new_text = make_fake_text(row['text'])
    fake_rows.append({
        'text': new_text,
        'image_link': row.get('image_link', ''),
        'label': 0
    })
fake_df = pd.DataFrame(fake_rows)

# Combine and shuffle
balanced = pd.concat([real_sample[['text','image_link','label']], fake_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
balanced_path = os.path.join(DATA_DIR, "balanced.csv")
balanced.to_csv(balanced_path, index=False)
print("Balanced dataset saved:", balanced_path)
print(balanced['label'].value_counts())
balanced.head(3)


Balanced dataset saved: /content/drive/MyDrive/Capstone/data/balanced.csv
label
0    500
1    500
Name: count, dtype: int64


Unnamed: 0,text,image_link,label
0,"आतंकी कुछ जारी छिपे एनकाउंटर के आशंका, एनकाउंट...",https://images.bhaskarassets.com/thumb/191x143...,0
1,"लोगों होगा मुश्किल राशिफल: योजना शनिवार रहेगा,...",https://images.bhaskarassets.com/thumb/191x143...,0
2,"चांदी सोने Feb: की आई गिरावट, Rate 3 आपको आज",https://www.jagranimages.com/images/newimg/030...,0


In [6]:
# Colab cell: CHUNK 4 - create placeholder images and attach image_path
from PIL import Image, ImageDraw, ImageFont
import os, random, pandas as pd
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
IMG_DIR = os.path.join(ROOT, "images")
os.makedirs(IMG_DIR, exist_ok=True)

balanced = pd.read_csv(os.path.join(DATA_DIR, "balanced.csv"))

def make_placeholder(idx):
    # create a simple colored placeholder image with index text
    w,h = 224,224
    img = Image.new("RGB", (w,h), color=(random.randint(64,220),random.randint(64,220),random.randint(64,220)))
    d = ImageDraw.Draw(img)
    txt = str(idx)
    # draw small index
    d.text((10,10), txt, fill=(20,20,20))
    path = os.path.join(IMG_DIR, f"placeholder_{idx}.jpg")
    img.save(path, quality=85)
    return path

paths = []
for i in range(len(balanced)):
    p = make_placeholder(i)
    paths.append(p)

balanced['image_path'] = paths
balanced.to_csv(os.path.join(DATA_DIR, "balanced_with_images.csv"), index=False)
print("Created", len(paths), "placeholder images at", IMG_DIR)
balanced.head(3)


Created 1000 placeholder images at /content/drive/MyDrive/Capstone/images


Unnamed: 0,text,image_link,label,image_path
0,"आतंकी कुछ जारी छिपे एनकाउंटर के आशंका, एनकाउंट...",https://images.bhaskarassets.com/thumb/191x143...,0,/content/drive/MyDrive/Capstone/images/placeho...
1,"लोगों होगा मुश्किल राशिफल: योजना शनिवार रहेगा,...",https://images.bhaskarassets.com/thumb/191x143...,0,/content/drive/MyDrive/Capstone/images/placeho...
2,"चांदी सोने Feb: की आई गिरावट, Rate 3 आपको आज",https://www.jagranimages.com/images/newimg/030...,0,/content/drive/MyDrive/Capstone/images/placeho...


In [7]:
# Colab cell: CHUNK 5 - split dataset
import pandas as pd, os
from sklearn.model_selection import train_test_split
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")

df = pd.read_csv(os.path.join(DATA_DIR, "balanced_with_images.csv"))
train, temp = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

train.to_csv(os.path.join(DATA_DIR, "train.csv"), index=False)
val.to_csv(os.path.join(DATA_DIR, "val.csv"), index=False)
test.to_csv(os.path.join(DATA_DIR, "test.csv"), index=False)
print("Sizes -> train:", len(train), "val:", len(val), "test:", len(test))


Sizes -> train: 800 val: 100 test: 100


In [8]:
# Colab cell: CHUNK 6A - dataset & preprocessing
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
import os
import nltk
nltk.download('punkt', quiet=True)
from transformers import AutoTokenizer

ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
TOKENIZER_NAME = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

class MultimodalDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_length=64, transform=None):
        self.df = pd.read_csv(csv_path).reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform
        if self.transform is None:
            from torchvision import transforms
            self.transform = transforms.Compose([
                transforms.Resize((224,224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
            ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row['text'])
        img_path = row['image_path']
        label = float(row['label'])
        # tokenize
        enc = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = enc['input_ids'].squeeze(0)
        attn_mask = enc['attention_mask'].squeeze(0)
        # image
        try:
            img = Image.open(img_path).convert('RGB')
        except:
            img = Image.new("RGB", (224,224), color=(200,200,200))
        img = self.transform(img)
        return img, input_ids, attn_mask, torch.tensor(label).float()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [9]:
# Colab cell: CHUNK 6B - model (DistilBERT text encoder + ResNet18 image encoder + fusion)
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import AutoModel

class MultimodalModel(nn.Module):
    def __init__(self, text_model_name="distilbert-base-multilingual-cased", freeze_text=True, freeze_image=True):
        super().__init__()
        # text encoder
        self.text_model = AutoModel.from_pretrained(text_model_name)
        if freeze_text:
            for p in self.text_model.parameters():
                p.requires_grad = False
        text_dim = self.text_model.config.hidden_size  # typically 768 for DistilBERT

        # image encoder
        resnet = models.resnet18(pretrained=True)
        modules = list(resnet.children())[:-1]  # remove fc
        self.image_backbone = nn.Sequential(*modules)
        if freeze_image:
            for p in self.image_backbone.parameters():
                p.requires_grad = False
        img_dim = resnet.fc.in_features  # 512

        # fusion classifier
        fusion_dim = text_dim + img_dim
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, img, input_ids, attn_mask):
        # image forward
        img_feat = self.image_backbone(img)  # B x feat x 1 x 1
        img_feat = img_feat.view(img_feat.size(0), -1)  # B x img_dim

        # text forward (DistilBERT returns last_hidden_state)
        t_out = self.text_model(input_ids=input_ids, attention_mask=attn_mask)
        # use [CLS]-like pooling: mean of token embeddings masked
        last_hidden = t_out.last_hidden_state  # B x L x hidden
        mask = attn_mask.unsqueeze(-1)
        sum_hidden = (last_hidden * mask).sum(1)
        lengths = mask.sum(1).clamp(min=1)
        text_feat = sum_hidden / lengths  # B x text_dim

        fused = torch.cat([img_feat, text_feat], dim=1)
        logits = self.classifier(fused).squeeze(1)
        return logits


In [10]:
# Colab cell: CHUNK 7 - training
import torch, os, numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
OUT_DIR = os.path.join(ROOT, "outputs")
MODEL_DIR = os.path.join(ROOT, "models")
os.makedirs(OUT_DIR, exist_ok=True); os.makedirs(MODEL_DIR, exist_ok=True)

# Config
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
VAL_CSV   = os.path.join(DATA_DIR, "val.csv")
TEST_CSV  = os.path.join(DATA_DIR, "test.csv")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 8
EPOCHS = 3
LR = 1e-4

# Datasets & loaders
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
train_ds = MultimodalDataset(TRAIN_CSV, tokenizer, max_length=64)
val_ds = MultimodalDataset(VAL_CSV, tokenizer, max_length=64)
test_ds = MultimodalDataset(TEST_CSV, tokenizer, max_length=64)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Model
model = MultimodalModel().to(DEVICE)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)

best_val_loss = 1e9
for epoch in range(1, EPOCHS+1):
    model.train()
    train_losses=[]
    for imgs, input_ids, attn_mask, labels in train_loader:
        imgs = imgs.to(DEVICE); input_ids = input_ids.to(DEVICE); attn_mask = attn_mask.to(DEVICE); labels = labels.to(DEVICE)
        logits = model(imgs, input_ids, attn_mask)
        loss = criterion(logits, labels)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        train_losses.append(loss.item())
    avg_train = np.mean(train_losses)

    # validation
    model.eval()
    all_preds=[]; all_labels=[]; val_losses=[]
    with torch.no_grad():
        for imgs, input_ids, attn_mask, labels in val_loader:
            imgs = imgs.to(DEVICE); input_ids = input_ids.to(DEVICE); attn_mask = attn_mask.to(DEVICE); labels = labels.to(DEVICE)
            logits = model(imgs, input_ids, attn_mask)
            loss = criterion(logits, labels)
            val_losses.append(loss.item())
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            all_preds.extend(preds.tolist()); all_labels.extend(labels.cpu().numpy().astype(int).tolist())
    avg_val = np.mean(val_losses)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch}: TrainLoss {avg_train:.4f} ValLoss {avg_val:.4f} ValAcc {acc:.4f}")
    print(classification_report(all_labels, all_preds, digits=4))
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(4,3)); sns.heatmap(cm, annot=True, fmt='d'); plt.title(f"CM epoch{epoch}"); plt.savefig(os.path.join(OUT_DIR, f"cm_epoch_{epoch}.png")); plt.close()
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, f"model_epoch_{epoch}.pth"))
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        torch.save(model.state_dict(), os.path.join(MODEL_DIR, "best_model.pth"))
        print("Saved best model.")


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:02<00:00, 22.3MB/s]


Epoch 1: TrainLoss 0.6837 ValLoss 0.6526 ValAcc 0.6000
              precision    recall  f1-score   support

           0     0.5641    0.8800    0.6875        50
           1     0.7273    0.3200    0.4444        50

    accuracy                         0.6000       100
   macro avg     0.6457    0.6000    0.5660       100
weighted avg     0.6457    0.6000    0.5660       100

Saved best model.
Epoch 2: TrainLoss 0.6089 ValLoss 0.6300 ValAcc 0.6100
              precision    recall  f1-score   support

           0     1.0000    0.2200    0.3607        50
           1     0.5618    1.0000    0.7194        50

    accuracy                         0.6100       100
   macro avg     0.7809    0.6100    0.5400       100
weighted avg     0.7809    0.6100    0.5400       100

Saved best model.
Epoch 3: TrainLoss 0.5736 ValLoss 0.5207 ValAcc 0.8000
              precision    recall  f1-score   support

           0     0.8261    0.7600    0.7917        50
           1     0.7778    0.8400   

In [12]:
# Colab cell: CHUNK 8 - Final report + visualization
import torch, os, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# --- Paths ---
ROOT = "/content/drive/MyDrive/Capstone"
DATA_DIR = os.path.join(ROOT, "data")
MODEL_DIR = os.path.join(ROOT, "models")
OUT_DIR = os.path.join(ROOT, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# --- Load best model ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalModel().to(DEVICE)
best_model_path = os.path.join(MODEL_DIR, "best_model.pth")
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
model.eval()

# --- Prepare test data ---
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
test_csv = os.path.join(DATA_DIR, "test.csv")
test_df = pd.read_csv(test_csv)
test_ds = MultimodalDataset(test_csv, tokenizer, max_length=64)
loader = DataLoader(test_ds, batch_size=8, shuffle=False)

# --- Inference ---
all_preds, all_probs, all_labels = [], [], []
with torch.no_grad():
    for imgs, input_ids, attn_mask, labels in loader:
        imgs, input_ids, attn_mask = imgs.to(DEVICE), input_ids.to(DEVICE), attn_mask.to(DEVICE)
        logits = model(imgs, input_ids, attn_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs >= 0.5).astype(int)
        all_probs.extend(probs.tolist())
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.numpy().astype(int).tolist())

# --- Metrics ---
acc  = accuracy_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds, zero_division=0)
rec  = recall_score(all_labels, all_preds, zero_division=0)
f1   = f1_score(all_labels, all_preds, zero_division=0)
auc  = roc_auc_score(all_labels, all_probs)

print("📊 Final Evaluation Metrics")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {auc:.4f}\n")
print(classification_report(all_labels, all_preds, digits=4))

# --- Confusion Matrix ---
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"))
plt.close()

# --- ROC Curve ---
fpr, tpr, _ = roc_curve(all_labels, all_probs)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"ROC (AUC={auc:.2f})")
plt.plot([0,1],[0,1],'--',color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "roc_curve.png"))
plt.close()

# --- Save detailed predictions ---
out = test_df.copy()
out["probability"] = np.round(all_probs, 3)
out["prediction"]  = all_preds
out.to_csv(os.path.join(OUT_DIR, "test_predictions.csv"), index=False)

# --- Generate Architecture Diagram ---
import matplotlib.patches as mpatches
fig, ax = plt.subplots(figsize=(8,4))
ax.axis("off")

# Draw blocks
def block(x, y, w, h, label, color):
    rect = mpatches.FancyBboxPatch((x, y), w, h,
                                   boxstyle="round,pad=0.02", fc=color, ec="black")
    ax.add_patch(rect)
    ax.text(x+w/2, y+h/2, label, ha="center", va="center", fontsize=10, color="black")

block(0.1, 0.6, 0.25, 0.2, "Text\n(DistilBERT)", "#b3d9ff")
block(0.1, 0.2, 0.25, 0.2, "Image\n(ResNet18)", "#ffc266")
block(0.45, 0.4, 0.2, 0.2, "Feature Fusion\n(Concat)", "#d9b3ff")
block(0.75, 0.4, 0.15, 0.2, "Classifier\n(Dense+Sigmoid)", "#a3ffb3")

ax.arrow(0.35,0.7,0.1,-0.2,head_width=0.02,fc="k",ec="k")
ax.arrow(0.35,0.3,0.1,0.2,head_width=0.02,fc="k",ec="k")
ax.arrow(0.65,0.5,0.08,0,head_width=0.02,fc="k",ec="k")
ax.text(0.9,0.52,"Prediction\nReal / Fake",fontsize=9,va="center")
plt.title("Multimodal Fake News Detector Architecture", fontsize=12, pad=20)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "architecture_diagram.png"))
plt.close()

# --- Generate quick text summary report ---
report_txt = f"""
Fake News Detection - Evaluation Report
=======================================
Total Samples : {len(all_labels)}
Accuracy       : {acc:.4f}
Precision      : {prec:.4f}
Recall         : {rec:.4f}
F1 Score       : {f1:.4f}
ROC-AUC        : {auc:.4f}

Files generated:
 - confusion_matrix.png
 - roc_curve.png
 - architecture_diagram.png
 - test_predictions.csv
 - best_model.pth (trained weights)

Interpretation:
Model uses DistilBERT for Hindi/English text embeddings and ResNet18 for image features.
Both feature vectors are concatenated and passed through dense layers to output a
probability of "Real (1)" or "Fake (0)". The above metrics and plots summarize performance.

"""

with open(os.path.join(OUT_DIR, "report_summary.txt"), "w", encoding="utf-8") as f:
    f.write(report_txt)

print("✅ Report complete!  All outputs saved to:", OUT_DIR)
for file in ["confusion_matrix.png","roc_curve.png","architecture_diagram.png","report_summary.txt"]:
    print(" -", file)




📊 Final Evaluation Metrics
Accuracy : 0.8400
Precision: 0.8542
Recall   : 0.8200
F1-score : 0.8367
ROC-AUC  : 0.9580

              precision    recall  f1-score   support

           0     0.8269    0.8600    0.8431        50
           1     0.8542    0.8200    0.8367        50

    accuracy                         0.8400       100
   macro avg     0.8405    0.8400    0.8399       100
weighted avg     0.8405    0.8400    0.8399       100

✅ Report complete!  All outputs saved to: /content/drive/MyDrive/Capstone/outputs
 - confusion_matrix.png
 - roc_curve.png
 - architecture_diagram.png
 - report_summary.txt
