In [10]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm



In [2]:
fake = pd.read_csv("/content/drive/MyDrive/fake .csv")
true = pd.read_csv("/content/drive/MyDrive/true  .csv")

fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true], axis=0).reset_index(drop=True)


In [3]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [4]:
df["content"] = df["title"].astype(str) + " </s> " + df["text"].astype(str)


In [5]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["content"] = df["content"].apply(clean_text)


In [6]:
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=42
)


In [None]:
MAX_LEN = 512
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [12]:
train_ds = NewsDataset(train_df["content"].tolist(), train_df["label"].tolist())
val_ds   = NewsDataset(val_df["content"].tolist(), val_df["label"].tolist())

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=4)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2
).to(device)

weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train_df["label"]
)

criterion = CrossEntropyLoss(
    weight=torch.tensor(weights, dtype=torch.float).to(device)
)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Train Loss: {total_loss / len(train_loader):.4f}")


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7857/7857 [57:47<00:00,  2.27it/s]


Epoch 1 Train Loss: 0.0098


Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7857/7857 [57:26<00:00,  2.28it/s]


Epoch 2 Train Loss: 0.0052


Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7857/7857 [57:25<00:00,  2.28it/s]

Epoch 3 Train Loss: 0.0054





In [15]:
model.save_pretrained("model/roberta_model")
tokenizer.save_pretrained("model/roberta_model")


('model/roberta_model/tokenizer_config.json',
 'model/roberta_model/special_tokens_map.json',
 'model/roberta_model/vocab.json',
 'model/roberta_model/merges.txt',
 'model/roberta_model/added_tokens.json')

In [16]:
test_ds = NewsDataset(
    test_df["content"].tolist(),
    test_df["label"].tolist()
)

test_loader = DataLoader(
    test_ds,
    batch_size=4,
    shuffle=False   # ‚ùó NEVER shuffle test data
)


In [17]:
from sklearn.metrics import classification_report, roc_auc_score

def test_model(model, dataloader):
    model.eval()

    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            probs = torch.softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())

    return all_labels, all_preds, all_probs


In [18]:
y_true, y_pred, y_prob = test_model(model, test_loader)

print("\n=== UNSEEN TEST DATA RESULTS ===")
print(classification_report(y_true, y_pred))
print("ROC-AUC:", roc_auc_score(y_true, y_prob))



=== UNSEEN TEST DATA RESULTS ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3523
           1       1.00      1.00      1.00      3212

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735

ROC-AUC: 0.9987547583589639


In [19]:
import torch
import re
from transformers import RobertaTokenizer, RobertaForSequenceClassification


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = RobertaTokenizer.from_pretrained("model/roberta_model")
model = RobertaForSequenceClassification.from_pretrained("model/roberta_model")
model.to(device)
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [21]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [27]:
title = "Scientists Announce Discovery of Pill That Instantly Reverses Aging"
text = """A team of international scientists announced on Monday that they have developed a pill capable of instantly reversing the human aging process..."""



In [32]:
title = "City Expands Public Transportation to Reduce Traffic and Emissions"
text = """The city government announced on Wednesday that it will expand its public transportation network as part of a broader plan to reduce traffic congestion and lower carbon emissions..."""


In [37]:
title = "Health Ministry Launches Nationwide Vaccination Campaign Ahead of Flu Season"
text = """The Ministry of Health announced on Monday the launch of a nationwide vaccination campaign aimed at protecting vulnerable populations ahead of the upcoming flu season..."""


In [42]:
title = "Cambodia Boosts Infrastructure and Economic Recovery in 2025"
text = "Cambodia continues to strengthen its economic recovery in 2025 as public investment and regional trade show steady growth. According to recent government statements, infrastructure projects remain a top priority, with ongoing upgrades to national roads, rail links, and port facilities aimed at improving logistics and attracting foreign investment. In Cambodia, construction work on transport corridors connecting major economic zones has helped reduce travel time and support domestic businesses. In the capital, Phnom Penh, authorities reported increased activity in the manufacturing and service sectors, particularly garments, tourism, and digital services. Officials also highlighted efforts to support small and medium-sized enterprises through easier access to financing and vocational training programs. At the same time, the government reaffirmed its commitment to social development by expanding healthcare coverage and improving education quality in rural areas. These combined measures are expected to sustain moderate economic growth while improving living standards nationwide."

In [47]:
title = "German court jails man for drugging, raping and filming wife for years"
text = "A court in the German city of Aachen has sentenced a man to eight-and-a-half years in prison for repeatedly drugging and raping his wife, filming the acts and then posting them online. The man, named only as Fernando P in line with German privacy laws, was found guilty of aggravated rape, grievous bodily harm and violation of personal privacy. The court found the 61-year-old, who is originally from Spain, guilty of sedating and raping his wife at their home for a period of several years from 2018 to 2024. A spokesperson for the court, Katharina Effert, said the man also filmed the abuse and shared it online."

In [53]:
title = "Thailand Announces Free AI Chips for All Citizens"
text = "The government of Thailand has announced a groundbreaking initiative to distribute free artificial intelligence microchips to every citizen by the end of this year, officials claimed on social media yesterday. According to the announcement, the chips will be implanted in smartphones to instantly boost internet speed, eliminate online scams, and automatically translate all foreign languages in real time. Authorities in Bangkok reportedly stated that the project will transform Thailand into the world‚Äôs first fully ‚ÄúAI-powered nation,‚Äù increasing productivity by 300 percent. The plan also claims that traffic congestion will disappear within weeks due to AI-controlled roads and vehicles. Despite widespread excitement online, no official documents, budget details, or international partners have been confirmed. Technology experts have questioned the feasibility of the project, noting that such technology does not currently exist at a national scale."

In [54]:
content = title + " </s> " + text
content = clean_text(content)


In [55]:
inputs = tokenizer(
    content,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

inputs = {k: v.to(device) for k, v in inputs.items()}


In [56]:
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)[0]


In [57]:
label = torch.argmax(probs).item()
confidence = probs[label].item()

if label == 0:
    print("üü• Prediction: FAKE news")
else:
    print("üü© Prediction: REAL news")

print(f"Confidence: {confidence * 100:.2f}%")


üü• Prediction: FAKE news
Confidence: 99.99%
