In [1]:
import os

In [2]:
import pandas as pd

In [3]:
import torch

In [4]:
from torch.utils.data import Dataset, DataLoader

In [5]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [6]:
from tqdm.auto import tqdm

In [7]:
import spacy

In [8]:

import random

In [9]:
# Report the library versions this run used, one line per library
version_report = (
    ("Pandas version:", pd.__version__),
    ("SpaCy version:", spacy.__version__),
    ("Torch version:", torch.__version__),
    ("Cuda version:", torch.version.cuda),
)
for heading, value in version_report:
    print(heading, value)

Pandas version: 2.3.0
SpaCy version: 3.8.7
Torch version: 2.7.1+cpu
Cuda version: None


In [10]:
# Use the GPU when torch reports a usable CUDA runtime, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cpu


In [11]:
# 1. Path & row limit
data_path = os.path.join('../../data', 'bigFakeNews', 'dataFiltered.csv')

# Only the first N_ROWS rows of the CSV are read; set to None to read the whole file.
# NOTE(review): this takes the head of the file, not a random sample. If the CSV is
# ordered (e.g. by label) the subset will be skewed - confirm before trusting metrics.
N_ROWS = 100
df = pd.read_csv(data_path, nrows=N_ROWS)

print(f"Sampled shape: {df.shape}")


Sampled shape: (100, 3)


In [12]:
# 2. Collect the text column as a plain list of Python strings
texts = list(df['text'].astype(str))

# Load the small English spaCy pipeline used for preprocessing
nlp = spacy.load("en_core_web_sm")

# Stream the texts through the pipeline:
#    - batch_size: how many texts are handed to the pipeline per batch
#    - n_process: number of worker processes (keep at 1 on Windows)
docs = nlp.pipe(
    texts,
    batch_size=10,
    n_process=1,
)

In [13]:


# `docs` from the previous cell is a one-shot generator; build a fresh stream here so
# this cell can be re-run on its own without iterating an already exhausted one.
PROGRESS_EVERY = 10  # print a status line every PROGRESS_EVERY documents
n_texts = len(texts)

processed_texts = []
for i, doc in enumerate(nlp.pipe(texts, batch_size=10, n_process=1), start=1):
    # Keep the lower-cased lemma of every token that is neither a stop word nor punctuation
    tokens = (
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    )
    processed_texts.append(" ".join(tokens))

    # Status line every PROGRESS_EVERY docs, plus one for the final document
    if i % PROGRESS_EVERY == 0 or i == n_texts:
        print(f"Processed {i}/{n_texts} texts ({i/n_texts*100:.1f}%)")

# Overwrites the raw text column with the cleaned text
df['text'] = processed_texts

Processed 10/100 texts (10.0%)
Processed 20/100 texts (20.0%)
Processed 30/100 texts (30.0%)
Processed 40/100 texts (40.0%)
Processed 50/100 texts (50.0%)
Processed 60/100 texts (60.0%)
Processed 70/100 texts (70.0%)
Processed 80/100 texts (80.0%)
Processed 90/100 texts (90.0%)
Processed 100/100 texts (100.0%)


In [14]:
# Disabled: would write `df` (now holding the cleaned text column) to a pickle file.
#df.to_pickle("data_with_clean_text.pkl")

In [15]:
# Directory holding the saved tokenizer files and model weights
model_dir = './models/Saurabh Shahane-DistilBert-simple'

tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)

# Load the classifier, move it onto the selected device and switch it to evaluation mode
model = (
    DistilBertForSequenceClassification
    .from_pretrained(model_dir)
    .to(device)
    .eval()
)
print("Tokenizer and model loaded successfully.")

Tokenizer and model loaded successfully.


In [16]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors.
        max_length: Length each sequence is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus a ``'label'`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading axis (expected to be the size-1 batch axis added by
        # return_tensors='pt'). A bare squeeze() would also collapse the sequence
        # axis whenever max_length == 1.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the text and label columns in a dataset, then batch it for inference
dataset = FakeNewsDataset(df['text'].tolist(), df['label'].tolist(), tokenizer)
loader = DataLoader(dataset=dataset, batch_size=32)
print(f"DataLoader ready with {len(dataset)} samples.")

DataLoader ready with 100 samples.


In [17]:
all_preds = []
all_labels = []
# Inference only, so gradient tracking is switched off for the whole loop
with torch.no_grad():
    for batch in tqdm(loader, desc="Inference"):
        # Only the tensors passed to the model are moved to the device
        inputs = {k: v.to(device) for k, v in batch.items() if k in ('input_ids', 'attention_mask')}
        # Labels are only compared on the host, so they never need to visit the device
        labels = batch['label']
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        # Accumulate plain Python ints rather than numpy scalars
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Compute overall accuracy over every sample the loader produced
correct = sum(pred == gold for pred, gold in zip(all_preds, all_labels))
total = len(all_labels)
# Guard against an empty loader instead of raising ZeroDivisionError
accuracy = correct / total if total else float('nan')
# NOTE(review): the recorded run printed 0.2300 (23/100). Verify that the label encoding
# in the CSV matches the checkpoint's class ids, and that the checkpoint was trained on
# text preprocessed the same way (lemmatised, stop words removed) - TODO confirm.
print(f"\nAccuracy on full dataset: {accuracy:.4f} ({correct}/{total})")

Inference:   0%|          | 0/4 [00:00<?, ?it/s]


Accuracy on full dataset: 0.2300 (23/100)


In [18]:
# Display a small sample of predictions
df_results = pd.DataFrame({
    'text': df['text'].iloc[:10],
    'label': df['label'].iloc[:10],
    'prediction': all_preds[:10]
})
df_results

Unnamed: 0,text,label,prediction
0,abigail ratchford‘s instagram page force recko...,0,1
1,say love like sweat stain scratch toe curl rom...,0,1
2,ryan kovacik vcu premed student certify emt pe...,0,1
3,quarter beer pong boat race flip cup perfectly...,0,0
4,tonight \n\n sophia bush chicago pd \n\n 10 pm...,0,0
5,rate \n\n oooouuuuuuuucchh f**k get hurt help ...,0,1
6,say say new series design help dude understand...,0,1
7,say say new dating sex relationship series des...,0,1
8,say say coed dating sex relationship debate se...,0,1
9,converse 1 choice 1920s b ball star indie rock...,0,1
