In [None]:
import nltk
import numpy as np
import pandas as pd
import torch
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from scipy.sparse import hstack
from scipy.stats import uniform, randint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from xgboost import XGBClassifier

# Download the NLTK data packages one after another
for _nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

# Read the news-article CSV and keep only rows that have both a label and a text
url = 'https://github.com/sm2hashmi/ML-and-DL-projects/raw/main/datasets/news_articles.csv'
df = pd.read_csv(url)
df_clean = df.dropna(subset=['label', 'text'])

# Pretrained BERT tokenizer and encoder, both loaded from the
# 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode a single text with a BERT tokenizer
def encode_text(text, tokenizer, max_length=512):
    """Encode ``text`` into fixed-length model inputs.

    Args:
        text: The text to encode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoding_options = {
        'add_special_tokens': True,
        'max_length': max_length,
        'padding': 'max_length',  # pad shorter sequences up to max_length
        'truncation': True,  # cut longer sequences down to max_length
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(text, **encoding_options)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

# Dataset that tokenizes each news text when the item is requested
class NewsDataset(Dataset):
    """Map-style dataset pairing every text with its label.

    Texts are stored raw and encoded through ``encode_text`` on each
    ``__getitem__`` call.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of labels, parallel to ``texts``.
        tokenizer: Tokenizer forwarded to ``encode_text``.
        max_length: Sequence length forwarded to ``encode_text``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label_tensor)`` for item ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        ids, mask = encode_text(sample_text, self.tokenizer, self.max_length)
        # squeeze() drops the size-1 dimensions of the encoder output
        return ids.squeeze(), mask.squeeze(), torch.tensor(sample_label)

# Turn batches of encoded text into one matrix of BERT [CLS] embeddings
def get_bert_embeddings(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and stack the first-token vectors.

    Args:
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called with ``input_ids`` / ``attention_mask`` keywords
            whose output exposes ``last_hidden_state``.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array with one row per sample, built by vertically stacking the
        per-batch embeddings in iteration order.
    """
    model = model.to(device)
    model.eval()

    def _cls_vectors():
        # Yield the position-0 ([CLS]) hidden state of every batch as NumPy
        for batch in data_loader:
            ids, mask, _labels = (tensor.to(device) for tensor in batch)
            hidden = model(input_ids=ids, attention_mask=mask).last_hidden_state
            yield hidden[:, 0, :].cpu().numpy()

    # The generator is fully consumed inside no_grad, so no graph is recorded
    with torch.no_grad():
        chunks = list(_cls_vectors())

    return np.vstack(chunks)

# Prepare the dataset for BERT
max_length = 128  # Token length each text is padded/truncated to by the tokenizer
texts = df_clean['text'].tolist()

# Map string labels to integers and fail loudly on anything unexpected:
# Series.map leaves NaN for values missing from the mapping, and those NaNs
# would otherwise flow silently into the label tensors and the target array.
label_map = {'Real': 1, 'Fake': 0}
mapped_labels = df_clean['label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(
        df_clean.loc[mapped_labels.isna(), 'label'].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected label values (expected {list(label_map)}): {unexpected}"
    )
labels = mapped_labels.astype(int).tolist()
dataset = NewsDataset(texts, labels, tokenizer, max_length=max_length)

# DataLoader for BERT embeddings (shuffle=False keeps rows aligned with `texts`)
batch_size = 16
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Get BERT embeddings. The encoder is only run in eval mode and is never
# fitted on this data, so embedding all rows before the split leaks nothing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_embeddings = get_bert_embeddings(data_loader, bert_model, device)

# Labels
y = np.array(labels)

# Split row indices BEFORE fitting any preprocessing so the test rows cannot
# influence fitted statistics. Stratify to keep the class ratio in both parts.
train_idx, test_idx = train_test_split(
    np.arange(len(y)), test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization: vocabulary and IDF weights are learned from the
# training texts only, then applied to every row.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit([texts[i] for i in train_idx])
tfidf_features = tfidf_vectorizer.transform(texts)

# Combine TF-IDF and BERT Embeddings (CSR so rows can be selected by index)
X_combined = hstack([tfidf_features, bert_embeddings], format='csr')

# Materialize the training and testing sets from the index split
X_train, X_test = X_combined[train_idx], X_combined[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# SMOTE oversampling lives inside the pipeline so that, during
# cross-validation, it is fitted on each training fold only and validation
# folds contain real samples exclusively.
smote = SMOTE(random_state=42)

# Initialize the XGBClassifier.
# - scale_pos_weight is left at its default: SMOTE already balances the
#   classes, so the ratio previously computed after resampling was always 1.0.
# - use_label_encoder is omitted: XGBoost reports it as an unused parameter.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)

model_pipeline = Pipeline([('smote', smote), ('xgb', xgb_model)])

# Hyperparameter tuning using RandomizedSearchCV
# (keys carry the 'xgb__' prefix to target the classifier step)
param_dist = {
    'xgb__n_estimators': randint(50, 200),
    'xgb__max_depth': randint(3, 10),
    'xgb__learning_rate': uniform(0.01, 0.2),
    'xgb__subsample': uniform(0.7, 0.3),
    'xgb__colsample_bytree': uniform(0.7, 0.3),
    'xgb__gamma': uniform(0, 0.5),
}

# RandomizedSearchCV setup
random_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Number of parameter settings that are sampled
    scoring='f1',  # Optimize for F1 score
    cv=3,  # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1,  # Use all available cores
)

# Fit the model on the un-resampled training data; resampling happens per fold
random_search.fit(X_train, y_train)

# Best model from RandomizedSearchCV. The refitted pipeline is kept, and the
# classifier step is exposed under the original name.
best_pipeline = random_search.best_estimator_
best_xgb_model = best_pipeline.named_steps['xgb']

# Predict and evaluate the model (the sampler is not applied at predict time,
# so calling the classifier directly is equivalent to best_pipeline.predict)
y_pred = best_xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7878048780487805
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       249
           1       0.76      0.66      0.71       161

    accuracy                           0.79       410
   macro avg       0.78      0.77      0.77       410
weighted avg       0.79      0.79      0.78       410

