In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the datasets
fake_data = pd.read_csv('data/Fake.csv')[:100]
true_data = pd.read_csv('data/True.csv')[:100]

# Preprocess the data
fake_data['label'] = 1  # Fake news label
true_data['label'] = 0  # Real news label

# Concatenate the datasets
combined_data = pd.concat([fake_data, true_data])


In [3]:
combined_data

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
95,House panel chair introduces $81 billion disas...,WASHINGTON (Reuters) - The chairman of the U.S...,politicsNews,"December 19, 2017",0
96,Trump nominates Liberty University professor t...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 19, 2017",0
97,Trump on Twitter (Dec 18) - Congressional Race...,The following statements were posted to the ve...,politicsNews,"December 18, 2017",0
98,Trump Cabinet officials to visit Puerto Rico t...,WASHINGTON (Reuters) - Two members of Presiden...,politicsNews,"December 19, 2017",0


In [4]:
# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_data['text'], combined_data['label'], test_size=0.2, random_state=42)


In [5]:
# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
# Initialize the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Provide the output directory path here
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Change save strategy to match evaluation strategy
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    greater_is_better=True,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Create instances of the datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length=512)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, max_length=512)


In [8]:

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [9]:


# Fine-tune the model
trainer.train()


  0%|          | 1/240 [00:02<09:52,  2.48s/it]

{'loss': 0.6975, 'grad_norm': 2.36472487449646, 'learning_rate': 1.0000000000000001e-07, 'epoch': 0.01}


                                                
 33%|███▎      | 80/240 [00:36<01:05,  2.45it/s]

{'eval_loss': 0.456595242023468, 'eval_runtime': 1.8945, 'eval_samples_per_second': 21.114, 'eval_steps_per_second': 10.557, 'epoch': 1.0}


 42%|████▏     | 100/240 [00:47<00:58,  2.39it/s]

{'loss': 0.5969, 'grad_norm': 12.46007251739502, 'learning_rate': 1e-05, 'epoch': 1.25}


                                                 
 67%|██████▋   | 160/240 [01:13<00:32,  2.49it/s]

{'eval_loss': 0.009769042953848839, 'eval_runtime': 1.9118, 'eval_samples_per_second': 20.923, 'eval_steps_per_second': 10.462, 'epoch': 2.0}


 83%|████████▎ | 200/240 [01:32<00:15,  2.63it/s]

{'loss': 0.0626, 'grad_norm': 0.10666202753782272, 'learning_rate': 2e-05, 'epoch': 2.5}


                                                 
100%|██████████| 240/240 [01:50<00:00,  2.59it/s]

{'eval_loss': 0.0022792653180658817, 'eval_runtime': 1.9158, 'eval_samples_per_second': 20.879, 'eval_steps_per_second': 10.44, 'epoch': 3.0}


100%|██████████| 240/240 [01:53<00:00,  2.11it/s]

{'train_runtime': 113.7806, 'train_samples_per_second': 4.219, 'train_steps_per_second': 2.109, 'train_loss': 0.27584396830449504, 'epoch': 3.0}





TrainOutput(global_step=240, training_loss=0.27584396830449504, metrics={'train_runtime': 113.7806, 'train_samples_per_second': 4.219, 'train_steps_per_second': 2.109, 'train_loss': 0.27584396830449504, 'epoch': 3.0})

In [10]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_labels.to_numpy()

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

100%|██████████| 20/20 [00:02<00:00,  8.33it/s]

Accuracy: 0.975
Precision: 0.9545454545454546
Recall: 1.0
F1 Score: 0.9767441860465116





In [1]:
## Backup

In [None]:
!pip install pandas numpy scikit-learn transformers torch
!pip install accelerate -U

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch


# Load the datasets
fake_data = pd.read_csv('data/Fake.csv')
true_data = pd.read_csv('data/True.csv')

# Preprocess the data
fake_data['label'] = 1  # Fake news label
true_data['label'] = 0  # Real news label

# Concatenate the datasets
combined_data = pd.concat([fake_data, true_data])

combined_data
# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_data['text'], combined_data['label'], test_size=0.2, random_state=42)

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",  # Provide the output directory path here
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Change save strategy to match evaluation strategy
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    greater_is_better=True,
)


# Create instances of the datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length=512)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, max_length=512)



# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_labels.to_numpy()

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained("./fine_tuned_model")

# Load the saved model for inference
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil
# Define source and destination paths
source_dir = "/content/fine_tuned_model"  # Replace with the path to your saved model directory in Colab
destination_dir = "/content/drive/MyDrive/fakenews/"  # Replace with the destination directory in your Google Drive

# Copy the entire directory to Google Drive
shutil.copytree(source_dir, destination_dir + "fine_tuned_model")
source_dir = "/content/results"
shutil.copytree(source_dir, destination_dir + "results")
source_dir = "/content/logs"
shutil.copytree(source_dir, destination_dir + "logs")