In [32]:
# Merge True and Fake News Data 
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")
true["label"]  = 0
fake["label"] = 1 #using 1 for fake since we want to detect fake news

sample_true = true.sample(n=5000)
sample_true['text'] = sample_true['text'].astype(str)
sample_true['text'] = sample_true['text'].apply(lambda x: x.split(' - ', 1)[1] if isinstance(x, str) and ' - ' in x else x)


sample_fake = fake.sample(n=5000)

print(sample_true['text'].head())



1592     Wisconsin, Ohio, California and 10 other state...
4048     President Donald Trump on Wednesday attacked a...
13063    Venezuela s powerful former oil czar Rafael Ra...
13383    Emperor Akihito, who has spent much of his nea...
11631    Lebanese Prime Minister Saad al-Hariri said on...
Name: text, dtype: object


In [33]:
sample_false = fake.sample(n=5000)

df = pd.concat([sample_true,sample_fake])

df = df.sample(frac=1).reset_index(drop=True) # shuffle


train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)

In [34]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels.tolist())
val_dataset = NewsDataset(val_encodings, val_labels.tolist())

In [35]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import accelerate

# Load model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 0.6883, 'grad_norm': 2.1348206996917725, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.02}
{'loss': 0.6918, 'grad_norm': 1.1100388765335083, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.04}
{'loss': 0.6785, 'grad_norm': 1.9762234687805176, 'learning_rate': 3e-06, 'epoch': 0.06}
{'loss': 0.6779, 'grad_norm': 1.4350132942199707, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.08}
{'loss': 0.656, 'grad_norm': 1.3762658834457397, 'learning_rate': 5e-06, 'epoch': 0.1}
{'loss': 0.6115, 'grad_norm': 2.3143560886383057, 'learning_rate': 6e-06, 'epoch': 0.12}
{'loss': 0.511, 'grad_norm': 3.2735044956207275, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.14}
{'loss': 0.3869, 'grad_norm': 2.989393949508667, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.16}
{'loss': 0.2709, 'grad_norm': 2.673880100250244, 'learning_rate': 9e-06, 'epoch': 0.18}
{'loss': 0.2154, 'grad_norm': 4.164487361907959, 'learning_rate': 1e-05, 'epoch': 0.2}
{'loss': 0.1667, 'grad_norm': 7.011

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.03877260163426399, 'eval_runtime': 33.7642, 'eval_samples_per_second': 59.234, 'eval_steps_per_second': 3.702, 'epoch': 1.0}
{'loss': 0.0327, 'grad_norm': 0.011079901829361916, 'learning_rate': 4.9500000000000004e-05, 'epoch': 1.02}
{'loss': 0.0395, 'grad_norm': 0.00945274531841278, 'learning_rate': 4.9e-05, 'epoch': 1.04}
{'loss': 0.0216, 'grad_norm': 0.021577244624495506, 'learning_rate': 4.85e-05, 'epoch': 1.06}
{'loss': 0.0007, 'grad_norm': 0.020690683275461197, 'learning_rate': 4.8e-05, 'epoch': 1.08}
{'loss': 0.0009, 'grad_norm': 0.023781199008226395, 'learning_rate': 4.75e-05, 'epoch': 1.1}
{'loss': 0.049, 'grad_norm': 0.008982961066067219, 'learning_rate': 4.7e-05, 'epoch': 1.12}
{'loss': 0.0515, 'grad_norm': 0.018898945301771164, 'learning_rate': 4.6500000000000005e-05, 'epoch': 1.14}
{'loss': 0.1168, 'grad_norm': 0.036860279738903046, 'learning_rate': 4.600000000000001e-05, 'epoch': 1.16}
{'loss': 0.0043, 'grad_norm': 2.6860997676849365, 'learning_rate': 4.55e

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.014872394502162933, 'eval_runtime': 34.349, 'eval_samples_per_second': 58.226, 'eval_steps_per_second': 3.639, 'epoch': 2.0}
{'loss': 0.0004, 'grad_norm': 0.008482244797050953, 'learning_rate': 2.45e-05, 'epoch': 2.02}
{'loss': 0.0004, 'grad_norm': 0.005281701683998108, 'learning_rate': 2.4e-05, 'epoch': 2.04}
{'loss': 0.0004, 'grad_norm': 0.017311103641986847, 'learning_rate': 2.35e-05, 'epoch': 2.06}
{'loss': 0.0003, 'grad_norm': 0.004934926051646471, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.08}
{'loss': 0.0351, 'grad_norm': 0.006347563583403826, 'learning_rate': 2.25e-05, 'epoch': 2.1}
{'loss': 0.0003, 'grad_norm': 0.005840738769620657, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.12}
{'loss': 0.0003, 'grad_norm': 0.005816238932311535, 'learning_rate': 2.15e-05, 'epoch': 2.14}
{'loss': 0.0003, 'grad_norm': 0.0047832331620156765, 'learning_rate': 2.1e-05, 'epoch': 2.16}
{'loss': 0.0002, 'grad_norm': 0.004353512544184923, 'learning_rate': 2.05e-05, 'e

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.01495737861841917, 'eval_runtime': 32.886, 'eval_samples_per_second': 60.816, 'eval_steps_per_second': 3.801, 'epoch': 3.0}
{'train_runtime': 1178.7813, 'train_samples_per_second': 20.36, 'train_steps_per_second': 1.273, 'train_loss': 0.06059499291516841, 'epoch': 3.0}


TrainOutput(global_step=1500, training_loss=0.06059499291516841, metrics={'train_runtime': 1178.7813, 'train_samples_per_second': 20.36, 'train_steps_per_second': 1.273, 'total_flos': 3179217567744000.0, 'train_loss': 0.06059499291516841, 'epoch': 3.0})

In [36]:
# Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")



  0%|          | 0/125 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.01495737861841917, 'eval_runtime': 36.0316, 'eval_samples_per_second': 55.507, 'eval_steps_per_second': 3.469, 'epoch': 3.0}


In [37]:
# Save the model
model.save_pretrained('./fake_news_model')
tokenizer.save_pretrained('./fake_news_model')

('./fake_news_model\\tokenizer_config.json',
 './fake_news_model\\special_tokens_map.json',
 './fake_news_model\\vocab.txt',
 './fake_news_model\\added_tokens.json')

In [42]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = './fake_news_model'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Define the input text
input_text ="Republicans in Texas, including Gov. Greg Abbott, ripped into the new $44 billion White House disaster relief aid request as  inadequate.  It s been two months since the state was devastated by Hurricane Harvey and Donald Trump has failed on his promise to rebuild Texas. Abbott s criticism is strikingly different than the day after Trump visited Texas in the aftermath of the hurricane. His commitment was firm, strong and unequivocal,  Abbott said at the time.  That he was going to do everything he could to ensure that Texas will be restored as swiftly, as effectively as possible. But now, two months later, Republicans are calling the response  wholly inadequate,  according to the Dallas News.The White House disaster relief aid request falls well short of the demands made by officials from Texas, Florida and Puerto Rico.Greg Abbott said that the request  does not live up  to what Trump pledged in recovery aid. Abbott noted during a news conference that he s still reviewing the White House request but that it appears to be  completely inadequate.  What s more, Abbott said, it  does not live up  to what Trump has pledged in recovery aid, then he said that Washington worked faster for victims of Superstorm Sandy than for Harvey. Superstorm Sandy hit in 2012 during the Obama administration, by the way. The president has told me privately what he said publicly, and that is he wants to be the builder president. The president has said he wants this to be the best recovery from a disaster ever,  Abbott said.It s not just Abbott. Texas Sen. John Cornyn, the No. 2 Republican, blasted the request as  wholly inadequate.  A chorus of Texas lawmakers slammed it as insufficient.Houston Rep. John Culberson, a Republican and appropriations committee member, ripped Trump s recovery efforts, calling the request a  complete lack of understanding of the fundamental needs of Texans  and said it is a  nightmare  for Harvey survivors.Democrats, too, including Senate Minority Leader Chuck Schumer"

inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Make predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label
predictions = torch.argmax(outputs.logits, dim=-1)

# Interpret the results
label = predictions.item()
if label == 1:
    print("The text is classified as Fake News.")
else:
    print("The text is classified as True News.")

The text is classified as Fake News.


In [5]:
df

Unnamed: 0,title,text,subject,date,label
0,"House panel subpoenas New York, Massachusetts ...",WASHINGTON (Reuters) - A U.S. House of Represe...,politicsNews,"July 13, 2016",0
1,Syria investigator del Ponte signs off with a ...,GENEVA (Reuters) - Veteran prosecutor Carla de...,worldnews,"September 18, 2017",0
2,Phoenix Mayor Calls On Justice Department To ...,Phoenix mayor Greg Stanton is calling on the U...,News,"March 24, 2016",1
3,WATCH: 5 Straight Minutes Of Donald Trump Lyi...,There s a lot of uncertainties in this electio...,News,"August 16, 2016",1
4,Marriott Hotel Shoots Back Response After Musl...,A Muslim activist group has pushed Marriott I...,politics,"Sep 22, 2017",1
...,...,...,...,...,...
44893,MUSLIM ACTIVISTS LAUNCH VOTER REGISTRATION DRI...,Who would ve guessed? A coalition of U.S.-base...,Government News,"Dec 23, 2015",1
44894,Jakarta closes hotel targeted by Islamists for...,JAKARTA (Reuters) - Indonesia s capital has sh...,worldnews,"October 30, 2017",0
44895,Democrat Hilariously Mocks Paul Ryan During H...,"On Wednesday, the Democratic party staged a po...",News,"June 22, 2016",1
44896,Trumpsters Launch Insane Conspiracy Theory Ab...,Senator John McCain (R-AZ) was treated at Walt...,News,"November 23, 2017",1


In [7]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("CUDA current device:", torch.cuda.current_device())
    print("CUDA device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: False
CUDA device count: 0
