In [2]:
import pandas as pd
gsp_f = pd.read_csv("datasets/gossipcop_fake.csv")
gsp_r = pd.read_csv("datasets/gossipcop_real.csv")
ptf_f = pd.read_csv("datasets/politifact_fake.csv")
ptf_r = pd.read_csv("datasets/politifact_real.csv")
liar = pd.read_csv("datasets/liar.csv")

In [3]:
gsp_f.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...


In [4]:
ptf_r.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...


In [5]:
def preprocess_f(df):
    # Drop rows with missing id
    df = df.dropna(subset=['title'])
    # Drop unnecessary columns
    df = df.drop(columns=['id', 'news_url', 'tweet_ids'])
    # Append new label column with all zeros
    df['label'] = 0
    return df

gsp_f = preprocess_f(gsp_f)
ptf_f = preprocess_f(ptf_f)

In [6]:
gsp_f.head()

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,0
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,0
2,Celebrities Join Tax March in Protest of Donal...,0
3,Cindy Crawford's daughter Kaia Gerber wears a ...,0
4,Full List of 2018 Oscar Nominations – Variety,0


In [7]:
def preprocess_r(df):
    # Drop rows with missing id
    df = df.dropna(subset=['title'])
    # Drop unnecessary columns
    df = df.drop(columns=['id', 'news_url', 'tweet_ids'])
    # Append new label column with all zeros
    df['label'] = 1
    return df

gsp_r = preprocess_r(gsp_r)
ptf_r = preprocess_r(ptf_r)

In [8]:
ptf_r.head()

Unnamed: 0,title,label
0,National Federation of Independent Business,1
1,comments in Fayetteville NC,1
2,"Romney makes pitch, hoping to close deal : Ele...",1
3,Democratic Leaders Say House Democrats Are Uni...,1
4,"Budget of the United States Government, FY 2008",1


In [9]:
# Show the resulting dataframe
liar = liar.loc[:, ~liar.columns.str.contains('^Unnamed')]

label_dict = {
    "pants-fire": 0,
    "FALSE": 0,
    "barely-true": 0,
    "half-true": 0,
    "mostly-true": 0,
    "TRUE": 1
}

# Convert the string labels to integer labels
liar["label"] = liar["label"].apply(lambda x: label_dict[x])
print(liar.head(10))

                                               title  label
0  Says the Annies List political group supports ...      0
1  When did the decline of coal start? It started...      0
2  Hillary Clinton agrees with John McCain "by vo...      0
3  Health care reform legislation is likely to ma...      0
4  The economic turnaround started at the end of ...      0
5  The Chicago Bears have had more starting quart...      1
6  Jim Dunnam has not lived in the district he re...      0
7  I'm the only person on this stage who has work...      0
8  However, it took $19.5 million in Oregon Lotte...      0
9  Says GOP primary opponents Glenn Grothman and ...      0


In [10]:
# append the dataframes together
dataset = pd.concat([gsp_f, ptf_r, liar[liar['label']==1]], ignore_index=True)

# display the resulting dataframe
print(dataset.head())
print("True labelled news-" + str(len(dataset[dataset['label'] == 1])))
print("Fake labelled news-" + str(len(dataset[dataset['label'] == 0])))

                                               title  label
0  Did Miley Cyrus and Liam Hemsworth secretly ge...      0
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...      0
2  Celebrities Join Tax March in Protest of Donal...      0
3  Cindy Crawford's daughter Kaia Gerber wears a ...      0
4      Full List of 2018 Oscar Nominations – Variety      0
True labelled news-2260
Fake labelled news-5323


In [11]:
from sklearn.utils import shuffle

# shuffle the rows of the df DataFrame randomly
dataset = shuffle(dataset, random_state=42)
# reset the indexes
dataset.reset_index(drop=True, inplace=True)
print(dataset.head())
print(dataset.tail(10))

                                               title  label
0                   Paris Jackson Lesbian Bombshell!      0
1  Ben Affleck Called a 'Pig' After Rumors Surfac...      0
2  Jennifer Aniston Forgives Brad Pitt & Angelina...      0
3  Christian Bale avoiding sun to prepare for Dra...      0
4  Under Hosni Mubaraks rule, Egypt received more...      1
                                                  title  label
7573  Rihanna Rumored To Have Broken Up With Her Bil...      0
7574  The majority of the Hispanic population and th...      1
7575                   This Labor Day, we need protests      1
7576  Are Nicole Kidman and Keith Urban headed for d...      0
7577  Brad Pitt 'offers apology' to Jennifer Aniston...      0
7578     Katie Holmes, Jamie Foxx Battling Over Prenup?      0
7579  Eva Longoria Has Never Looked Happier at 43: I...      0
7580  Gross Domestic Product Percent change from pre...      1
7581  Jennifer Garner's Trainer Launched a Wellness ...      0
7582  Says

In [12]:
dataset.shape

(7583, 2)

In [13]:
dataset.to_csv('fake_news_dataset.csv', index=False)

In [14]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
dataset = pd.read_csv("fake_news_dataset.csv")

def preprocess_text(text):
    # Remove all URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove all non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # Remove all special symbols
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert all text to lowercase
    text = text.lower()
    
    # Tokenize the text into words
    tokens = word_tokenize(text)
    
    # Remove all stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Join the tokens back into a single string
    text = ' '.join(filtered_tokens)
    
    return text

# Apply the preprocessing function to the title column
dataset["title"] = dataset["title"].apply(preprocess_text)

# Save the preprocessed dataset to a new CSV file
dataset.to_csv("preprocessed_dataset.csv", index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sujoydatta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sujoydatta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score

# Load the dataset
dataset = pd.read_csv("preprocessed_dataset.csv")

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset["title"], dataset["label"], test_size=0.3)

# Tokenize the texts using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
train_texts = [str(text) for text in train_texts]
test_texts = [str(text) for text in test_texts]

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings["input_ids"]),
                                               torch.tensor(train_encodings["attention_mask"]),
                                               torch.tensor(train_labels.values))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings["input_ids"]),
                                              torch.tensor(test_encodings["attention_mask"]),
                                              torch.tensor(test_labels.values))

# Set the batch size and create the dataloaders
batch_size = 5
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, output_attentions=False,
                                                      output_hidden_states=False)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the optimizer and learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Train the model
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Epoch:", epoch+1, "Training Loss:", avg_train_loss)

    # Evaluate the model on the test set
    model.eval()
    predictions = []
    true_labels = []
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    preds = np.argmax(predictions, axis=1)

    acc = accuracy_score(true_labels, preds)
    print("Epoch:", epoch+1, "Testing Accuracy:", acc)

: 

: 

In [None]:
!pip install translator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import requests
import uuid
import json
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Add your key and endpoint
key = "b7d9de81f743451686502350c1e39daf"
endpoint = "https://api.cognitive.microsofttranslator.com"

# location, also known as region.
# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
location = "centralindia"

path = '/translate'
constructed_url = endpoint + path

params = {
    'api-version': '3.0',
    'from': 'hi',
    'to': ['en']
}

headers = {
    'Ocp-Apim-Subscription-Key': key,
    # location required if you're using a multi-service or regional (not global) resource.
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}


# Function to translate text using the Azure Translator API
def translate_text(text):
    body = [{
        'text': text
    }]        
    request = requests.post(constructed_url, params=params, headers=headers, json=body)
    response = request.json()
    return response[0]['translations'][0]['text']

# Take user input in Hindi/Telugu/Tamil language
input_text = input("Enter a news headline in Hindi: ")
print("The translated text to English is- \n")
print(translate_text(input_text))


# Translate the input to English using the Azure translator API
translated_text = translate_text(input_text)
translated_text = preprocess_text(translated_text)

Enter a news headline in Hindi: गेम थ्रोंस सीजन 7 का प्रीमियर आर्य सीन के उद्घाटन में दो प्रतिष्ठित स्टोरीलाइन किताबों को जोड़ता है
The translated text to English is- 

The premiere of Game Thrones Season 7 combines two iconic storyline books at the opening of The Arya Scene


In [None]:
# Load the trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
encoded_input = tokenizer(translated_text, padding=True, truncation=True, max_length=256, return_tensors='pt')

In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

logits = output.logits
prob = torch.softmax(logits, dim=1)
label_indices = prob.argmax(dim=1)

In [None]:
logits = output.logits
prob = torch.softmax(logits, dim=1)
fake_prob = prob[:, 0].item()

print(f"The input has {fake_prob * 100:.2f}% probability of being fake news.")

The input has 28.21% probability of being fake news.
