In [1]:
import pandas as pd

# Load datasets
df_real = pd.read_csv('True.csv')
df_real['RealNews?'] = True

df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews?'] = False

# Combine data together
df = pd.concat([df_real, df_fake], ignore_index=True)

# Create a new column called documen containing info [title + text]
df['document'] = df[['title', 'text']].agg(' '.join, axis=1)
# Ignore the cases for symplicity
df['document'] = df['document'].apply(lambda x: x.lower())


In [2]:
# Check the head and the length of the dataset
print(df.head())
len(df)

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  RealNews?  \
0  December 31, 2017        True   
1  December 29, 2017        True   
2  December 31, 2017        True   
3  December 30, 2017        True   
4  December 29, 2017        True   

                         

44898

In [3]:
from sklearn.model_selection import train_test_split

# Split the df into a training set and a test set
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True)


In [4]:
import re
from collections import defaultdict
import numpy as np

# Function to tokenize documents
def tokenize(document):
    return re.split(r"\W+", document)

# Count words in each class
# Initiate dictionaries to store word count for each class
real_wordcount = defaultdict(int)
fake_wordcount = defaultdict(int)


# Initizalize the Variables to count the number of documents in each class
real_doccount = 0
fake_doccount = 0

# Populate the word counts
for _, row in df_train.iterrows():
    # Tokenize the document into individual words
    words = tokenize(row['document'])
    # Check if the document is labeled as real or fake
    # If true, then increment corresponding variable
    if row['RealNews?'] == True:
        real_doccount += 1
        for word in words:
            real_wordcount[word] += 1
    # If false
    else:
        fake_doccount += 1
        for word in words:
            fake_wordcount[word] += 1

# Calculate probabilities with Laplace smoothing
# Laplace smoothing is applied by adding 1 to the word count
# to avoid zero probability issues, and dividing by the total count of
# words in the class plus the vocabulary size

# Union the keys of both the dictionaries
unique_vocab = set(real_wordcount.keys()).union(set(fake_wordcount.keys()))
vocab_size = len(unique_vocab)
# Total words in real news
total_real_words = sum(real_wordcount.values())
# Total words in fake news
total_fake_words = sum(fake_wordcount.values())

#  Calculate the probability of a word given the class
def word_prob(word, real=True):
    if real:
        return (real_wordcount[word] + 1) / (total_real_words + vocab_size)
    else:
        return (fake_wordcount[word] + 1) / (total_fake_words + vocab_size)

# Document probability given class
def document_prob(doc, real=True):
    words = tokenize(doc)
    # Compute the logarithm of the prior probability of the class
    prob = np.log(real_doccount / (real_doccount + fake_doccount)) if real else np.log(fake_doccount / (real_doccount + fake_doccount))
    # Add the log probability of each word in the document
    for word in words:
        prob += np.log(word_prob(word, real))
    return prob


In [5]:
# Initialize the lists to store true and predicted labels
y_true = []
y_pred = []

# Iterate the test dataset to make predictions
for _, row in df_test.iterrows():
    # log probability of the document being real
    real_prob = document_prob(row['document'], real=True)
    # log probability of the document being fake
    fake_prob = document_prob(row['document'], real=False)
    # append the labels
    y_true.append(row['RealNews?'])
    y_pred.append(real_prob > fake_prob)

# Evaluate performance
from sklearn.metrics import precision_recall_fscore_support

precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.9514607775477341
Recall: 0.9574074074074074
F1 Score: 0.9544248298142379
