In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('Fake News Data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [6]:
df['label'].value_counts()

1    37106
0    35028
Name: label, dtype: int64

In [7]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [8]:
df.isnull().sum()

title    558
text      39
label      0
dtype: int64

In [9]:
df = df.dropna()

In [10]:
import nltk
from nltk import bigrams
from nltk import FreqDist
from nltk.util import ngrams

from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk import pos_tag

### Lowercase, Remove Punctuation

In [11]:
#df['text'] = df['text'].str.lower()  #lower case
#df['text'] = df['text'].str.replace(r'\d+', '', regex=True)  # numbers
#df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)  #  punctuation
# keep relevant punctuation 

In [12]:
df['title'] = df['title'].str.lower()  #lower case
#df['text'] = df['text'].str.replace(r'\d+', '', regex=True)  # numbers
#df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)  #  punctuation
#df['title'] = df['title'].str.replace(r'[]', '', regex=True)  #  punctuation

df['title'] = df['title'].str.replace(r'[\[\]]', '', regex=True)

# keep relevant punctuation 

### Handled NAs, Tokenization

In [13]:
# Tokenize the text and title, then combine them for each row and pair with the label
#df['combined_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
#documents = [(nltk.word_tokenize(text), label) for text, label in zip(df['combined_text'], df['label'])]

In [14]:
df['combined_text'] = df['title'].fillna('')

In [15]:
#df['combined_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
tokens = [nltk.word_tokenize(text) for text in df['combined_text']]
labels = df['label'].tolist()

### Removing Stop Words

In [16]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

### Featuresets - POS + Bigrams

In [17]:
# Process documents and collect frequency distribution
all_words = []
all_bigrams = []
all_tags = []

for token_list in tokens:
    # Filter tokens in one go with a list comprehension
    words = [word for word in token_list if word not in stop_words and len(word) > 1]
    #words = [word for word in token_list if len(word) > 1]
    all_words.extend(words)
    # You only need to create bigrams if you're going to use them
    bigrams_list = list(ngrams(words, 2))
    all_bigrams.extend(bigrams_list)
    # Store POS tags once to avoid repeated computation
    pos_tags = pos_tag(words)
    all_tags.extend([tag for _, tag in pos_tags])

In [18]:
# Using FreqDist is faster if you feed the entire list of words/bigrams/tags at once
word_freq = FreqDist(all_words)
bigram_freq = FreqDist(all_bigrams)
tag_freq = FreqDist(all_tags)

In [19]:
# Select the top N most common words, bigrams, and POS tags
top_n = 2000
most_common_words = {word for word, _ in word_freq.most_common(top_n)}
most_common_bigrams = {bigram for bigram, _ in bigram_freq.most_common(top_n)}
most_common_tags = {tag for tag, _ in tag_freq.most_common(top_n)}

In [20]:
# Define the feature extraction function
def document_features(document, most_common_words, most_common_bigrams, most_common_tags):
    document_words = set(document)
    document_bigrams = set(ngrams(document, 2))
    document_tags = set(tag for _, tag in pos_tag(document))
    
    features = {}
    # Use intersection for efficient feature setting
    words_in_doc = document_words & most_common_words
    bigrams_in_doc = document_bigrams & most_common_bigrams
    tags_in_doc = document_tags & most_common_tags
    
    for word in words_in_doc:
        features[f'contains_word({word})'] = True
    for bigram in bigrams_in_doc:
        features[f'contains_bigram({bigram[0]} {bigram[1]})'] = True
    for tag in tags_in_doc:
        features[f'contains_pos({tag})'] = True
    
    return features

In [21]:
# Create your feature sets more efficiently
featuresets = [(document_features(d, most_common_words, most_common_bigrams, most_common_tags), c) for d, c in zip(tokens, labels)]


In [22]:
# train a classifier and report accuracy
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.89

In [24]:
from sklearn.metrics import classification_report

predictions = [classifier.classify(features) for features, label in test_set]
true_labels = [label for features, label in test_set]

report = classification_report(true_labels, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.82      0.88       474
           1       0.85      0.96      0.90       526

    accuracy                           0.89      1000
   macro avg       0.90      0.89      0.89      1000
weighted avg       0.90      0.89      0.89      1000



## ??!! accuracy increased after using just title and not text ...

### from 70-something to 87.5 (unigram+bigram+POS) w/o stop words

### 86.7 (unigram+bigram+POS) with stop words

### negation words included for all the above

### 88.1 (unigram+bigram+POS) with stop words and with punctuation (square brackets)
#### !! Square brackets are very indicative of fake news 

### 89 (unigram+bigram+POS) without stop words and with punctuation (square brackets)


### 89  (unigram+bigram+POS) without stop words and with removing only square brackets 

#### Not much difference between removing all punc vs just square brackets