To Do:
- Apply sentiment analysis
- news article -> model -> { fakeNews: false, sentiment: 0.23 }

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

import nltk
nltk.download('subjectivity')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/samuelseokyukim/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/samuelseokyukim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read the data
df = pd.read_csv('news.csv')

# Get shape and head
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Get the labels
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [4]:
# Split the dataset
x_train,x_test,y_train,y_test = train_test_split(df['text'], labels, test_size = 0.2, random_state = 7)

In [5]:
# Loop through test set and clean each article

cleaned_articles = []

for document in x_test :
    # Remove all the special characters
    noSpecial = re.sub(r'\W', ' ', document)

    # Remove all single characters
    #noSingle = re.sub(r'\s+[a-zA-Z]\s+', ' ', noSpecial)

    # Remove single characters from the start
    #noSingleFromStart = re.sub(r'\^[a-zA-Z]\s+', ' ', noSingle) 

    # Substituting multiple spaces with single space
    noMultiSpace = re.sub(r'\s+', ' ', noSpecial, flags = re.I)

    # Removing prefixed 'b'
    noPrefixed = re.sub(r'^b\s+', '', noMultiSpace)

    # Convert to Lowercase
    lowercased = noPrefixed.lower()
    
    cleaned_articles.append(lowercased)



In [6]:
# Another method of applying sentiment analysis (unfinished)

"""
n_instances = 100

# Gather a list of subjective words and a list of objective words
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories = 'subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories = 'obj')[:n_instances]]
len(subj_docs), len(obj_docs)

# Split subjective and objective instances 
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

# Initialize sentiment analyzer
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

# Unigram word features handle negation
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq = 4)
print(len(unigram_feats))
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)

# Apply features to obtain a feature-value representation
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

# Train the classifier on the training_set
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

# Output evaluation results
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
"""

"\nn_instances = 100\n\n# Gather a list of subjective words and a list of objective words\nsubj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories = 'subj')[:n_instances]]\nobj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories = 'obj')[:n_instances]]\nlen(subj_docs), len(obj_docs)\n\n# Split subjective and objective instances \ntrain_subj_docs = subj_docs[:80]\ntest_subj_docs = subj_docs[80:100]\ntrain_obj_docs = obj_docs[:80]\ntest_obj_docs = obj_docs[80:100]\ntraining_docs = train_subj_docs + train_obj_docs\ntesting_docs = test_subj_docs + test_obj_docs\n\n# Initialize sentiment analyzer\nsentim_analyzer = SentimentAnalyzer()\nall_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n\n# Unigram word features handle negation\nunigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq = 4)\nprint(len(unigram_feats))\nsentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)\n\n# Apply

In [7]:
# Accepts a list of strings and pandas dataframe
# Loop through each element in list or dataframe 
# Applies sentiment analysis
def sentiment_analysis(articles_list) :
    
    for article in articles_list :
        
        # Initialize sentiment intensity analyzer
        sentiment_analyzer = SentimentIntensityAnalyzer()
        
        # Print(article)
        results = sentiment_analyzer.polarity_scores(article)
        print('Sentiment analysis results: ')
        
        for k in sorted(results) :
            print('{0}: {1}, '.format(k, results[k]), end = '')
        print()
        
        print("Article was rated as", results['neg'] * 100, "% Negative")
        print("Article was rated as", results['neu'] * 100, "% Neutral")
        print("Article was rated as", results['pos'] * 100, "% Positive")
        print("Overall rating of the article:", end = ' ')
        
        # Determine if an article is overall positive, negative, or neutral
        if results['compound'] >= 0.05 :
            print("Positive")
 
        elif results['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")


In [8]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(cleaned_articles)


In [40]:
# Initialize PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy of test: {round(score * 100, 2)}%')

Accuracy of test: 92.58%


In [41]:
# Build confusion matrix: [[true positive, false negative], [false positive, true negative]]
confusion_matrix(y_test, y_pred, labels = ['REAL', 'FAKE'])

array([[585,  44],
       [ 50, 588]])

In [42]:
# Test a random article from the Wall Street Journal
expected_y = ['REAL']
x_test_1 = ['Fishing is never an easy way to make a living, but it’s even harder when the federal bureaucracy can put you out of business on a whim. Herring fishermen are fighting back, with potentially large implications for the administrative state. In 2020 the National Marine Fisheries Service (NMFS) decreed that Atlantic herring fishermen must include a human monitor to ensure compliance with catch limits. The fishermen must pay for the monitor, though many are small, independent operators. By the fisheries service’s own estimate, its mandate costs each boat $710 a day. Depending on the catch, this is often more than the captains make on a trip. Even the service acknowledges that the monitoring cost will reduce herring operations’ annual returns by 20%. Enter the Cause of Action Institute, which sued the Commerce Department (which oversees the NMFS) in 2020 on behalf of fishermen in New Jersey. The suit claims Congress never authorized the bureaucratic cost-shift to the industry. The 1976 Magnuson-Stevens Act governs fishing in federal waters and allows the fisheries service to impose monitors. But nowhere in the law does Congress give the service power to force herring fishermen to fund the program. Congress has authorized industry-funded monitoring in other, specific contexts—including for certain North Pacific fisheries, and for foreign vessels. It did not do the same here. The fisheries service argues it has the legal right to impose the costs because the law is silent on the matter. The government points to language authorizing it to take steps that are “necessary and appropriate” to manage fisheries. Put another way, unless Congress explicitly prohibits an action, an agency can proceed. The case has moved up the appellate chain, and in August a divided panel of the D.C. Circuit Court of Appeals ruled for the government. Judges Sri Srinivasan and Judith Rogers cited the Supreme Court’s Chevron deference standard, finding that the law “through its silence, leaves room for agency discretion.” But Judge Justin Walker noted in dissent that this is a principle with no limit. He mused whether the fisheries service could find it “necessary” for fishermen to drive their federal monitors back and forth to the office to save on government gas bills, or whether it could demand fishermen finance other bureaucratic costs. The decision removes one of the few practical constraints on regulatory excess: a lack of resources. If government can write rules and require their targets to pay the costs without explicit Congressional approval, there will be no limit on bureaucratic discretion. The herring fishermen are asking the Supreme Court to hear the case and are represented by former Solicitor General Paul Clement. In West Virginia v. EPA this year, the Court used its major questions doctrine to rein in egregious regulatory overreach. But it has left Chevron as a largely unchecked license for regulators who can still do great harm without Congressional assent. Silence shouldn’t be a bureaucratic license to wreck livelihoods.']
tfidf_test_1 = tfidf_vectorizer.transform(x_test_1)
actual_y = pac.predict(tfidf_test_1)
score_1 = accuracy_score(expected_y, actual_y)

if expected_y == actual_y :
    print("Results:", actual_y[0])

elif expected_y == ['FAKE'] and actual_y == ['REAL'] :
    print("Results are false positive: news is considered real when it isn't")

elif expected_y == ['REAL'] and actual_y == ['FAKE'] :
    print("Results are false negative: news is considered fake when it isn't")

print(f'Accuracy of the results: {round(score_1 * 100, 2)}%')

# Apply sentiment analysis
sentiment_analysis(x_test_1)


Results: REAL
Accuracy of the results: 100.0%
Sentiment analysis results: 
compound: -0.6524, neg: 0.075, neu: 0.866, pos: 0.059, 
Article was rated as 7.5 % Negative
Article was rated as 86.6 % Neutral
Article was rated as 5.8999999999999995 % Positive
Overall rating of the article: Negative


In [43]:
# Confusion matrix of the random article from the Wall Street Journal
confusion_matrix(expected_y, actual_y, labels = ['REAL', 'FAKE'])

array([[1, 0],
       [0, 0]])

In [44]:
# Test a random article from CNN
expected_y_2 = ['REAL']
x_test_2 = ["US stocks exploded higher Wednesday. The Dow is now more than 20% above its 52-week low, which puts it in a new bull market. Federal Reserve chair Jerome Powell strongly suggested that the central bank is ready to slow its pace of interest rate hikes. Powell noted that the Fed is still concerned about inflation but that it also does not want to jeopardize the health of the labor market and broader economy either. Wednesday's rally helped push the markets to their second straight month of solid returns. The three major indexes wrapped up November with gains between 4% and 6%. The Dow soared more than 735 points, or 2.2% The S&P 500 rose 3.1%. The Nasdaq Composite shot up 4.4%. As stocks settle after the trading day, levels might still change slightly."]
tfidf_test_2 = tfidf_vectorizer.transform(x_test_2)
actual_y_2 = pac.predict(tfidf_test_2)
score_2 = accuracy_score(expected_y_2, actual_y_2)

if expected_y_2 == actual_y_2 :
    print("Results:", actual_y_2[0])

elif expected_y_2 == ['FAKE'] and actual_y_2 == ['REAL'] :
    print("Results are false positive: news is considered real when it isn't")

elif expected_y_2 == ['REAL'] and actual_y_2 == ['FAKE'] :
    print("Results are false negative: news is considered fake when it isn't")

print(f'Accuracy of the results: {round(score_2 * 100, 2)}%')

# Apply sentiment analysis
sentiment_analysis(x_test_2)


Results are false negative: news is considered fake when it isn't
Accuracy of the results: 0.0%
Sentiment analysis results: 
compound: 0.8302, neg: 0.021, neu: 0.889, pos: 0.091, 
Article was rated as 2.1 % Negative
Article was rated as 88.9 % Neutral
Article was rated as 9.1 % Positive
Overall rating of the article: Positive


In [45]:
# We have a false negative. Confusion matrix of the random article 
confusion_matrix(expected_y_2, actual_y_2, labels = ['REAL', 'FAKE'])

array([[0, 1],
       [0, 0]])

In [46]:
# Test a random fake news article from the onion
expected_y_3 = ['FAKE']
x_test_3 = ['NEW YORK—Emphasizing that the researchers were by no means happy about these results, an unfortunate study published by Columbia University this week found that abusing restaurant waitstaff is the secret to living a longer, happier life. “After tracking thousands of individuals over a period of two decades, we can say with regrettable certainty that treating food service workers like subhuman garbage is the single most important factor in enhancing the length and quality of a human lifespan,” said the study’s lead author Dr. Elizabeth Mitran, who reluctantly explained that just 30 seconds of screaming at a waiter for slow entrées carries greater benefits than an hour of vigorous exercise.“We can’t in good conscience endorse this behavior, but every waiter you make cry adds another year to your life.”']
tfidf_test_3 = tfidf_vectorizer.transform(x_test_3)
actual_y_3 = pac.predict(tfidf_test_3)
score_3 = accuracy_score(expected_y_3, actual_y_3)

if expected_y_3 == actual_y_3 :
    print("Results:", actual_y_3[0])

elif expected_y_3 == ['FAKE'] and actual_y_3 == ['REAL'] :
    print("Results are false positive: news is considered real when it isn't")

elif expected_y_3 == ['REAL'] and actual_y_3 == ['FAKE'] :
    print("Results are false negative: news is considered fake when it isn't")

print(f'Accuracy of the results: {round(score_3 * 100, 2)}%')

# Apply sentiment analysis
sentiment_analysis(x_test_3)


Results: FAKE
Accuracy of the results: 100.0%
Sentiment analysis results: 
compound: -0.1036, neg: 0.107, neu: 0.775, pos: 0.118, 
Article was rated as 10.7 % Negative
Article was rated as 77.5 % Neutral
Article was rated as 11.799999999999999 % Positive
Overall rating of the article: Negative


In [47]:
# Confusion matrix of the fake news article
confusion_matrix(expected_y_3, actual_y_3, labels = ['REAL', 'FAKE'])

array([[0, 0],
       [0, 1]])