To Do:
- Apply sentiment analysis
- news article -> model -> { fakeNews: false, sentiment: 0.23 }

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

import nltk
nltk.download('subjectivity')
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/samuelseokyukim/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/samuelseokyukim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read the data
df = pd.read_csv('news.csv')

# Get shape and head
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Get the labels
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [4]:
# Split the dataset
x_train,x_test,y_train,y_test = train_test_split(df['text'], labels, test_size = 0.2, random_state = 7)

In [5]:
# Loop through test set and clean each article

cleaned_articles = []

for document in x_test :
    # Remove all the special characters
    noSpecial = re.sub(r'\W', ' ', document)

    # Remove all single characters
    #noSingle = re.sub(r'\s+[a-zA-Z]\s+', ' ', noSpecial)

    # Remove single characters from the start
    #noSingleFromStart = re.sub(r'\^[a-zA-Z]\s+', ' ', noSingle) 

    # Substituting multiple spaces with single space
    noMultiSpace = re.sub(r'\s+', ' ', noSpecial, flags = re.I)

    # Removing prefixed 'b'
    noPrefixed = re.sub(r'^b\s+', '', noMultiSpace)

    # Convert to Lowercase
    lowercased = noPrefixed.lower()
    
    cleaned_articles.append(lowercased)



In [6]:
# Another method of applying sentiment analysis (unfinished)

"""
n_instances = 100

# Gather a list of subjective words and a list of objective words
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories = 'subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories = 'obj')[:n_instances]]
len(subj_docs), len(obj_docs)

# Split subjective and objective instances 
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

# Initialize sentiment analyzer
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

# Unigram word features handle negation
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq = 4)
print(len(unigram_feats))
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)

# Apply features to obtain a feature-value representation
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

# Train the classifier on the training_set
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

# Output evaluation results
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
"""

"\nn_instances = 100\n\n# Gather a list of subjective words and a list of objective words\nsubj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories = 'subj')[:n_instances]]\nobj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories = 'obj')[:n_instances]]\nlen(subj_docs), len(obj_docs)\n\n# Split subjective and objective instances \ntrain_subj_docs = subj_docs[:80]\ntest_subj_docs = subj_docs[80:100]\ntrain_obj_docs = obj_docs[:80]\ntest_obj_docs = obj_docs[80:100]\ntraining_docs = train_subj_docs + train_obj_docs\ntesting_docs = test_subj_docs + test_obj_docs\n\n# Initialize sentiment analyzer\nsentim_analyzer = SentimentAnalyzer()\nall_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n\n# Unigram word features handle negation\nunigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq = 4)\nprint(len(unigram_feats))\nsentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)\n\n# Apply

In [7]:
# Accepts a list of strings and pandas dataframe
# Loop through each element in list or dataframe 
# Applies sentiment analysis
def sentiment_analysis(articles_list) :
    
    for article in articles_list :
        
        # Initialize sentiment intensity analyzer
        sentiment_analyzer = SentimentIntensityAnalyzer()
        
        # Print(article)
        results = sentiment_analyzer.polarity_scores(article)
        print('Sentiment analysis results: ')
        
        for k in sorted(results) :
            print('{0}: {1}, '.format(k, results[k]), end = '')
        print()
        
        print("Article was rated as", results['neg'] * 100, "% Negative")
        print("Article was rated as", results['neu'] * 100, "% Neutral")
        print("Article was rated as", results['pos'] * 100, "% Positive")
        print("Overall rating of the article:", end = ' ')
        
        # Determine if an article is overall positive, negative, or neutral
        if results['compound'] >= 0.05 :
            print("Positive")
 
        elif results['compound'] <= - 0.05 :
            print("Negative")
 
        else :
            print("Neutral")


In [8]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(cleaned_articles)


In [9]:
# Initialize PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy of test: {round(score * 100, 2)}%')

Accuracy of test: 92.98%


In [10]:
# Build confusion matrix: [[true positive, false negative], [false positive, true negative]]
confusion_matrix(y_test, y_pred, labels = ['REAL', 'FAKE'])

array([[587,  42],
       [ 47, 591]])

In [11]:
# Test a random article from CNN
expected_y = ['FAKE']
x_test_1 = ['SEYMOUR, IN—Calling into question the 9-year-old amateur geologist’s taste and expertise, sources confirmed Thursday that local child Jacob Hiller could afford to be more discerning about which rocks were worth collecting. “I asked him what kind of rock this one was and he said ‘shiny’—shiny isn’t a rock type, moron,” said one source, who noted that the majority of the rocks in the boy’s collection came either from his backyard or the drainage ditch along the side of the road, neither of which seemed like particularly impressive dig sites. “Oh, this one is bigger than that one? Is that your hypothesis? Does he not realize this one is literally a chunk of concrete? Even the few cool-looking rocks he does own are fakes; they’re tumbled and dyed. He might know rocks can’t naturally be that shiny and magenta if he’d bothered to read one goddamn article on geology in his entire life.” At press time, the child had reportedly added an invaluable dinosaur fossil to the collection in his shirt.']
tfidf_test_1 = tfidf_vectorizer.transform(x_test_1)
actual_y = pac.predict(tfidf_test_1)
score_1 = accuracy_score(expected_y, actual_y)

if expected_y == actual_y :
    print("Results:", actual_y[0])

elif expected_y == ['FAKE'] and actual_y == ['REAL'] :
    print("Results are false positive: news is considered real when it isn't")

elif expected_y == ['REAL'] and actual_y == ['FAKE'] :
    print("Results are false negative: news is considered fake when it isn't")

print(f'Accuracy of the results: {round(score_1 * 100, 2)}%')

# Apply sentiment analysis
sentiment_analysis(x_test_1)


Results: FAKE
Accuracy of the results: 100.0%
Sentiment analysis results: 


NameError: name 'ss' is not defined

In [None]:
# Confusion matrix of the random article 
confusion_matrix(expected_y, actual_y, labels = ['REAL', 'FAKE'])

In [None]:
type(actual_y)