# Detecting Fake News with Python

In [49]:
from IPython.display import Image
Image('C:/Users/profi/Documents/Self Learning/ML/Detecting Fake News with Python/Detecting Fake News (Model).jpg')

<IPython.core.display.Image object>

In [79]:
# Source tutorial from https://data-flair.training/blogs/advanced-python-project-detecting-fake-news

import numpy as np
import pandas as pd
import itertools # creating iterators for efficient looping

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [51]:
news_DF = pd.read_csv('C:/Users/profi/Documents/Self Learning/ML/Detecting Fake News with Python/fake_or_real_news.csv', usecols=['Unnamed: 0','title', 'text','label'])
news_DF.shape # return DataFrame dimensionality

(6335, 4)

In [52]:
news_DF.head() # return first 5 rows

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [53]:
labels=news_DF.label # obtaining label column only
# OR labels=news_DF['label']
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [54]:
x_train, x_test, y_train, y_test = train_test_split(news_DF.text, labels, test_size=0.2, random_state=7)
# altering random_state can improve/worsen accuracy, therefore either:
# 1. commit to a fixed random state for everything
# 2. report the results as a range, and produce a confidence interval
# 3. stratify data to reduce how much the random seed affects the results
# If accuracy results vary wildly with random seed, then the model is not robust;
# improve method to better fit data.

TF: Term Frequency, which measures how frequently a term occurs in a document.

IDF: Inverse Document Frequency, which measures how important a term is. Thus we need to weigh down the frequent terms while scale up the rare ones.

EXAMPLE
Consider a document containing 100 words where the word cat appears 3 times. The term frequency (TF) for cat is then (3/100) = 0.03. Now, assume we have 10 million documents and the word cat appears in one thousand of these. Then, the inverse document frequency (IDF) is calculated as log(10,000,000/1,000) = 4. Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.

In [103]:
# Convert a collection of raw documents to a matrix of TF-IDF features.
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

# Show TF-DF matrix
tfidf_train.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.04811322, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [81]:
# Get matrix feature names
tfidf_vectorizer.get_feature_names()

['00',
 '000',
 '0000',
 '000000031',
 '00000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '000billion',
 '000ft',
 '000km',
 '000x',
 '001',
 '0011',
 '003',
 '004',
 '00684',
 '006s',
 '007',
 '007s',
 '008',
 '008s',
 '009',
 '0099',
 '00am',
 '00p',
 '00pm',
 '01',
 '010',
 '011',
 '013',
 '013c2812c9',
 '015',
 '016',
 '018',
 '01am',
 '02',
 '022',
 '027',
 '02714',
 '028',
 '02870',
 '02welcome',
 '03',
 '030',
 '032',
 '0325',
 '033',
 '034',
 '035',
 '037',
 '03747',
 '039',
 '03eb',
 '04',
 '040',
 '0400',
 '042',
 '047',
 '048',
 '049',
 '05',
 '050',
 '0509245d29',
 '053',
 '056',
 '058',
 '06',
 '0600',
 '062',
 '0640',
 '066',
 '068',
 '07',
 '0700',
 '071',
 '075',
 '0750',
 '076',
 '07dryempjx',
 '08',
 '080',
 '081',
 '082',
 '084',
 '0843',
 '085',
 '0851',
 '089',
 '0891',
 '09',
 '091',
 '093',
 '098263',
 '09am',
 '09pm',
 '0_65b67362bd',
 '0_jgdktlmn',
 '0_kvyhphja',
 '0a_merrill',
 '0b6njlny5j',
 '0d',
 '0fjjvowyhg8qtskiz',
 '0h4at2yetra17uxetni02ls2jeg0mty45jr

In [126]:
# Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

# Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test_a)
y_pred
score=accuracy_score(y_test_a,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 0.0%


In [60]:
#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

# nice confusion matrix visualisation @ https://www.datacamp.com/community/tutorials/scikit-learn-fake-news

array([[590,  48],
       [ 41, 588]], dtype=int64)

In [76]:
for i in range(0,10):
    x_train, x_test, y_train, y_test = train_test_split(news_DF.text, labels, test_size=0.2, random_state=i)
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

    tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
    tfidf_test=tfidf_vectorizer.transform(x_test)
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)

    # Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)
    score=accuracy_score(y_test,y_pred)
    print(f'Accuracy: {round(score*100,2)}%')
    
# random_state=6 yields the highest accuracy of 94.55%

Accuracy: 93.21%
Accuracy: 93.92%
Accuracy: 93.84%
Accuracy: 93.45%
Accuracy: 93.37%
Accuracy: 94.55%
Accuracy: 94.08%
Accuracy: 92.82%
Accuracy: 93.69%
Accuracy: 93.37%


In [141]:
x_test_a=pd.Series('Declarations of false victory and a vacuum of federal leadership have undermined testing as experts warn reopening the US could result in disaster\
A broad coalition of US health systems has mobilized to ramp up coronavirus testing in a national effort on a scale not seen since the second world war. But declarations of false victory by the Trump administration and a vacuum of federal leadership have undermined the endeavor, leading experts to warn that reopening the US could result in a disaster.\
The missing six weeks: how Trump failed the biggest test of his life\
Interviews with agents on the frontlines of the coronavirus battle – lab directors, chemists, manufacturers, epidemiologists, academics and technologists – reveal as diverse an application of the legendary American ingenuity as the century has seen.\
Test kit manufacturers are running production lines around the clock to triple their output, and triple it again. A private healthcare institute in California has constructed a mega-lab to process thousands of tests daily and deliver the results by text message alert. In smaller labs across the country, microbiologists improvise each day to fill unpredictable supply chain gaps that might leave them without swabs one day, and without crucial chemicals the next.\
“It’s incredible what we’ve done together over a short period of time,” Donald Trump said at a White House briefing this week, praising his administration’s response to the pandemic.\
But analysts say that without centralized governance and coordination, the national effort remains a competing coalition of state and local outfits hampered by duplicated work, competition for supplies, siloed pursuits of non-transferable solutions and red tape that leaves some labs with testing backlogs and others with excess capacity.\
All of which leaves the US without a unified, coherent strategy for testing and contact tracing to contain a virus that does not respect state borders and has already killed more than 60,000 Americans.\
Without it, the imminent experiment of reopening the country could be catastrophic, warned Harvard epidemiologist Michael Mina in a conference call with reporters this week.\
“My concern is that we’ll end up right where we have been, with major cities having healthcare systems that get overrun quickly because of major outbreaks,” Mina said.\
I’m afraid we’ll just end up repeating the past, Michael Mina, Harvard epidemiologist\
\
Meanwhile, as states begin to relax social distancing measures, the Trump administration is spreading dangerous misinformation, denying persistent supply shortages, underestimating the number of Covid-19 cases and exaggerating the margin of safety conferred by the current volume of testing and contact-tracing, experts say.\
“We’ve done more than 200,000 tests in a single day,” Mike Pence said at a taskforce briefing this week, in which Trump touted testing as “one of the great assets that we have” in reopening the US.\
But at current testing levels, with only rudimentary plans for contact tracing for new cases, the US will be flying virtually blind as it reopens, said Glen Weyl, a technologist who co-authored a report issued by Harvard’s Safra Center for Ethics that calls for 5m tests a day by early June.\
“No, definitely not, you can’t open up with that number,” Weyl said of Pence’s announcement. “It’s not even remotely in the right ballpark. It’s off by a factor of 10.”\
A new Marshall plan\
Testing is one of the biggest challenges the coronavirus crisis poses. And as Asian countries that have succeeded in temporarily containing the virus have shown, testing strategy is entwined with the need for contact tracing and isolating confirmed and suspected Covid-19 patients.\
There are multiple categories of tests with multiple different modes for sampling, storage and transport. A test might detect the virus itself, detect traces of the virus or detect the body’s reaction to having had the virus. The experience of being tested could be different in each case. One patient might have his or her sinuses probed by a swab at a drive-thru, while another might spit in a tube at home and another give a blood sample at a clinic.\
Each test has a different degree of reliability, with different amounts of time and labor required to complete the boomerang curve of sample collection to testing to result report.\
“We have too many [brands of] tests, and now there are a lot of people who are committed to their tests and they run their tests on their platforms,” said Paul Reider, a renowned research chemist in the pharmaceuticals industry who teaches at Princeton University.\
 We have too many [brands of] tests, and now there are a lot of people who are committed to their tests\
Paul Reider, Princeton University\
“If we had an effective administration – this is where the federal government comes in – they could essentially turn around and say, ‘What we would like to do is, we want one test, maybe two, that are fast, that are accurate, that are scalable and transferable, .\
“You want a gold-standard test.”\
In the US, regulatory and administrative hurdles are everywhere, with clinics unable to send samples to private labs that might be out of their usual networks, a lack of protocols for reporting testing data, slow regulatory approval for the use of alternative testing materials, insufficient federal funding to support lab efforts and no central leadership steering the country’s massive laboratory apparatus.\
“We don’t have a system that’s ever been built for surveillance, for wide-scale population surveillance or wide-scale testing for people who aren’t presenting to the hospital or the clinic,” said Mina. “The demand is just so much larger than our system was built for.”\
The Trump administration’s response to this complicated thicket has been to declare the federal government a “supplier of last resort” and wish the states luck. “It’s pretty simple,” Trump has said. “They have tremendous capacity. We hope to be able to help out.”\
In an attempt to meet the demand they have encountered, lab scientists have improvised constantly, substituting materials where possible or stacking testing platforms from different manufacturers – Roche, Qiagen, Abbott, Hologic, DiaSorin – so that if one goes down another can take its place.\
The result is that labs have delivered an unprecedented number of tests in record time – but with a fraction of the potential efficiency that could be achieved through better coordination, said Reider.\
“If Jared Kushner wanted to do something decent, and Vice-President Pence, they could try to standardize and distribute nationally a global test,” said Reider. “At least make it available and let people choose if they want to use it.”\
The Harvard report called for the establishment of a “Pandemic Testing Board” “akin to the War Production Board that the United States created in World War II”. The director of the Center for Infectious Disease Research and Policy (CIDRAP) at the University of Minnesota calls for a new Marshall plan to stand up testing in the US.\
Advertisement\
But no efforts to create such a central authority are apparent, said Michael Osterholm, CIDRAP director, who described a shortage of reagents, or chemicals used in testing, on his Osterholm Update podcast this week.\
“We have had a number of our testing laboratories unable to get the needed reagents they could’ve and should’ve had to increase testing,” Osterholm said. “We really need a Marshall plan where the federal government and the private sector get together and decide what are the challenges, what can we do to quickly boost these reagents, what can we do to actually increase the reagent pool?”\
‘Running towards a moving target’\
Demand signal or no, some big private sector players have already moved aggressively. Early on in the crisis, Color, a private healthcare institute that does genomic testing in California, resolved to stand up a mega-lab that is now on the verge of processing 10,000 tests a day, with a goal of expanding that capacity by an order of magnitude, said Othman Laraki, CEO.\
The company has since partnered with the city of San Francisco to provide Covid-19 testing for all private-sector and nonprofit essential employees, as well as any resident with symptoms who cannot find testing elsewhere. Next-day results are delivered via email and text-message alerts.\
“Our thinking was that you needed to have a few massively scaled labs as opposed to having a big sprinkling of small-scale labs,” Laraki said. “We believe that’s the way to build the type of capacity that’s needed really to bring the country back to work.”\
In Minnesota, academics at the state university partnered with scientists at the Mayo clinic, one of the country’s premier labs, to deliver on a challenge by governor Tim Walz to stop coronavirus in the state with comprehensive testing and contact tracing.\
 We really need a Marshall plan where the federal government and the private sector get together\
Michael Osterholm, CIDRAP director\
“We just made the decision that we’re probably going to be on our own and that we need to be ready to care for our patients,” said Tim Schacker, vice-dean for research at the University of Minnesota and an architect of the project.\
As a first step, the scientists invented a molecular test “that was mostly independent of the supply chain problems”, Schacker said.\
Broken supply chains\
Robin Patel, the president of the American Society for Microbiology, said supply chain issues continue to represent a daily challenge for laboratories, from swabs to chemicals to materials used to extract viral RNA and amplify DNA.\
“The situation has changed, yes, but it’s a different situation every day, so using the word ‘improved’ is I don’t think appropriate,” she said.\
“This isn’t just an American situation. People throughout the world are dealing with the same issues. The supply chain we’re talking about is not just an American supply chain, it’s a worldwide supply chain.”\
The reopening\
To celebrate America’s reopening, Trump appears to be preparing to hit the road, with plans to visit warehouses and factory sites to advertise the economic comeback he has promised. “We built the greatest economy the world has ever seen,” Trump said this week. “And we’re going to do it again. And it’s not going to be that long. OK?”\
Polling indicates that a majority of Americans does not share Trump’s optimism. About two in three Americans think restrictions on restaurants, stores and other businesses are appropriate, and 16% on top of that wanted tighter restrictions, a poll this week from the Washington Post and the University of Maryland found.\
Top epidemiologists believe it’s possible that the US could get some kind of reprieve from the virus in the warmer months ahead. If that happens, the summer could feature the scenes Trump has dreamed about, of packed churches, humming factories, crowded beaches and sold-out flights.\
But Trump’s dream that the virus will simply disappear is just that – a dream, epidemiologists say.\
“I hope that over the course of the next few weeks to two months, we’re going to actually see a substantial reduction in transmission,” Osterholm said. “And if it does, it shouldn’t be interpreted that we won, or that somehow we’re in control.\
“I hope that the case numbers continue to decrease over time, but I’m also very, very aware that they’re coming back, and we just have to remember that.”')

tfidf_test_a=tfidf_vectorizer.transform(x_test_a)

pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

y_pred=pac.predict(tfidf_test_a)
y_pred # prediction for above article i.e. REAL (we have already tested the model accuracy to be 90%+ using test set above)


array(['REAL'], dtype='<U4')

In [157]:
x_test_a=pd.Series('April may have been a glorious month for the ASX 200 but May is turning out to be anything but with the first trading day of the new month alone stripping a whisker above 5% off the share market.\
A sea of red greeted investors on the first day of the month as the real-world economic effects of the COVID-19 pandemic overcame the momentum of an impressive 25.4% recovery since the low of March 23 and an 8.8% rise in April alone – the best month for decades years.\
Rather than cementing April’s gains, investors took profits off the table and hit the exits, cutting 276 points or 5.01% off the ASX 200 to close at 5245.9 points.')

tfidf_test_a=tfidf_vectorizer.transform(x_test_a)

pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

y_pred=pac.predict(tfidf_test_a)
y_pred # prediction for above article i.e. REAL (we have already tested the model accuracy to be 90%+ using test set above)


array(['FAKE'], dtype='<U4')