# Imports

In [1]:
#%pip install newspaper3k
import pandas as pd
import numpy as np
import itertools
import urllib
import newspaper
import random
import requests 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Data

In [2]:
# Load the news dataset; 'news.csv' is a relative path, so it is resolved
# against the kernel's working directory
df = pd.read_csv('news.csv')

# Check the shape (rows, columns)
df.shape

(6335, 4)

In [3]:
# Preview the first 10 rows by position
df.iloc[:10]

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [4]:
# Keep the target column (FAKE / REAL) in its own variable
labels = df['label']

In [5]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=10,
)

# Model

In [6]:
# TF-IDF features: drop English stop words and terms that occur in more than
# 70% of the documents
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary / IDF weights on the training texts and vectorize them
tfidf_train = vectorizer.fit_transform(x_train)
# Vectorize the test texts with the already fitted vectorizer (no refitting)
tfidf_test = vectorizer.transform(x_test)

In [7]:
# Init PassiveAggressiveClassifier.
# The classifier shuffles the training data between epochs, so without a fixed
# random_state every Restart & Run All can give a slightly different model and
# accuracy. Seed it (same seed as the train/test split) to make the run
# reproducible. The accuracy recorded in the saved output came from an
# unseeded run and may differ slightly.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=10)
# Fit PAC on the TF-IDF features of the training set
pac.fit(tfidf_train, y_train)
# Predict testset
y_pred = pac.predict(tfidf_test)
# Fraction of test articles whose predicted label matches the true label
score = accuracy_score(y_test, y_pred)
# Print score as a percentage with two decimals
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.87%


# Visualize

In [8]:
# Confusion matrix of the test predictions, with classes ordered FAKE then REAL
c_mat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])

In [9]:
# Unpack the 2x2 matrix: each row of c_mat is a true class (FAKE, REAL) and
# each column is a predicted class (FAKE, REAL)
(fake_as_fake, fake_as_real), (real_as_fake, real_as_real) = c_mat

# Build the table column by column; rows are the true labels, columns the predictions
data = {
    'Fake': [fake_as_fake, real_as_fake],
    'Real': [fake_as_real, real_as_real],
}
result = pd.DataFrame(data, index=['Fake', 'Real'])
print(result)

      Fake  Real
Fake   576    27
Real    38   626


# Test with article

In [10]:
# For fun, take a random article from the internet and check whether it is
# classified as real or fake
# News site to collect articles from
vendor = 'https://cnn.com/'
paper = newspaper.build(vendor)

# URLs that look like actual articles (not category pages etc.)
article_urls = []

for article in paper.articles:
    link = article.url
    # Print every URL that was discovered
    print(link)
    # On CNN all articles end with 'index.html', therefore we skip URLs that do not contain it
    if 'index.html' not in link:
        continue
    article_urls.append(link)

http://arabic.cnn.com/health/video/2020/04/03/v87198-nurses-new-york-coronavirus-plea
http://cnn.com/2020/04/03/uk/thomas-harvey-uk-coronavirus-intl-gbr/index.html
http://cnn.com/travel/article/airplane-cruise-hygiene-future/index.html
http://cnn.com/travel/article/us-passport-emergencies/index.html
http://cnn.com/travel/article/doomsday-dinner-party-ardyn/index.html
http://cnn.com/travel/article/eleven-madison-park-reopens-for-first-responders-trnd/index.html


In [11]:
# Pick a random position in the article list.
# random.randint(a, b) includes b, so randint(0, len(article_urls)) can return
# len(article_urls) and make the lookup below raise IndexError.
# random.randrange(n) draws from 0 .. n-1, which are exactly the valid indices.
if not article_urls:
    # Fail with a clear message instead of an opaque error from the random call
    raise ValueError('No article URLs were collected; cannot pick a random article')
art_num = random.randrange(len(article_urls))
# Get article url
art_url = article_urls[art_num]

# Check the URL of our article
print(art_url)

http://cnn.com/travel/article/doomsday-dinner-party-ardyn/index.html


In [12]:
# Create an Article object for the randomly chosen URL
article = newspaper.Article(art_url)
# Download article
article.download()
# Parse the downloaded article; article.text is read right after this step
article.parse()
# Wrap the article text in a one-element list, because it is later passed to
# vectorizer.transform(), which is also called with collections of texts above
text = [article.text]

In [13]:
# Vectorize the article with the vectorizer that was fitted on the training
# set (transform only, no refitting)
article_test = vectorizer.transform(text)

In [14]:
# Predict if it is fake or real news, using the classifier trained above
article_pred = pac.predict(article_test)

In [15]:
# Print our prediction (an array holding one label for the single article)
print(article_pred)

['FAKE']
