import kagglehub

# Download latest version
path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")

print("Path to dataset files:", path)

In [2]:

import numpy as np
import pandas as pd

fake_news = pd.read_csv(r"Fake.csv")
true_news = pd.read_csv(r"True.csv")
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
def prepare_df(df: pd.DataFrame):
    df["full_text"] = df["subject"].astype(str) + " " + df["date"].astype(str) + " " +df["title"].astype(str) + ' ' +df['text'].astype(str)
    df =df.drop(["title", "text", "date", "subject"], axis = 1)
    return df

In [4]:
fake_news = prepare_df(fake_news)
true_news = prepare_df(true_news)

In [5]:
true_news.head()

Unnamed: 0,full_text
0,"politicsNews December 31, 2017 As U.S. budget..."
1,"politicsNews December 29, 2017 U.S. military ..."
2,"politicsNews December 31, 2017 Senior U.S. Re..."
3,"politicsNews December 30, 2017 FBI Russia pro..."
4,"politicsNews December 29, 2017 Trump wants Po..."


In [6]:
fake_news["label"] = False
true_news["label"] = True
fake_news.head()

Unnamed: 0,full_text,label
0,"News December 31, 2017 Donald Trump Sends Out...",False
1,"News December 31, 2017 Drunk Bragging Trump S...",False
2,"News December 30, 2017 Sheriff David Clarke B...",False
3,"News December 29, 2017 Trump Is So Obsessed H...",False
4,"News December 25, 2017 Pope Francis Just Call...",False


In [7]:
complete_set = pd.concat([fake_news, true_news], ignore_index=True)
complete_set = complete_set.sample(frac=1, random_state = 42).reset_index(drop = True)
complete_set.head()

Unnamed: 0,full_text,label
0,"US_News February 13, 2017 Ben Stein Calls Out ...",False
1,"politicsNews April 5, 2017 Trump drops Steve ...",True
2,"politicsNews September 27, 2017 Puerto Rico e...",True
3,"News May 22, 2017 OOPS: Trump Just Accidental...",False
4,"politicsNews June 24, 2016 Donald Trump heads...",True


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split

tfidf_pipeline = make_pipeline(
    TfidfVectorizer(max_features = 10000, ngram_range=(1,2)),
    SelectKBest(chi2, k = 2000),
    SVC(kernel = "rbf", random_state= 42)                
)

train_news, test_news, train_value, test_value = train_test_split(complete_set["full_text"], complete_set["label"], test_size = 2000, train_size = 15000, random_state = 42)


In [9]:


vectorizer = TfidfVectorizer(max_features = 10000, ngram_range = (1, 2))
vector_output = vectorizer.fit_transform(test_news)
selector = SelectKBest(chi2, k = 2000).fit(vector_output, test_value)
scores = selector.scores_
features = vectorizer.get_feature_names_out()

top_ids = np.argsort(scores)[::-1][:50]
for i in top_ids:
    print(features[i], scores[i])

reuters 25.83740840686404
said 21.04624320635149
video 19.48698337726775
you 17.786029457017857
politicsnews 16.017033767198818
said on 14.549340690364492
worldnews 14.289355362016693
hillary 13.712063824891274
china 11.828995330688855
minister 11.76331667998373
washington reuters 11.455034402532656
watch 10.262366259817846
via 9.966726468296184
korea 9.699187869406362
politics 9.64500113770373
com 9.29383970402585
news 9.249739066835598
north 9.002189526688401
image 8.609198125659205
pic 8.316739534993998
twitter com 8.3075598657539
reuters the 8.140381603556275
left news 8.0562785299106
this 8.047552290608714
pic twitter 7.968640816618622
north korea 7.952485374679835
featured image 7.781352288506863
black 7.718698042073729
featured 7.687101677870409
us 7.684982931850465
on wednesday 7.578777508780608
tax 7.3166613502218025
said the 7.274980142794549
just 7.273600174615739
image via 7.142434097500305
on tuesday 6.9385563670580925
prime minister 6.812410459498771
washington 6.77269341

In [10]:
tfidf_pipeline.fit(train_news, train_value)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('selectkbest', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,score_func,<function chi...001FD3572F6A0>
,k,2000

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [11]:
from sklearn.metrics import accuracy_score
predicted_values = tfidf_pipeline.predict(test_news)
accuracy_score(test_value, predicted_values)

0.9945

In [None]:
from newspaper import Article
import requests

def extract_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return f"{article.publish_date} {article.title} {article.text}"

In [39]:
article = extract_article("https://www.bbc.com/news/articles/c80dpd00r4eo")
tfidf_pipeline.predict(np.array([article,]))

array([False])