In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split ,RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB


In [8]:
df_true=pd.read_csv("true.csv")
df_fake=pd.read_csv("fake.csv")


In [9]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [10]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [11]:
df_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [12]:
df_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [13]:
# data preprocessing

In [14]:
df_fake['news']=0
df_true['news']=1

In [15]:
df_true.head()

Unnamed: 0,title,text,subject,date,news
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [16]:
df_merged=pd.concat([df_true,df_fake],axis=0)

In [17]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   news     44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [18]:
df_merged.columns

Index(['title', 'text', 'subject', 'date', 'news'], dtype='object')

In [19]:
df = df_merged.drop(["title", "subject","date"], axis = 1)

In [20]:
df.sample(frac=1)

Unnamed: 0,text,news
1859,WASHINGTON (Reuters) - Senator Elizabeth Warre...,1
17482,The Stock Market is setting record after recor...,0
15528,"PALERMO, Italy (Reuters) - Former Italian Prim...",1
20412,"Just remember, the woman who believes Hillary ...",0
8644,Ted Nugent woke up on the wrong side of the be...,0
...,...,...
11573,President Trump just pulled the rug out from u...,0
7397,A street performer based in New York City is g...,0
14094,Is anyone else getting sick and tired of heari...,0
10881,President Trump told the graduates that they a...,0


In [21]:
df.reset_index(inplace=True)

In [22]:
df.columns

Index(['index', 'text', 'news'], dtype='object')

In [23]:
df.drop(["index"], axis = 1, inplace = True)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   news    44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


In [25]:
import re
import string

In [26]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [27]:
df["text"] = df["text"].apply(clean_text)

In [28]:
x = df["text"]
y = df["news"]

In [29]:
tfidf_vector = TfidfVectorizer(stop_words='english', min_df=0.05)
x_tfidf_vector = tfidf_vector.fit_transform(x)

x = pd.DataFrame(x_tfidf_vector.toarray())

In [30]:
# Model building

In [31]:
x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,random_state=1)


In [32]:
#Gaussian Naive Bayes (GNB)

In [33]:
gnb_model = GaussianNB()
gnb_model.fit(x_train,y_train)

In [34]:
y_pred = gnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_test,y_pred) 
print('\nAccuracy score is ":---', acc)

clf_report = classification_report(y_test,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[3993  685]
 [ 198 4104]]

Accuracy score is ":--- 0.9016703786191537

Classification report is
               precision    recall  f1-score   support

           0       0.95      0.85      0.90      4678
           1       0.86      0.95      0.90      4302

    accuracy                           0.90      8980
   macro avg       0.90      0.90      0.90      8980
weighted avg       0.91      0.90      0.90      8980



In [35]:
y_pred = gnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_train,y_pred) 
print('\nAccuracy score is ":---', acc)

clf_report = classification_report(y_train,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[16224  2579]
 [  838 16277]]

Accuracy score is ":--- 0.9048666406815524

Classification report is
               precision    recall  f1-score   support

           0       0.95      0.86      0.90     18803
           1       0.86      0.95      0.91     17115

    accuracy                           0.90     35918
   macro avg       0.91      0.91      0.90     35918
weighted avg       0.91      0.90      0.90     35918



In [36]:
#  Multinomial Naive Bayes (MNB)

In [37]:
mnb_model= MultinomialNB()
mnb_model.fit(x_train,y_train)

In [38]:
y_pred = mnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_test,y_pred) 
print('\nAccuracy score is', acc)

clf_report = classification_report(y_test,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[4286  392]
 [ 251 4051]]

Accuracy score is 0.9283964365256124

Classification report is
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      4678
           1       0.91      0.94      0.93      4302

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [39]:
y_pred = mnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_train,y_pred) 
print('\nAccuracy score is', acc)

clf_report = classification_report(y_train,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[17299  1504]
 [ 1131 15984]]

Accuracy score is 0.9266384542569185

Classification report is
               precision    recall  f1-score   support

           0       0.94      0.92      0.93     18803
           1       0.91      0.93      0.92     17115

    accuracy                           0.93     35918
   macro avg       0.93      0.93      0.93     35918
weighted avg       0.93      0.93      0.93     35918



In [40]:
#Bernoulli Naive Bayes (BNB)

In [41]:
bnb_model= BernoulliNB()
bnb_model.fit(x_train,y_train)

In [42]:
y_pred = bnb_model.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_test,y_pred) 
print('\nAccuracy score is', acc)

clf_report = classification_report(y_test,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[4505  173]
 [  99 4203]]

Accuracy score is 0.9697104677060133

Classification report is
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      4678
           1       0.96      0.98      0.97      4302

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [43]:
y_pred = bnb_model.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_train,y_pred) 
print('\nAccuracy score is ":---', acc)

clf_report = classification_report(y_train,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[18140   663]
 [  411 16704]]

Accuracy score is ":--- 0.9700985578261596

Classification report is
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     18803
           1       0.96      0.98      0.97     17115

    accuracy                           0.97     35918
   macro avg       0.97      0.97      0.97     35918
weighted avg       0.97      0.97      0.97     35918



In [44]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(bnb_model, x_train, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracies:", scores)
print("Mean accuracy:", scores.mean())


Cross-validation accuracies: [0.96826281 0.97341314 0.9689588  0.96937213 0.96895448]
Mean accuracy: 0.9697922696716169


In [45]:
#  LogisticRegression

In [46]:
LR = LogisticRegression()
LR.fit(x_train,y_train)

In [47]:
y_pred = LR.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_test,y_pred) 
print('\nAccuracy score is', acc)

clf_report = classification_report(y_test,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[4590   88]
 [  60 4242]]

Accuracy score is 0.9835189309576837

Classification report is
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      4678
           1       0.98      0.99      0.98      4302

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [48]:
y_pred = LR.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_train,y_pred) 
print('\nAccuracy score is ":---', acc)

clf_report = classification_report(y_train,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[18528   275]
 [  223 16892]]

Accuracy score is ":--- 0.9861350854724651

Classification report is
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     18803
           1       0.98      0.99      0.99     17115

    accuracy                           0.99     35918
   macro avg       0.99      0.99      0.99     35918
weighted avg       0.99      0.99      0.99     35918



In [49]:
#Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(random_state=50)
RFC.fit(x_train, y_train)


In [51]:
y_pred = RFC.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_test,y_pred) 
print('\nAccuracy score is', acc)

clf_report = classification_report(y_test,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[4661   17]
 [   9 4293]]

Accuracy score is 0.9971046770601336

Classification report is
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4678
           1       1.00      1.00      1.00      4302

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [52]:
y_pred = RFC.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print('Confusion Matrix\n', cnf_matrix)

acc = accuracy_score(y_train,y_pred) 
print('\nAccuracy score is ":---', acc)

clf_report = classification_report(y_train,y_pred)
print('\nClassification report is\n', clf_report)

Confusion Matrix
 [[18803     0]
 [    0 17115]]

Accuracy score is ":--- 1.0

Classification report is
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18803
           1       1.00      1.00      1.00     17115

    accuracy                           1.00     35918
   macro avg       1.00      1.00      1.00     35918
weighted avg       1.00      1.00      1.00     35918



In [53]:
import joblib

In [54]:
joblib.dump(bnb_model, 'model_file.pkl')

['model_file.pkl']

In [55]:
joblib.dump(tfidf_vector, 'tfidf_file.pkl')

['tfidf_file.pkl']

In [56]:
def predict_news(news_text):
    new1=clean_text(news_text)
    text_vector = tfidf_vector.transform([new1])
    prediction = bnb_model.predict(text_vector)[0]
    if prediction == 1:
        return "✅ Real News"
    else:
        return "❌ Fake News"


In [57]:
news = '''Patna: The deadlock over seat sharing among opposition INDIA bloc partners continued even as the last date for filing nomination papers for first phase election to be held on Nov 6 ended on Friday.
RJD spokesperson Chitranjan Gagan said a total of 71 party candidates filed nominations for the first phase, including the party’s CM face Tejashwi Prasad Yadav; national general secretary Bhola Yadav; Mohammad Shahabuddin’s son Osama Shahab; ex-speaker Awadh Bihari Chaudhari; Alok Mehta, Lalit Yadav and Israel Mansuri—all former ministers, and senior leader Bhai Birendra.'''
print(predict_news(news))


✅ Real News


In [58]:
news='''Reporting on the first assassination attempt against Donald Trump, George Stephanopoulos and Martha Raddatz of ABC News claimed that Trump “contributed” to “violent rhetoric” because he said “it’s going to be a bloodbath” if “I don’t get elected.”

In fact, Trump didn’t use the term “bloodbath” as a call to violence but to describe the effects of Biden’s policies, especially on the auto industry.'''
print(predict_news(news))

❌ Fake News


In [59]:
import gradio as gr

model = joblib.load("model_file.pkl")              
vectorizer = joblib.load("tfidf_file.pkl")  

def predict_news(news_text):
    cleaned_text = clean_text(news_text)
    vector = vectorizer.transform([cleaned_text])
    prob = model.predict_proba(vector)[0]
    pred_label = model.predict(vector)[0]
    confidence = max(prob) * 100
    

    if pred_label == 1:
        return f"<div style='color:white; background-color:#4CAF50; padding:10px; border-radius:5px;'> Real News ({confidence:.2f}% confidence)</div>"
    else:
        return f"<div style='color:white; background-color:#F44336; padding:10px; border-radius:5px;'> Fake News ({confidence:.2f}% confidence)</div>"

with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align:center; color:#333;'>Fake News Detector</h1>")
    news_input = gr.Textbox(label="Paste news text here", placeholder="Enter headline or paragraph...", lines=5)
    predict_btn = gr.Button("Predict News", variant="primary")
    result_output = gr.HTML(label="Prediction Result")  

    predict_btn.click(fn=predict_news, inputs=news_input, outputs=result_output)

demo.launch()



* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


