<a href="https://colab.research.google.com/github/prayanshgupta129/Fake-News-Prediction/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

from google.colab import drive
drive.mount('/content/drive')

!pip install scikit-learn pandas nltk gradio seaborn matplotlib

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import gradio as gr
import seaborn as sns
import matplotlib.pyplot as plt


nltk.download('stopwords')
nltk.download('punkt')

data = {
    'text': [
        "Breaking News: Scientists discover cure for all diseases!",
        "Local council approves new park development project.",
        "Shocking! Aliens landed in New York last night.",
        "Company X announces record profits for the quarter.",
        "Urgent: President makes astonishing statement on alien invasion.",
        "Weather forecast predicts heavy rainfall this weekend.",
        "Unbelievable: Global warming is a hoax, secret document reveals.",
        "New study shows benefits of daily exercise.",
        "You won't believe what this celebrity said about the moon landing!",
        "Government releases new economic policy details.",
        "Fact check: Global average temperature continues to rise according to latest climate reports.",
        "Exclusive: Bigfoot spotted hiking in the Himalayas, new photos emerge.",
        "Local elections results show high voter turnout.",
        "Warning: Drinking water causes cancer, leaked government memo confirms.",
        "Research finds meditation reduces stress levels significantly."
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

print("--- Sample Dataset ---")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nLabel Distribution:")
print(df['label'].value_counts())


stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

print("\n--- Processed Text Samples ---")
print(df[['text', 'processed_text']].head())


X = df['processed_text']
y = df['label']

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

print(f"\nTF-IDF Matrix Shape: {X_tfidf.shape}")


if len(df['label'].unique()) < 2:
    print("Warning: Dataset has less than 2 unique classes. Cannot perform stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)


print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("\n--- Model Training Complete ---")

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
target_names = ['Real', 'Fake']
try:
    report = classification_report(y_test, y_pred, target_names=target_names)
except ValueError:
    report = "Classification report could not be generated due to missing labels in test set."

print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")
print("\nClassification Report:\n", report)

def predict_fake_news(news_text):
    if not news_text:
        return "Please enter some text.", "N/A", "N/A"

    processed_input = preprocess_text(news_text)
    input_tfidf = tfidf_vectorizer.transform([processed_input])

    prediction = model.predict(input_tfidf)[0]
    prediction_proba = model.predict_proba(input_tfidf)[0]

    result_text = "Fake News" if prediction == 1 else "Real News"
    confidence_real = prediction_proba[0] * 100
    confidence_fake = prediction_proba[1] * 100

    if prediction == 1:
        color = "red"
        status_message = "🔴 Likely FAKE NEWS!"
    else:
        color = "green"
        status_message = "🟢 Likely REAL NEWS."

    return (
        f"<h2 style='color:{color};'>{status_message}</h2>"
        f"<p>Confidence (Real News): <strong>{confidence_real:.2f}%</strong></p>"
        f"<p>Confidence (Fake News): <strong>{confidence_fake:.2f}%</strong></p>"
    )

print("\n--- Launching Gradio UI ---")

iface = gr.Interface(
    fn=predict_fake_news,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter news text here...",
        label="News Article Text"
    ),
    outputs=gr.HTML(label="Prediction Result"),
    title="Fake News Prediction Model",
    description="Enter a news article or statement to predict if it's real or fake.",
    examples=[
        ["Scientists confirm Earth is flat."],
        ["Local library announces new summer reading program."],
        ["Secret alien base discovered on the Moon."],
        ["Company reports strong quarterly earnings."],
        ["Drinking water cures all diseases instantly."]
    ]
)


iface.launch(share=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


--- Sample Dataset ---
                                                text  label
0  Breaking News: Scientists discover cure for al...      1
1  Local council approves new park development pr...      0
2    Shocking! Aliens landed in New York last night.      1
3  Company X announces record profits for the qua...      0
4  Urgent: President makes astonishing statement ...      1

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15 non-null     object
 1   label   15 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 372.0+ bytes
None

Label Distribution:
label
0    8
1    7
Name: count, dtype: int64

--- Processed Text Samples ---
                                                text  \
0  Breaking News: Scientists discover cure for al...   
1  Local council approves new park development pr...   
2    Shocking! Aliens landed 

