# In [None]:
# ✅ Import Necessary Libraries
import pandas as pd
import numpy as np
import spacy
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ✅ Configuration: dataset location and the columns used for modelling.
# Adjust these values if the CSV lives elsewhere or uses different headers.
file_path = "/content/IFND.csv"
text_column = 'Statement'
label_column = 'Label'

# ✅ Load the en_core_web_sm SpaCy pipeline
import spacy
nlp = spacy.load("en_core_web_sm")

# ✅ Read the dataset (decoded as latin-1)
df = pd.read_csv(
    file_path,
    encoding='latin-1',
)

# ✅ Show the available columns so the names configured above can be checked
print("Dataset Columns:", df.columns)

# ✅ Data Cleaning & Preprocessing with SpaCy
# Built once at import time rather than on every clean_text() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a piece of text and drop stop words.

    The text is lowercased, digits and ASCII punctuation are removed,
    whitespace runs are collapsed, and the result is tokenized with the
    module-level SpaCy pipeline ``nlp``. Stop-word and punctuation tokens
    are discarded.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    # Strip as well as collapse: removing a leading number or punctuation mark
    # leaves a leading space, which SpaCy would emit as a whitespace token and
    # which would end up as stray spaces in the joined output.
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # make_doc() runs only the tokenizer. is_stop / is_punct are lexical
    # attributes, so the tagger, parser and NER stages are not needed here.
    doc = nlp.make_doc(text)
    words = [token.text for token in doc if not (token.is_stop or token.is_punct)]

    return ' '.join(words)

# ✅ Apply text cleaning
# Missing statements are replaced with an empty string first: a bare
# astype(str) would turn NaN into the literal word "nan", which would then be
# treated as a real token by the vectorizer.
df[text_column] = df[text_column].fillna('').astype(str).apply(clean_text)

# ✅ Convert Labels to Binary Format (if necessary)
# predict_news() reports class 1 as "Fake News". Plain category codes are
# numbered in sorted order, so a label such as "Fake" would get code 0 and the
# real/true label code 1 -- the opposite of what predict_news() assumes.
# Encode explicitly so that the fake class is always 1.
if df[label_column].dtype == 'object':
    raw_labels = df[label_column]
    categories = list(raw_labels.astype('category').cat.categories)
    fake_like = [
        name for name in categories
        if str(name).strip().lower() in ('fake', 'false')
    ]
    if len(categories) == 2 and len(fake_like) == 1:
        # Binary fake/real labels: fake -> 1, the other class -> 0.
        label_encoding = {name: int(name == fake_like[0]) for name in categories}
    else:
        # Label names not recognized: fall back to sorted category codes.
        label_encoding = {name: code for code, name in enumerate(categories)}
    # Missing labels keep the -1 code that cat.codes would have assigned.
    df[label_column] = raw_labels.map(label_encoding).fillna(-1).astype(int)
    print("Label Encoding:", label_encoding)

# ✅ Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column],
    df[label_column],
    test_size=0.2,
    random_state=42,
)

# ✅ TF-IDF features: vocabulary is learned from the training texts only,
# capped at 5000 terms, and then reused to transform the test texts
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ✅ Fit a depth-limited decision tree on the training features
clf = DecisionTreeClassifier(random_state=42, max_depth=50).fit(
    X_train_tfidf, y_train
)

# ✅ Score the held-out test set
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n✅ Model Accuracy: {accuracy:.4f}")
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred))

# ✅ Draw the confusion matrix as an annotated heatmap
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
heatmap_ax.set(
    xlabel="Predicted Label",
    ylabel="True Label",
    title="Confusion Matrix",
)
plt.show()

# ✅ Classify a single piece of text with the trained model
def predict_news(news_text):
    """Predict whether ``news_text`` is fake or real news.

    The text is passed through ``clean_text``, vectorized with the fitted
    ``vectorizer`` and classified by ``clf``.

    Args:
        news_text: Raw text to classify.

    Returns:
        A dict with "News Text" (the input, unchanged) and "Prediction",
        which is "Fake News" when the predicted class equals 1 and
        "Real News" otherwise.
    """
    features = vectorizer.transform([clean_text(news_text)])
    predicted_class = clf.predict(features)[0]

    # NOTE(review): this assumes the training labels were encoded with
    # 1 == fake -- confirm against the label-encoding step.
    if predicted_class == 1:
        verdict = "Fake News"
    else:
        verdict = "Real News"

    return {"News Text": news_text, "Prediction": verdict}

# Example usage: run a handful of sample headlines through predict_news()
print("\n📰 Testing New News Headlines:")
examples = [
    "Aliens have landed in New York and are taking over the world!",
    "Scientists discover a secret portal to another dimension under the ocean.",
    "Government confirms that the Earth is actually flat!",
    "Celebrities are secretly robots, claims new shocking report.",
    "The moon landing was staged, new evidence suggests.",
]

for example in examples:
    result = predict_news(example)
    print("🔍", result)
