In [None]:
# Improved Fake News Detector with Better Similarity Search

import pandas as pd
import numpy as np
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle
import os
from tqdm import tqdm
import warnings
# Silence every Python warning for the rest of the process.
# NOTE(review): this also hides library deprecation warnings — confirm that
# the blanket filter is intended.
warnings.filterwarnings('ignore')

# Download the NLTK resources used below by stopwords.words("english") and
# WordNetLemmatizer. Runs at import time of this module/cell.
# NOTE(review): some NLTK versions may additionally need 'omw-1.4' for the
# WordNet lemmatizer — confirm against the installed version.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

class ImprovedFakeNewsDetector:
    def __init__(self, max_features=10000, chunk_size=1000, sample_size=None):
        """
        Store configuration and create the empty model state.

        Args:
            max_features: Upper bound on the TF-IDF vocabulary size
            chunk_size: Intended batch size for chunked similarity processing
                (stored on the instance)
            sample_size: Number of articles to keep for similarity matching,
                or None to keep the full dataset
        """
        # Fitted artifacts: all remain None until training or load_model()
        self.vectorizer = self.model = None
        self.original_data = self.original_tfidf = None

        # Text-normalization helpers read by preprocess_text()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))

        # Caller-supplied configuration
        self.sample_size = sample_size
        self.chunk_size = chunk_size
        self.max_features = max_features

    def preprocess_text(self, text):
        """Improved text preprocessing with better handling"""
        if pd.isna(text) or text == '' or text is None:
            return ''

        try:
            text = str(text).lower()
            # Remove punctuation but keep spaces
            text = ''.join([char if char not in string.punctuation else ' ' for char in text])
            # Split and clean tokens
            tokens = [word.strip() for word in text.split() if word.strip()]
            # Lemmatize and remove stopwords
            tokens = [self.lemmatizer.lemmatize(word) for word in tokens
                     if word not in self.stop_words and len(word) > 2]
            return ' '.join(tokens)
        except Exception as e:
            print(f"Error preprocessing text: {e}")
            return ''

    def load_and_prepare_data(self, fake_csv_path, real_csv_path, text_column='text',
                            title_column='title', subject_column='subject', date_column='date'):
        """
        Load both CSV files, clean them, and build the training frame plus the
        similarity-search frame (stored on self.original_data).

        Fake articles are labelled 0 and real articles 1. Rows whose text is
        missing or blank are dropped. The text is preprocessed once and the
        result is shared by both frames.

        Args:
            fake_csv_path: Path of the CSV holding fake articles
            real_csv_path: Path of the CSV holding real articles
            text_column: Name of the (required) article-text column
            title_column: Name of the title column; filled with a placeholder if absent
            subject_column: Name of the subject column; filled with a placeholder if absent
            date_column: Name of the date column; filled with a placeholder if absent

        Returns:
            The shuffled training DataFrame, including 'label' and
            'text_processed' columns, without rows whose processed text is empty.

        Raises:
            ValueError: If text_column is not present in the combined data.
        """
        print("📊 Loading dataset...")

        # Load fake news data
        fake_df = pd.read_csv(fake_csv_path)
        fake_df['label'] = 0
        print(f"✅ Loaded {len(fake_df)} fake news articles")

        # Load real news data
        real_df = pd.read_csv(real_csv_path)
        real_df['label'] = 1
        print(f"✅ Loaded {len(real_df)} real news articles")

        df = pd.concat([fake_df, real_df], axis=0, ignore_index=True)

        if text_column not in df.columns:
            raise ValueError(f"Missing required columns: {[text_column]}")

        # Optional metadata columns get a placeholder so later lookups succeed
        optional_columns = (
            (title_column, 'No title available'),
            (subject_column, 'No subject available'),
            (date_column, 'No date available'),
        )
        for column, placeholder in optional_columns:
            if column not in df.columns:
                df[column] = placeholder

        print(f"📈 Total dataset size: {len(df)} articles")

        print("🧹 Cleaning data...")
        original_size = len(df)
        df = df.dropna(subset=[text_column])
        # .copy() so the column assignment below writes to an independent frame
        df = df[df[text_column].astype(str).str.strip() != ''].copy()
        print(f"Removed {original_size - len(df)} empty articles")

        # Preprocess ONCE. The similarity frame is derived from these same
        # rows, so running the lemmatizer a second time over them is wasted work.
        print("⚡ Preprocessing text...")
        tqdm.pandas(desc="Processing text")
        df['text_processed'] = df[text_column].progress_apply(self.preprocess_text)

        # Choose the rows kept for similarity matching
        if self.sample_size is not None and len(df) > self.sample_size:
            print(f"📋 Sampling {self.sample_size} articles for similarity matching (from {len(df)} total)")
            # Stratified sample: up to sample_size // 2 rows per label.
            # Iterating the groups explicitly (instead of groupby().apply())
            # keeps the 'label' column in the result regardless of how the
            # installed pandas version treats grouping columns inside apply().
            per_class = self.sample_size // 2
            similarity_df = pd.concat(
                [group.sample(min(len(group), per_class), random_state=42)
                 for _, group in df.groupby('label')]
            ).reset_index(drop=True)
        else:
            if self.sample_size is None:
                print(f"📋 Using full dataset ({len(df)} articles) for similarity matching")
            similarity_df = df.copy()

        # The search methods read the fixed names 'text', 'title', 'subject'
        # and 'date'. When the caller used different column names, expose the
        # data under those names too (never overwriting an existing column).
        canonical_sources = (
            ('text', text_column),
            ('title', title_column),
            ('subject', subject_column),
            ('date', date_column),
        )
        for alias, source in canonical_sources:
            if alias not in similarity_df.columns:
                similarity_df[alias] = similarity_df[source]

        # Drop rows whose processed text is empty from the similarity frame
        self.original_data = similarity_df[similarity_df['text_processed'].str.strip() != '']

        # Shuffle the full dataset for training, then drop empty processed rows
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        df = df[df['text_processed'].str.strip() != '']

        print(f"✅ Final training dataset size: {len(df)} articles")
        print(f"✅ Final similarity dataset size: {len(self.original_data)} articles")
        return df

    def train_model(self, df):
        """
        Fit the TF-IDF vectorizer and logistic-regression classifier, print
        hold-out metrics, and vectorize the similarity-search articles.

        Args:
            df: DataFrame with a 'text_processed' column (cleaned text) and a
                'label' column, e.g. the frame returned by
                load_and_prepare_data().

        Returns:
            Accuracy on the 20% stratified hold-out split.
        """
        print("🤖 Training model...")

        X = df['text_processed']
        y = df['label']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print("🔤 Creating TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(
            min_df=2,
            max_df=0.95,
            max_features=self.max_features,
            ngram_range=(1, 2),
            stop_words='english',
            lowercase=True,
            dtype=np.float32
        )

        # The vocabulary is learned from the training split only
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        print(f"📊 TF-IDF Matrix shape: {X_train_tfidf.shape}")

        self.model = LogisticRegression(
            max_iter=1000,
            random_state=42,
            solver='liblinear',
            C=1.0
        )

        self.model.fit(X_train_tfidf, y_train)

        # Evaluate model
        y_pred = self.model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"✅ Model trained successfully!")
        print(f"🎯 Accuracy: {accuracy:.4f}")
        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        print("🔍 Preparing similarity search vectors...")
        if self.original_data is None:
            # No similarity data was loaded on this instance: nothing to
            # index. The similarity search returns [] in this state.
            print("⚠️  No similarity dataset loaded; skipping similarity vectors")
            self.original_tfidf = None
            return accuracy

        # Honour the configured batch size; fall back to 1000 if it is unusable
        chunk_size = getattr(self, 'chunk_size', None)
        if not chunk_size or chunk_size < 1:
            chunk_size = 1000

        processed = self.original_data['text_processed']
        if len(processed) > chunk_size:
            print("Processing in chunks to avoid memory issues...")
            from scipy.sparse import vstack
            self.original_tfidf = vstack([
                self.vectorizer.transform(processed.iloc[start:start + chunk_size])
                for start in range(0, len(processed), chunk_size)
            ])
        else:
            self.original_tfidf = self.vectorizer.transform(processed)

        return accuracy

    def calculate_text_similarity(self, text1, text2):
        """
        Return the word-overlap (Jaccard) score of two texts.

        Each text is lower-cased and split on whitespace; the score is the
        number of distinct shared words divided by the number of distinct
        words overall. Returns 0.0 when either text is falsy or contains no
        words.
        """
        if not (text1 and text2):
            return 0.0

        vocab_a = set(text1.lower().split())
        vocab_b = set(text2.lower().split())

        # Whitespace-only input produces an empty vocabulary
        if not vocab_a or not vocab_b:
            return 0.0

        shared = vocab_a & vocab_b
        combined = vocab_a | vocab_b
        return len(shared) / len(combined)

    def find_similar_articles_improved(self, user_input, top_n=7, similarity_threshold=0.01):
        """
        Improved similarity search with multiple methods and lower threshold
        """
        if self.original_data is None or self.original_tfidf is None:
            return []

        print(f"🔍 Searching through {len(self.original_data)} articles...")

        # Preprocess user input
        user_processed = self.preprocess_text(user_input)
        if not user_processed.strip():
            print("Warning: Empty processed input")
            return []

        # Calculate TF-IDF similarity
        user_tfidf = self.vectorizer.transform([user_processed])

        # Calculate similarities in chunks if dataset is large
        all_similarities = []
        chunk_size = 1000

        for i in range(0, len(self.original_data), chunk_size):
            end_idx = min(i + chunk_size, len(self.original_data))

            # TF-IDF similarity for chunk
            if hasattr(self.original_tfidf, 'getrow'):
                chunk_tfidf = self.original_tfidf[i:end_idx]
            else:
                chunk_tfidf = self.original_tfidf[i:end_idx]

            tfidf_similarities = cosine_similarity(user_tfidf, chunk_tfidf)[0]

            # Add to results
            for j, tfidf_sim in enumerate(tfidf_similarities):
                actual_idx = i + j

                # Also calculate simple text similarity as backup
                article_text = str(self.original_data.iloc[actual_idx].get('text', ''))
                text_sim = self.calculate_text_similarity(user_input, article_text)

                # Use the higher of the two similarities
                final_similarity = max(tfidf_sim, text_sim * 0.5)  # Weight text similarity less

                if final_similarity > similarity_threshold:
                    all_similarities.append((actual_idx, final_similarity, tfidf_sim, text_sim))

        # Sort by similarity
        all_similarities.sort(key=lambda x: x[1], reverse=True)

        print(f"Found {len(all_similarities)} articles above threshold {similarity_threshold}")

        # Get top N results
        top_results = all_similarities[:top_n]

        similar_articles = []
        for idx, final_sim, tfidf_sim, text_sim in top_results:
            row = self.original_data.iloc[idx]

            text = str(row.get('text', ''))
            text_preview = text[:400] + "..." if len(text) > 400 else text

            similar_articles.append({
                'index': idx,
                'original_row_index': row.name if hasattr(row, 'name') else idx,
                'similarity': final_sim,
                'tfidf_similarity': tfidf_sim,
                'text_similarity': text_sim,
                'title': str(row.get('title', 'No title available')),
                'text_preview': text_preview,
                'subject': str(row.get('subject', 'No subject available')),
                'date': str(row.get('date', 'No date available')),
                'label': row.get('label', 0)
            })

        return similar_articles

    def search_specific_row(self, user_input, target_row_index):
        """
        Debug function to check similarity with a specific row
        """
        if target_row_index >= len(self.original_data):
            print(f"Row {target_row_index} not found in similarity dataset")
            return None

        row = self.original_data.iloc[target_row_index]
        article_text = str(row.get('text', ''))

        # Preprocess
        user_processed = self.preprocess_text(user_input)
        article_processed = self.preprocess_text(article_text)

        # TF-IDF similarity
        user_tfidf = self.vectorizer.transform([user_processed])
        article_tfidf = self.vectorizer.transform([article_processed])
        tfidf_sim = cosine_similarity(user_tfidf, article_tfidf)[0][0]

        # Text similarity
        text_sim = self.calculate_text_similarity(user_input, article_text)

        print(f"Debug - Row {target_row_index}:")
        print(f"TF-IDF Similarity: {tfidf_sim:.6f}")
        print(f"Text Similarity: {text_sim:.6f}")
        print(f"Article length: {len(article_text)} chars")
        print(f"Processed length: {len(article_processed)} chars")
        print(f"Title: {row.get('title', 'No title')}")

        return {
            'tfidf_similarity': tfidf_sim,
            'text_similarity': text_sim,
            'article': row.to_dict()
        }

    def predict_with_similarity(self, user_input, debug_row=None):
        """Make prediction and find similar articles"""
        if self.model is None or self.vectorizer is None:
            raise ValueError("Model not trained yet!")

        # Debug specific row if requested
        if debug_row is not None:
            debug_info = self.search_specific_row(user_input, debug_row)
            print("Debug info:", debug_info)

        # Preprocess input
        user_processed = self.preprocess_text(user_input)

        if not user_processed.strip():
            return None, None, []

        # Make prediction
        user_tfidf = self.vectorizer.transform([user_processed])
        prediction = self.model.predict(user_tfidf)[0]
        prediction_proba = self.model.predict_proba(user_tfidf)[0]

        # Find similar articles with improved search
        similar_articles = self.find_similar_articles_improved(
            user_input, top_n=7, similarity_threshold=0.01
        )

        return prediction, prediction_proba, similar_articles

    def save_model(self, filepath):
        """Save trained model and vectorizer"""
        model_data = {
            'model': self.model,
            'vectorizer': self.vectorizer,
            'original_data': self.original_data,
            'original_tfidf': self.original_tfidf
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"✅ Model saved to {filepath}")

    def load_model(self, filepath):
        """Load trained model and vectorizer"""
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.model = model_data['model']
        self.vectorizer = model_data['vectorizer']
        self.original_data = model_data['original_data']
        self.original_tfidf = model_data['original_tfidf']
        print(f"✅ Model loaded from {filepath}")

def main():
    """
    Train the detector from the two CSV files, then run an interactive
    prompt that classifies each entered statement and lists similar articles.

    Typing 'exit' or 'quit' ends the loop, as does Ctrl-C or end-of-input.
    Entering 'debug:ROW_NUMBER' prompts for a statement and additionally
    prints the similarity between that statement and the given row.
    """

    # Initialize detector - set sample_size=None to use full dataset
    detector = ImprovedFakeNewsDetector(
        max_features=15000,
        chunk_size=1000,
        sample_size=None  # Use full dataset for similarity
    )

    try:
        # Update these paths
        fake_csv_path = "/content/Fake.csv"
        real_csv_path = "/content/True.csv"

        df = detector.load_and_prepare_data(
            fake_csv_path=fake_csv_path,
            real_csv_path=real_csv_path,
            text_column='text',
            title_column='title',
            subject_column='subject',
            date_column='date'
        )

        # Train model (train_model prints its own metrics)
        detector.train_model(df)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        return
    except Exception as e:
        print(f"❌ Error during training: {e}")
        return

    # Interactive prediction loop
    print("\n" + "="*80)
    print("🎯 IMPROVED FAKE NEWS DETECTOR")
    print("="*80)

    while True:
        try:
            user_input = input("\n📰 Enter news statement (or type 'quit' to exit): ")

            if user_input.lower().strip() in ['exit', 'quit']:
                print("👋 Thank you!")
                break

            # Check for debug command
            debug_row = None
            if user_input.startswith('debug:'):
                # Only the integer parse is guarded: a bare except around
                # input() would also swallow Ctrl-C.
                try:
                    debug_row = int(user_input.split(':')[1])
                except ValueError:
                    print("Invalid debug format. Use 'debug:ROW_NUMBER'")
                    continue
                user_input = input("Enter the news statement to compare: ")

            # Make prediction
            prediction, prediction_proba, similar_articles = detector.predict_with_similarity(
                user_input, debug_row=debug_row
            )

            if prediction is None:
                print("⚠️  Unable to process input.")
                continue

            # Display results (probability index 0 is shown as fake, 1 as real)
            fake_confidence = prediction_proba[0] * 100
            real_confidence = prediction_proba[1] * 100

            print("\n" + "="*80)
            print("🔍 RESULTS")
            print("="*80)
            print(f"🎯 Prediction: {'✅ REAL' if prediction == 1 else '❌ FAKE'}")
            print(f"📊 Confidence: Real: {real_confidence:.1f}% | Fake: {fake_confidence:.1f}%")

            if similar_articles:
                print(f"\n🔗 TOP {len(similar_articles)} SIMILAR ARTICLES:")
                print("-"*80)

                for i, article in enumerate(similar_articles, 1):
                    print(f"\n📄 #{i} | Row Index: {article.get('original_row_index', 'N/A')}")
                    print(f"📊 Similarities - Combined: {article['similarity']:.4f} | TF-IDF: {article['tfidf_similarity']:.4f} | Text: {article['text_similarity']:.4f}")
                    print(f"🏷️  Title: {article['title']}")
                    print(f"🎯 Label: {'✅ REAL' if article['label'] == 1 else '❌ FAKE'}")
                    print(f"📝 Text: {article['text_preview']}")
                    print("-" * 40)
            else:
                print("\n⚠️  No similar articles found")

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed. Without this, input() would raise
            # on every iteration and the loop would never end.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")
            continue

# Run the interactive detector only when this file is executed as a script
if __name__ == "__main__":
    main()

📊 Loading dataset...
✅ Loaded 23545 fake news articles
✅ Loaded 21417 real news articles
📈 Total dataset size: 44962 articles
🧹 Cleaning data...
Removed 673 empty articles
📋 Using full dataset (44289 articles) for similarity matching
⚡ Preprocessing text...


Processing training text: 100%|██████████| 44289/44289 [01:07<00:00, 659.58it/s]


⚡ Preprocessing similarity search text...


Processing similarity text: 100%|██████████| 44289/44289 [01:08<00:00, 643.63it/s]


✅ Final training dataset size: 44288 articles
✅ Final similarity dataset size: 44288 articles
🤖 Training model...
🔤 Creating TF-IDF vectors...
📊 TF-IDF Matrix shape: (35430, 15000)
✅ Model trained successfully!
🎯 Accuracy: 0.9871

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4575
           1       0.98      0.99      0.99      4283

    accuracy                           0.99      8858
   macro avg       0.99      0.99      0.99      8858
weighted avg       0.99      0.99      0.99      8858

🔍 Preparing similarity search vectors...
Processing in chunks to avoid memory issues...

🎯 IMPROVED FAKE NEWS DETECTOR

📰 Enter news statement (or type 'quit' to exit): Pakistan shot down an Indian Rafale fighter jet
🔍 Searching through 44288 articles...
Found 3730 articles above threshold 0.01

🔍 RESULTS
🎯 Prediction: ❌ FAKE
📊 Confidence: Real: 23.1% | Fake: 76.9%

🔗 TOP 7 SIMILAR ARTICLES:
----------------------