In [1]:
import pandas as pd

# Load datasets: one CSV per class
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

# Add labels (0 = fake, 1 = true)
fake['label'] = 0
true['label'] = 1

# Merge datasets. ignore_index=True renumbers the rows so every article gets a
# unique index label; without it both files contribute their own 0..N labels
# and the duplicates leak into every Series/DataFrame derived from `combined`.
combined = pd.concat([fake, true], axis=0, ignore_index=True)

# Handle missing values: drop any row with a missing value in any column.
# Non-mutating form so re-running the cell always starts from the concat result.
combined = combined.dropna()

# Combine title and text into the single field used for vectorisation.
# assign() returns a fresh frame instead of writing into the dropna() result.
combined = combined.assign(content=combined['title'] + ' ' + combined['text'])

# Analyze class distribution
print(combined['label'].value_counts())

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

# Model inputs: the merged title+text string is the feature, the 0/1 flag is the target
X, y = combined['content'], combined['label']

# Hold out 20% of the articles for evaluation; the seed fixes which rows land where
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature pipeline:
#   1. TF-IDF over unigrams and bigrams, capped at 10000 terms
#   2. chi-squared selection of the 5000 terms scored against the label
tfidf_step = ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2)))
select_step = ('select', SelectKBest(chi2, k=5000))
pipeline = Pipeline([tfidf_step, select_step])

# Fit on the training split only, then apply the fitted pipeline to the test split
X_train_transformed = pipeline.fit_transform(X_train, y_train)
X_test_transformed = pipeline.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Train models
# Every member exposes predict_proba, which the soft-voting average below relies on.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    # Seeded so the forest, and therefore the reported ensemble metrics, are
    # repeatable across runs (the train/test split is already seeded).
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fit each model and collect its class-1 probability for every test row
predictions = []
for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    preds = model.predict_proba(X_test_transformed)[:, 1]  # Probability of class 1
    predictions.append(preds)

# Ensemble (average predictions), then threshold the mean probability at 0.5
ensemble_preds = sum(predictions) / len(predictions)
ensemble_preds_class = (ensemble_preds > 0.5).astype(int)

# Evaluate ensemble on the held-out test split
ensemble_report = classification_report(y_test, ensemble_preds_class, output_dict=True)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds_class)
print(ensemble_report, ensemble_accuracy)

# Per-article results table (kept in memory only; nothing is written to disk here).
# 'Confidence' is the averaged class-1 probability, not the probability of the
# predicted class.
ensemble_results = pd.DataFrame({
    'True Label': y_test,
    'Ensemble Prediction': ensemble_preds_class,
    'Confidence': ensemble_preds
})


import joblib
import os
import numpy as np

# Create Ensemble Model Class
class EnsembleModel:
    """Soft-voting wrapper that averages the class probabilities of several models."""

    def __init__(self, models):
        # Mapping of display name -> estimator; stored by reference, not copied
        self.models = models

    def fit(self, X, y):
        """Fit every member on the same (X, y). Returns None."""
        for estimator in self.models.values():
            estimator.fit(X, y)

    def predict_proba(self, X):
        """Return the member-averaged probabilities as an (n_rows, 2) array."""
        # Two columns are hard-wired: the ensemble is binary-only
        running_total = np.zeros((X.shape[0], 2))
        for estimator in self.models.values():
            running_total += estimator.predict_proba(X)
        return running_total / len(self.models)

    def predict(self, X):
        """Return 1 where the averaged class-1 probability exceeds 0.5, else 0."""
        positive_share = self.predict_proba(X)[:, 1]
        return (positive_share > 0.5).astype(int)

    def print_models(self):
        """Print one 'name: estimator' line per member."""
        for name, model in self.models.items():
            print(f"{name}: {model}")

# Wrap the trained classifiers for persistence.
# The estimators in `models` were already fitted on X_train_transformed in the
# training loop above, and EnsembleModel stores them by reference, so they are
# reused as-is. Refitting here would repeat the most expensive step of the cell
# and could leave the saved model different from the one that was evaluated.
ensemble_model = EnsembleModel(models)

# Directory to save model
output_dir = 'saved_models'
os.makedirs(output_dir, exist_ok=True)

# Save the ensemble model
joblib.dump(ensemble_model, f'{output_dir}/ensemble_model.pkl')

# Save the fitted TF-IDF + feature-selection pipeline alongside it: the
# classifiers only accept that pipeline's output, so without it the pickled
# ensemble cannot be applied to raw article text in a later session.
joblib.dump(pipeline, f'{output_dir}/feature_pipeline.pkl')

print("Ensemble model saved successfully.")


label
0    23481
1    21417
Name: count, dtype: int64


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

# ================================
# Configuration
# ================================

# Input CSVs, one per class. load_and_preprocess() labels rows from
# FAKE_NEWS_PATH as 0 and rows from TRUE_NEWS_PATH as 1.
FAKE_NEWS_PATH = 'Fake.csv'
TRUE_NEWS_PATH = 'True.csv'

# Directory that receives the pickled vectorizer/ensemble and the two CSV
# splits (initial_training_data.csv, streaming_data.csv).
OUTPUT_DIR = 'saved_models'

# Number of shuffled rows used for the initial fit; every row after this
# position is written out as the streaming split.
INITIAL_TRAIN_SIZE = 1000

# ================================
# 1. Define the Ensemble Model Class
# ================================

class EnsembleModel:
    """Soft-voting ensemble over incrementally trainable binary classifiers."""

    def __init__(self, models):
        """
        Keep a reference to the member estimators.

        models: dict mapping a display name to an estimator object.
        """
        self.models = models

    def fit(self, X, y):
        """
        Feed one batch to every member through ``partial_fit``.

        Despite the name this is an incremental update: members keep what
        they learned from earlier calls. The label set is fixed to {0, 1}.
        """
        for model in self.models.values():
            model.partial_fit(X, y, classes=np.array([0, 1]))

    def predict_proba(self, X):
        """
        Return the member-averaged probability of class 1.

        The result is one value per row of X (a 1-D array), not a
        two-column matrix. Raises AttributeError if a member has no
        ``predict_proba``.
        """
        positive_columns = []
        for model in self.models.values():
            if not hasattr(model, "predict_proba"):
                raise AttributeError(f"Model {model} does not support predict_proba.")
            positive_columns.append(model.predict_proba(X)[:, 1])
        return np.mean(positive_columns, axis=0)

    def predict(self, X):
        """
        Return 1 where the averaged class-1 probability exceeds 0.5, else 0.
        """
        return (self.predict_proba(X) > 0.5).astype(int)

# ================================
# 2. Load and Preprocess Data
# ================================

def load_and_preprocess():
    """
    Build the shuffled, labelled article table.

    Reads FAKE_NEWS_PATH (labelled 0) and TRUE_NEWS_PATH (labelled 1), stacks
    them, drops rows with any missing value, adds a 'content' column made of
    title + ' ' + text, then shuffles with a fixed seed and renumbers the rows.

    Returns the resulting DataFrame.
    """
    labelled_frames = [
        pd.read_csv(FAKE_NEWS_PATH).assign(label=0),
        pd.read_csv(TRUE_NEWS_PATH).assign(label=1),
    ]

    return (
        pd.concat(labelled_frames, axis=0)
        .dropna()
        .assign(content=lambda frame: frame['title'] + ' ' + frame['text'])
        # frac=1 keeps every row; the seed makes the order repeatable
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

# ================================
# 3. Train Ensemble Model
# ================================

def train_ensemble(combined):
    """
    Fit the ensemble on the first INITIAL_TRAIN_SIZE rows of `combined`.

    combined: DataFrame with 'content' and 'label' columns.

    Returns (vectorizer, ensemble, X_stream, y_stream), where the stream
    values are the 'content' / 'label' rows after the initial slice.
    """
    texts, labels = combined['content'], combined['label']

    # Head of the table trains the model; the tail is handed back as the stream
    X_initial, X_stream = texts.iloc[:INITIAL_TRAIN_SIZE], texts.iloc[INITIAL_TRAIN_SIZE:]
    y_initial, y_stream = labels.iloc[:INITIAL_TRAIN_SIZE], labels.iloc[INITIAL_TRAIN_SIZE:]

    # Hashed uni/bi-gram features; transform() is called without any fit step
    vectorizer = HashingVectorizer(
        n_features=2**20,
        ngram_range=(1, 2),
        alternate_sign=False,
    )
    X_initial_transformed = vectorizer.transform(X_initial)

    # Two log-loss SGD classifiers that differ only in seed, plus naive Bayes
    sgd_settings = dict(loss='log_loss', max_iter=1, tol=None, warm_start=True)
    ensemble = EnsembleModel(models={
        'LogisticRegression1': SGDClassifier(random_state=42, **sgd_settings),
        'MultinomialNB': MultinomialNB(),
        'LogisticRegression2': SGDClassifier(random_state=24, **sgd_settings),
    })

    print("Training Ensemble Model...")
    ensemble.fit(X_initial_transformed, y_initial)

    # NOTE(review): this scores the same rows the ensemble was just fitted on,
    # so the printed figures are training-set metrics, not held-out ones.
    print("\nInitial Ensemble Model Evaluation:")
    evaluate_ensemble(ensemble, vectorizer, X_initial, y_initial)

    return vectorizer, ensemble, X_stream, y_stream

# ================================
# 4. Evaluate Ensemble Model
# ================================

def evaluate_ensemble(ensemble, vectorizer, X, y):
    """
    Print a classification report and the accuracy of `ensemble` on (X, y).

    X is raw text; it is passed through `vectorizer.transform` before
    prediction. Nothing is returned.
    """
    predicted = ensemble.predict(vectorizer.transform(X))

    # Per-class precision / recall / F1
    print("Ensemble Classification Report:")
    print(classification_report(y, predicted))

    # Overall accuracy, four decimals
    accuracy = accuracy_score(y, predicted)
    print(f"Ensemble Accuracy: {accuracy:.4f}")

# ================================
# 5. Save Models and Data
# ================================

def save_ensemble(vectorizer, ensemble):
    """
    Pickle the vectorizer and the ensemble into OUTPUT_DIR.

    Creates OUTPUT_DIR if it does not exist. Files written:
    'hashing_vectorizer.pkl' and 'ensemble_model.pkl'.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    artifacts = (
        (vectorizer, 'hashing_vectorizer.pkl'),
        (ensemble, 'ensemble_model.pkl'),
    )
    for artifact, filename in artifacts:
        joblib.dump(artifact, os.path.join(OUTPUT_DIR, filename))

    print(f"\nEnsemble model and vectorizer saved to '{OUTPUT_DIR}' directory.")

def save_streaming_data(combined):
    """
    Write every row after the first INITIAL_TRAIN_SIZE to
    OUTPUT_DIR/streaming_data.csv (no index column).

    combined: the full shuffled DataFrame.
    """
    # Create the directory here too, so the function does not depend on
    # another save function having been called first.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # to_csv only reads the slice, so no defensive copy is needed
    streaming_path = os.path.join(OUTPUT_DIR, 'streaming_data.csv')
    combined.iloc[INITIAL_TRAIN_SIZE:].to_csv(streaming_path, index=False)
    print(f"Streaming data saved to '{OUTPUT_DIR}/streaming_data.csv'.")

def save_initial_training_data(combined):
    """
    Write the first INITIAL_TRAIN_SIZE rows to
    OUTPUT_DIR/initial_training_data.csv (no index column).

    combined: the full shuffled DataFrame.
    """
    # Create the directory here too, so the function does not depend on
    # another save function having been called first.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # to_csv only reads the slice, so no defensive copy is needed
    initial_path = os.path.join(OUTPUT_DIR, 'initial_training_data.csv')
    combined.iloc[:INITIAL_TRAIN_SIZE].to_csv(initial_path, index=False)
    print(f"Initial training data saved to '{OUTPUT_DIR}/initial_training_data.csv'.")

# ================================
# 6. Main Execution
# ================================

def main():
    """
    Run the initial-training workflow end to end: load the data, fit the
    ensemble, then persist the model, the training slice and the stream slice.
    """
    print("Loading and preprocessing data...")
    combined = load_and_preprocess()

    print("Training ensemble model...")
    # The stream Series are not used here: the save step below re-slices
    # `combined` itself.
    vectorizer, ensemble, _X_stream, _y_stream = train_ensemble(combined)

    print("Saving ensemble model and vectorizer...")
    save_ensemble(vectorizer, ensemble)

    # Both CSV exports take the full table and cut their own slice from it
    export_steps = (
        ("Saving initial training data for future evaluation...", save_initial_training_data),
        ("Saving streaming data for simulation...", save_streaming_data),
    )
    for announcement, export in export_steps:
        print(announcement)
        export(combined)

    print("\nInitial training and setup completed successfully.")


if __name__ == '__main__':
    main()


Loading and preprocessing data...
Training ensemble model...
Training Ensemble Model...

Initial Ensemble Model Evaluation:
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       507
           1       0.99      0.99      0.99       493

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000

Ensemble Accuracy: 0.9910
Saving ensemble model and vectorizer...

Ensemble model and vectorizer saved to 'saved_models' directory.
Saving initial training data for future evaluation...
Initial training data saved to 'saved_models/initial_training_data.csv'.
Saving streaming data for simulation...
Streaming data saved to 'saved_models/streaming_data.csv'.

Initial training and setup completed successfully.


In [None]:
import pandas as pd
import joblib
import os
import time
from sklearn.metrics import accuracy_score
import numpy as np

# ================================
# Configuration
# ================================

# Directory holding the artifacts this cell reads (pickles and CSV splits)
# and the ensemble pickle it overwrites.
OUTPUT_DIR = 'saved_models'

# Pickled vectorizer (read only) and pickled ensemble (read at start-up,
# rewritten by save_ensemble_model()).
VECTORIZER_PATH = os.path.join(OUTPUT_DIR, 'hashing_vectorizer.pkl')
ENSEMBLE_MODEL_PATH = os.path.join(OUTPUT_DIR, 'ensemble_model.pkl')

# CSV of samples replayed one at a time by update_ensemble()
STREAMING_DATA_PATH = os.path.join(OUTPUT_DIR, 'streaming_data.csv')

# Checkpoint interval: after every SAVE_INTERVAL streamed samples the ensemble
# is scored on the evaluation set (when available) and saved to disk.
SAVE_INTERVAL = 1000

# ================================
# 1. Define the Ensemble Model Class (Same as in initial_training.py)
# ================================

class EnsembleModel:
    """Soft-voting ensemble over incrementally trainable binary classifiers."""

    def __init__(self, models):
        """
        Keep a reference to the member estimators.

        models: dict mapping a display name to an estimator object.
        """
        self.models = models

    def fit(self, X, y):
        """
        Feed one batch to every member through ``partial_fit``.

        Despite the name this is an incremental update: members keep what
        they learned from earlier calls. The label set is fixed to {0, 1}.
        """
        for model in self.models.values():
            model.partial_fit(X, y, classes=np.array([0, 1]))

    def predict_proba(self, X):
        """
        Return the member-averaged probability of class 1.

        The result is one value per row of X (a 1-D array), not a
        two-column matrix. Raises AttributeError if a member has no
        ``predict_proba``.
        """
        positive_columns = []
        for model in self.models.values():
            if not hasattr(model, "predict_proba"):
                raise AttributeError(f"Model {model} does not support predict_proba.")
            positive_columns.append(model.predict_proba(X)[:, 1])
        return np.mean(positive_columns, axis=0)

    def predict(self, X):
        """
        Return 1 where the averaged class-1 probability exceeds 0.5, else 0.
        """
        return (self.predict_proba(X) > 0.5).astype(int)

# ================================
# 2. Load Ensemble Model
# ================================

def load_ensemble_model():
    """Unpickle and return the ensemble stored at ENSEMBLE_MODEL_PATH."""
    # joblib.load unpickles the file; only point it at files you produced yourself
    return joblib.load(ENSEMBLE_MODEL_PATH)

# Loaded once at module level; update_ensemble() and save_ensemble_model()
# both operate on this global.
ensemble = load_ensemble_model()
print("Ensemble model loaded successfully.")

# ================================
# 3. Load Streaming Data
# ================================

def load_streaming_data():
    """
    Read the streaming CSV at STREAMING_DATA_PATH.

    Returns the loaded DataFrame, or an empty DataFrame with 'content' and
    'label' columns when the file does not exist.
    """
    if os.path.exists(STREAMING_DATA_PATH):
        streaming_data = pd.read_csv(STREAMING_DATA_PATH)
        print(f"Loaded streaming data: {streaming_data.shape[0]} samples.")
        return streaming_data

    # Missing file: report it and hand back an empty frame so callers can
    # test `.empty` instead of handling an exception
    print(f"Streaming data file '{STREAMING_DATA_PATH}' not found.")
    return pd.DataFrame(columns=['content', 'label'])

streaming_data = load_streaming_data()

# ================================
# 4. Load Initial Training Data for Evaluation
# ================================

def load_initial_training_data():
    """
    Read OUTPUT_DIR/initial_training_data.csv for use as an evaluation set.

    Returns (content Series, label Series), or (None, None) when the file
    does not exist.
    """
    initial_data_path = os.path.join(OUTPUT_DIR, 'initial_training_data.csv')

    if os.path.exists(initial_data_path):
        initial_data = pd.read_csv(initial_data_path)
        print(f"Loaded initial training data for evaluation: {initial_data.shape[0]} samples.")
        return initial_data['content'], initial_data['label']

    # No file: callers treat the (None, None) pair as "skip evaluation"
    print(f"Initial training data file '{initial_data_path}' not found. Skipping evaluation.")
    return None, None

X_initial_eval, y_initial_eval = load_initial_training_data()

# ================================
# 5. Load Vectorizer
# ================================

def load_vectorizer():
    """Unpickle and return the vectorizer stored at VECTORIZER_PATH."""
    # joblib.load unpickles the file; only point it at files you produced yourself
    return joblib.load(VECTORIZER_PATH)

# Loaded once at module level and shared by the streaming loop
vectorizer = load_vectorizer()
print("Vectorizer loaded successfully.")

# ================================
# 6. Update Ensemble with Streaming Data
# ================================

def update_ensemble():
    """
    Replay `streaming_data` one sample at a time, updating the global
    `ensemble` incrementally after each sample.

    After every SAVE_INTERVAL samples the ensemble is scored on the
    evaluation set (if one was loaded) and written to disk. It is saved once
    more when the stream is exhausted. Returns None.
    """
    has_eval_set = X_initial_eval is not None and y_initial_eval is not None

    # The vectorizer is never refitted in this loop and the evaluation texts
    # never change, so their feature matrix is computed once up front instead
    # of at every checkpoint.
    X_val_transformed = vectorizer.transform(X_initial_eval) if has_eval_set else None

    # Count samples by position (1-based) rather than by DataFrame index label,
    # so checkpoints fire correctly whatever index the frame carries. Zipping
    # the two columns also avoids building a Series per row as iterrows() does.
    sample_stream = zip(streaming_data['content'], streaming_data['label'])
    for processed, (content, label) in enumerate(sample_stream, start=1):
        # Single-sample batch: vectorise the text and update every member
        X_transformed = vectorizer.transform([content])
        ensemble.fit(X_transformed, [label])

        if processed % SAVE_INTERVAL == 0:
            if has_eval_set:
                # NOTE(review): the evaluation set is the initial training
                # slice, so this tracks retention of those rows rather than
                # accuracy on unseen data.
                ensemble_preds = ensemble.predict(X_val_transformed)
                accuracy = accuracy_score(y_initial_eval, ensemble_preds)
                print(f"After {processed} samples, Ensemble Accuracy: {accuracy:.4f}")
            else:
                print(f"Processed {processed} samples.")

            # Save updated ensemble model
            save_ensemble_model()
            print(f"Ensemble model updated and saved after {processed} samples.")

        # Simulate real-time data arrival
        time.sleep(0.01)  # Adjust as needed for simulation speed

    # Save ensemble model after all updates
    save_ensemble_model()
    print("Streaming model updates completed and saved.")

# ================================
# 7. Save Updated Ensemble Model
# ================================

def save_ensemble_model():
    """
    Persist the global `ensemble` to ENSEMBLE_MODEL_PATH.

    The pickle is written to a sibling temporary file first and then moved
    over the target, so an interruption during the periodic checkpoint cannot
    leave a truncated file where the last good model used to be.
    """
    tmp_path = ENSEMBLE_MODEL_PATH + '.tmp'
    joblib.dump(ensemble, tmp_path)
    # os.replace overwrites the destination in a single rename step
    os.replace(tmp_path, ENSEMBLE_MODEL_PATH)

# ================================
# 8. Main Execution
# ================================

def main():
    """Run the streaming update loop, unless there is nothing to stream."""
    if not streaming_data.empty:
        print("Starting live streaming model updates...")
        update_ensemble()
    else:
        print("No streaming data to process.")


if __name__ == '__main__':
    main()


Ensemble model loaded successfully.
Loaded streaming data: 43898 samples.
Loaded initial training data for evaluation: 1000 samples.
Vectorizer loaded successfully.
Starting live streaming model updates...
After 1000 samples, Ensemble Accuracy: 0.9730
Ensemble model updated and saved after 1000 samples.
After 2000 samples, Ensemble Accuracy: 0.9830
Ensemble model updated and saved after 2000 samples.
After 3000 samples, Ensemble Accuracy: 0.9880
Ensemble model updated and saved after 3000 samples.
After 4000 samples, Ensemble Accuracy: 0.9700
Ensemble model updated and saved after 4000 samples.
After 5000 samples, Ensemble Accuracy: 0.9770
Ensemble model updated and saved after 5000 samples.
After 6000 samples, Ensemble Accuracy: 0.9820
Ensemble model updated and saved after 6000 samples.
After 7000 samples, Ensemble Accuracy: 0.9800
Ensemble model updated and saved after 7000 samples.
After 8000 samples, Ensemble Accuracy: 0.9820
Ensemble model updated and saved after 8000 samples.
Af