In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Folder that receives every plot and CSV written by the functions below.
# It is created at import time; an already existing folder is not an error.
output_folder = 'zero'
os.makedirs(name=output_folder, exist_ok=True)

# Function to preprocess text data
def preprocess_text(text):
    """Normalise one raw comment into a lowercase, stop-word-free string.

    Steps, in order: remove URL-like tokens (runs starting with ``http`` or
    ``www``), delete every character that is not an ASCII letter or
    whitespace, lowercase the result, split on whitespace, drop tokens that
    are in ``ENGLISH_STOP_WORDS`` and re-join the rest with single spaces.

    Args:
        text: The raw comment. Missing values (``None``, ``NaN``, ``pd.NA``)
            produce ``''``; any other non-string scalar is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned text, possibly the empty string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, and empty CSV cells are
        # typically loaded by pandas as float NaN - treat those as "no text".
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # "https..." is already matched by the "http" alternative, and the
    # pattern has no ^/$ anchors, so no extra alternative or flag is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Non-letters are deleted rather than replaced by a space,
    # so "don't" becomes "dont" and "a,b" becomes "ab".
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    # Tokenise on whitespace, filter stop words, and rebuild the string.
    return ' '.join(
        word for word in text.split() if word not in ENGLISH_STOP_WORDS
    )

# Function to load and preprocess training dataset
def load_training_data():
    """Read the labelled spam CSV and return cleaned comments with labels.

    Loads ``/mnt/data/youtube_spam.csv``, applies ``preprocess_text`` to the
    ``CONTENT`` column, stores the result on the frame as ``clean_content``
    and returns that column together with ``CLASS``.

    Returns:
        A ``(features, labels)`` pair of pandas Series: the cleaned comment
        text and the matching ``CLASS`` values.
    """
    labelled = pd.read_csv('/mnt/data/youtube_spam.csv')

    # The raw CONTENT column is kept; the cleaned text lives next to it.
    cleaned = labelled['CONTENT'].apply(preprocess_text)
    labelled['clean_content'] = cleaned

    return labelled['clean_content'], labelled['CLASS']

# Function to perform feature extraction using TF-IDF
def feature_extraction(X_train_val_raw):
    """Fit a TF-IDF vectorizer on the given texts and vectorize them.

    Args:
        X_train_val_raw: The cleaned comment strings to learn the
            vocabulary from.

    Returns:
        ``(matrix, vectorizer)``: the TF-IDF matrix returned by
        ``fit_transform`` and the fitted ``TfidfVectorizer``, so that other
        data can later be transformed with the same vectorizer.
    """
    # Vocabulary is capped at 5000 terms; English stop words are excluded.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    return vectorizer.fit_transform(X_train_val_raw), vectorizer

# Function to load and preprocess testing dataset
def load_testing_data(tfidf):
    """Read the test comments CSV and vectorize it with a fitted TF-IDF.

    Loads ``/mnt/data/youtube_comments_cleaned.csv``, cleans its ``Comment``
    column with ``preprocess_text`` and transforms the cleaned text with the
    supplied vectorizer (``transform`` only - the vectorizer is not refitted).

    Args:
        tfidf: An already fitted vectorizer exposing ``transform``.

    Returns:
        The feature matrix produced by ``tfidf.transform``.
    """
    comments = pd.read_csv('/mnt/data/youtube_comments_cleaned.csv')
    cleaned_comments = comments['Comment'].apply(preprocess_text)
    return tfidf.transform(cleaned_comments)

# Function to initialize models
def initialize_models():
    """Build the unfitted classifiers to compare, keyed by display name.

    Returns:
        A dict, in insertion order: Gaussian Naive Bayes, one ``SVC`` per
        kernel (linear, poly, rbf, sigmoid - each with ``probability=True``)
        and a 100-tree random forest seeded with ``random_state=42``.
    """
    models = {'Naive Bayes': GaussianNB()}

    # One SVM per kernel, named "SVM (<kernel> kernel)".
    for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
        models[f'SVM ({kernel} kernel)'] = SVC(kernel=kernel, probability=True)

    models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
    return models

# Function to train and validate models
def train_and_validate_models(models, X_train, X_val, y_train, y_val, test_size):
    """Fit every model on the training split and score it on the validation split.

    For each model this prints accuracy, recall, precision and F1 (the last
    three with ``average='weighted'``) and saves a confusion-matrix plot via
    ``plot_confusion_matrix``. After all models are processed, a bar chart of
    the validation accuracies is saved via ``plot_validation_results``.

    Args:
        models: Mapping of display name to estimator; each estimator is
            fitted in place.
        X_train: Training features; must expose ``toarray()``.
        X_val: Validation features; must expose ``toarray()``.
        y_train: Training labels.
        y_val: Validation labels.
        test_size: Split fraction, used only to label plots and file names.

    Returns:
        A list with one tuple per model:
        ``(model_name, 'Validation', accuracy, recall, precision, f1)``.
    """
    validation_results = []
    metrics_results = []

    # Densify once up front: the matrices are the same for every model, so
    # converting inside the loop only repeated the same allocation. Dense
    # input is used for all models (GaussianNB does not accept sparse input).
    X_train_dense = X_train.toarray()
    X_val_dense = X_val.toarray()

    print("\nTraining and Validation Phase\n")
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train_dense, y_train)

        # Validate the model
        y_val_pred = model.predict(X_val_dense)
        accuracy = accuracy_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred, average='weighted')
        precision = precision_score(y_val, y_val_pred, average='weighted')
        f1 = f1_score(y_val, y_val_pred, average='weighted')

        # Accuracy alone feeds the bar chart; the full row feeds the metrics table.
        validation_results.append((model_name, accuracy))
        metrics_results.append((model_name, 'Validation', accuracy, recall, precision, f1))

        # Display metrics
        print(f"Model: {model_name}")
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(f"Recall: {recall:.4f}, Precision: {precision:.4f}, F1 Score: {f1:.4f}\n")

        # Confusion Matrix Visualization
        plot_confusion_matrix(y_val, y_val_pred, model_name, 'Validation', test_size)

    # Plotting validation results
    plot_validation_results(validation_results, test_size)

    return metrics_results

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name, phase, test_size):
    """Save a confusion-matrix heatmap for one model and phase as a PNG.

    The image is written to
    ``{output_folder}/confusion_matrix_{model_name}_{phase.lower()}_test_size_{test_size}.png``.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels, same length as ``y_true``.
        model_name: Display name, used in the title and the file name.
        phase: Phase label (e.g. ``'Validation'``), used in the title and,
            lowercased, in the file name.
        test_size: Split fraction, used in the file name.
    """
    # The matrix has one row/column per class seen in EITHER vector. Taking
    # tick labels from y_true alone mislabels (or mis-sizes) the axes when a
    # class shows up only in the predictions, so compute the union once and
    # use it for both the matrix and the ticks.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {model_name} ({phase})')
    plt.savefig(f'{output_folder}/confusion_matrix_{model_name}_{phase.lower()}_test_size_{test_size}.png')
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

# Function to plot validation results
def plot_validation_results(validation_results, test_size):
    """Save a bar chart of validation accuracy per model as a PNG.

    The image is written to
    ``{output_folder}/validation_accuracy_test_size_{test_size}.png``.

    Args:
        validation_results: Sequence of ``(model_name, accuracy)`` pairs.
            When empty, nothing is plotted.
        test_size: Split fraction, used in the title and the file name.
    """
    if not validation_results:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError; there is also nothing meaningful to draw.
        print(f"No validation results to plot (test_size={test_size})")
        return

    model_names, accuracies = zip(*validation_results)
    plt.figure(figsize=(10, 5))
    plt.bar(model_names, accuracies, color='skyblue')
    plt.xlabel('Model')
    plt.ylabel('Validation Accuracy')
    plt.title(f'Validation Accuracy for Different Models (test_size={test_size})')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{output_folder}/validation_accuracy_test_size_{test_size}.png')
    plt.close()

# Function to test models
def test_models(models, X_test, y_train, test_size, y_test=None):
    """Predict the test set with every fitted model and record the results.

    For each model the predictions are printed, scored, and drawn as a
    confusion matrix via ``plot_confusion_matrix``. All predictions are then
    written, one column per model, to
    ``{output_folder}/test_predictions_test_size_{test_size}.csv``.

    NOTE: when ``y_test`` is omitted, the metrics are computed against the
    leading rows of ``y_train``. Those labels belong to different samples, so
    the resulting "Testing" numbers are placeholders and do not measure
    performance on the test set. Pass ``y_test`` to get real metrics.

    Args:
        models: Mapping of display name to an already fitted estimator.
        X_test: Test features; must expose ``toarray()``.
        y_train: Training labels, used as placeholder reference labels when
            ``y_test`` is not given.
        test_size: Split fraction, used only to label plots and file names.
        y_test: Optional true labels for ``X_test``, one per row.

    Returns:
        A list with one tuple per model:
        ``(model_name, 'Testing', accuracy, recall, precision, f1)``.

    Raises:
        ValueError: If ``y_test`` is given and its length differs from the
            number of rows in ``X_test``.
    """
    print("\nTesting Phase\n")

    # Densify once; the matrix is the same for every model.
    X_test_dense = X_test.toarray()
    n_samples = X_test_dense.shape[0]

    if y_test is not None:
        reference = np.asarray(y_test)
        if len(reference) != n_samples:
            raise ValueError(
                f"y_test has {len(reference)} labels but X_test has {n_samples} rows"
            )
    else:
        # Placeholder labels (see NOTE above). Positional slice, as before.
        reference = np.asarray(y_train)[:n_samples]

    # If the test set has more rows than there are placeholder labels, score
    # only the overlapping prefix instead of failing on mismatched lengths.
    n_scored = len(reference)

    predictions = {}
    metrics_results = []

    for model_name, model in models.items():
        # Predict the test set
        y_test_pred = model.predict(X_test_dense)
        predictions[model_name] = y_test_pred

        scored_pred = y_test_pred[:n_scored]
        accuracy = accuracy_score(reference, scored_pred)
        recall = recall_score(reference, scored_pred, average='weighted')
        precision = precision_score(reference, scored_pred, average='weighted')
        f1 = f1_score(reference, scored_pred, average='weighted')
        metrics_results.append((model_name, 'Testing', accuracy, recall, precision, f1))

        # Display the predicted labels
        print(f"Model: {model_name}")
        print(f"Predicted Labels: {y_test_pred}\n")

        # Confusion Matrix Visualization for Testing
        plot_confusion_matrix(reference, scored_pred, model_name, 'Testing', test_size)

    # Save test results to CSV: one column per model, in model order.
    output_path = f'{output_folder}/test_predictions_test_size_{test_size}.csv'
    pd.DataFrame(predictions).to_csv(output_path, index=False)
    print(f"Test predictions saved to '{output_path}'")

    return metrics_results

# Main function to execute the process
def main():
    X_train_val_raw, y_train_val = load_training_data()
    X_train_val, tfidf = feature_extraction(X_train_val_raw)
    X_test = load_testing_data(tfidf)
    models = initialize_models()

    # Define test sizes for splitting
    test_sizes = [0.2, 0.25, 0.3, 0.35]
    
    for test_size in test_sizes:
        print(f"\nUsing test_size = {test_size} for splitting the data\n")
        
        # Split the training and validation data
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=test_size, random_state=42)
        
        # Train and validate models
        metrics_results = train_and_validate_models(models, X_train, X_val, y_train, y_val, test_size)
        
        # Test models
        metrics_results += test_models(models, X_test, y_train, test_size)
        
        # Save metrics results to CSV
        metrics_df = pd.DataFrame(metrics_results
