In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Create the output directory if it does not already exist.
# The functions in this notebook build every output path (saved models,
# CSV/Excel exports, figures) as f'{output_folder}/...'.
output_folder = 'awal'
os.makedirs(output_folder, exist_ok=True)


In [96]:
# Function to preprocess text data
def preprocess_text(text):
    """Normalise a raw comment into a lowercase, stop-word-free string.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lowercase, split on whitespace, remove the
    scikit-learn English stop words, and re-join with single spaces.

    Args:
        text: The raw comment. Strings are processed as-is. Missing scalars
            (``None``, ``NaN``, ``pd.NA``) yield an empty string; any other
            non-string value (e.g. a numeric-only spreadsheet cell that was
            parsed as a number) is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` when nothing survives the cleaning.
    """
    # pandas hands back NaN for empty cells and numbers for numeric-only
    # cells; re.sub raises TypeError on either, so coerce up front.
    if not isinstance(text, str):
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    # Remove URLs, then special characters and digits, then lowercase.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    # Whitespace tokenisation followed by stop-word removal.
    tokens = text.split()
    cleaned_tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]

    return ' '.join(cleaned_tokens)

In [97]:
# Function to load and preprocess training dataset
def load_training_data(csv_path='../dataset/youtube_spam.csv'):
    """Load the labelled training/validation comments and clean their text.

    Args:
        csv_path: Location of the CSV file. It must contain a ``CONTENT``
            column (raw comment text) and a ``CLASS`` column (target label).
            Defaults to the path the notebook originally hard-coded.

    Returns:
        tuple: ``(X_train_val_raw, y_train_val)`` where the first item is the
        Series of cleaned comment strings and the second is the ``CLASS``
        Series, both indexed like the loaded file.

    Raises:
        KeyError: If ``CONTENT`` or ``CLASS`` is missing from the file.
    """
    train_val_dataset = pd.read_csv(csv_path)

    # Empty cells are read as NaN (a float) and numeric-only cells as numbers;
    # hand the regex-based cleaner plain strings only.
    raw_content = train_val_dataset['CONTENT'].fillna('').astype(str)
    train_val_dataset['clean_content'] = raw_content.apply(preprocess_text)

    # The cleaned text is the only feature; 'CLASS' is the target.
    X_train_val_raw = train_val_dataset['clean_content']
    y_train_val = train_val_dataset['CLASS']

    return X_train_val_raw, y_train_val

In [98]:

# Function to perform feature extraction using TF-IDF
def feature_extraction(X_train_val_raw):
    """Fit a TF-IDF vectoriser on the given texts and vectorise them.

    Args:
        X_train_val_raw: Iterable of cleaned comment strings.

    Returns:
        tuple: ``(matrix, vectorizer)`` - the TF-IDF matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer`` so that other
        text can later be transformed with the same vocabulary.
    """
    # Vocabulary is capped at 5000 terms; English stop words are dropped.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    return vectorizer.fit_transform(X_train_val_raw), vectorizer

# Function to load and preprocess testing dataset
def load_testing_data(tfidf, excel_path='../dataset/youtube_comments_i6IOiUi6IYY.xlsx'):
    """Load the unlabelled test comments, clean them and vectorise them.

    Side effects: writes the loaded sheet plus a ``cleaned_comment`` column
    to ``processed_test_dataset.csv`` and ``processed_test_dataset.xlsx``
    under ``output_folder``.

    Args:
        tfidf: An already fitted vectoriser exposing ``transform``.
        excel_path: Location of the Excel file; it must contain a
            ``Comment`` column. Defaults to the path the notebook
            originally hard-coded.

    Returns:
        tuple: ``(X_test, df)`` where ``X_test`` is the result of
        ``tfidf.transform`` on the cleaned comments and ``df`` is a
        DataFrame with the columns ``comment`` (original value) and
        ``clean_comment``.

    Raises:
        KeyError: If the sheet has no ``Comment`` column.
    """
    test_dataset = pd.read_excel(excel_path)

    # Spreadsheet cells may be empty (NaN) or numeric; clean a string view of
    # the column while leaving the original 'Comment' values untouched.
    raw_comments = test_dataset['Comment'].fillna('').astype(str)
    test_dataset['cleaned_comment'] = raw_comments.apply(preprocess_text)

    # Reuse the vectoriser fitted elsewhere so features line up with training.
    X_test = tfidf.transform(test_dataset['cleaned_comment'])

    # Original and cleaned comments side by side for later analysis.
    df = pd.DataFrame({'comment': test_dataset['Comment'], 'clean_comment': test_dataset['cleaned_comment']})

    # Persist the processed test dataset in both formats.
    test_dataset.to_csv(f'{output_folder}/processed_test_dataset.csv', index=False)
    test_dataset.to_excel(f'{output_folder}/processed_test_dataset.xlsx', index=False)
    print(f"Processed test dataset saved to '{output_folder}/processed_test_dataset.csv' and '{output_folder}/processed_test_dataset.xlsx'")

    return X_test, df


In [99]:

# Function to initialize models
def initialize_models():
    """Build the fresh, unfitted classifiers compared by this notebook.

    Returns:
        dict: Display name -> estimator, in this insertion order:
        Naive Bayes, the four SVM kernels (linear, poly, rbf, sigmoid),
        Random Forest, Logistic Regression, XGBoost.
    """
    models = {'Naive Bayes': GaussianNB()}

    # One SVC per kernel; the kernel name is embedded in the display name.
    for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
        models[f'SVM ({kernel} kernel)'] = SVC(kernel=kernel)

    models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
    models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
    models['XGBoost'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    return models


In [100]:
from sklearn.model_selection import cross_val_score
# Function to train and validate models
def train_and_validate_models(models, X_train, X_val, y_train, y_val, test_size):
    """Fit every model, persist it, and score it on the validation split.

    Side effects, per model: the fitted estimator is dumped with joblib to
    ``{output_folder}/<name with spaces replaced by '_'>_model.pkl``, its
    metrics are printed, and a confusion-matrix PNG is saved. After the loop
    a bar chart of validation accuracies is saved.

    Args:
        models: Mapping of display name -> estimator with ``fit``/``predict``.
        X_train: Training features; a sparse matrix exposing ``toarray()``
            or any array-like.
        X_val: Validation features, same kind as ``X_train``.
        y_train: Training labels.
        y_val: Validation labels.
        test_size: The split fraction; used only in plot titles/file names.

    Returns:
        list[tuple]: One ``(model_name, 'Validation', accuracy, recall,
        precision, f1)`` tuple per model; recall, precision and F1 are
        weighted averages.
    """
    validation_results = []
    metrics_results = []

    # Densify once instead of once per model: the conversion is identical for
    # every estimator and some of them (e.g. GaussianNB) need dense input.
    X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else np.asarray(X_train)
    X_val_dense = X_val.toarray() if hasattr(X_val, 'toarray') else np.asarray(X_val)

    print("\nTraining and Validation Phase\n")
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train_dense, y_train)

        # Save the trained model so the testing phase can reload it by name
        model_filename = f'{output_folder}/{model_name.replace(" ", "_")}_model.pkl'
        joblib.dump(model, model_filename)
        print(f"Model {model_name} saved to {model_filename}")

        # Validate the model
        y_val_pred = model.predict(X_val_dense)
        accuracy = accuracy_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred, average='weighted')
        precision = precision_score(y_val, y_val_pred, average='weighted')
        f1 = f1_score(y_val, y_val_pred, average='weighted')

        # Append metrics to results
        validation_results.append((model_name, accuracy))
        metrics_results.append((model_name, 'Validation', accuracy, recall, precision, f1))

        # Display metrics
        print(f"Model: {model_name}")
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(f"Recall: {recall:.4f}, Precision: {precision:.4f}, F1 Score: {f1:.4f}\n")

        # Confusion Matrix Visualization
        plot_confusion_matrix(y_val, y_val_pred, model_name, 'Validation', test_size)

    # Plotting validation results
    plot_validation_results(validation_results, test_size)

    return metrics_results

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name, phase, test_size):
    """Save a confusion-matrix heatmap as a PNG under ``output_folder``.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels, same length as ``y_true``.
        model_name: Display name of the model (used in title and file name).
        phase: Phase name shown in the title; lower-cased in the file name.
        test_size: Split fraction; used in the file name only.

    Returns:
        None. The figure is written to
        ``confusion_matrix_<model_name>_<phase>_test_size_<test_size>.png``
        and then closed.
    """
    # Rows/columns of the matrix cover every label seen in EITHER vector.
    # Deriving tick labels from y_true alone mislabels the axes whenever a
    # model predicts a class that is absent from y_true, so compute the
    # union once and use it for both the matrix and the ticks.
    labels = np.unique(np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix for {model_name} ({phase})')
    fig.savefig(f'{output_folder}/confusion_matrix_{model_name}_{phase.lower()}_test_size_{test_size}.png')
    # Close explicitly: this runs once per model per split and open figures
    # would otherwise accumulate in memory.
    plt.close(fig)

# Function to plot validation results
def plot_validation_results(validation_results, test_size):
    """Save a bar chart of validation accuracy per model.

    Args:
        validation_results: Sequence of ``(model_name, accuracy)`` pairs.
        test_size: Split fraction; shown in the title and used in the
            file name.

    Returns:
        None. The chart is written to
        ``{output_folder}/validation_accuracy_test_size_<test_size>.png``.
        Nothing is written when ``validation_results`` is empty.
    """
    # zip(*[]) yields nothing, so the two-name unpacking below would raise
    # ValueError; there is nothing to draw in that case anyway.
    if not validation_results:
        print("No validation results to plot.")
        return

    model_names, accuracies = zip(*validation_results)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(model_names, accuracies, color='skyblue')
    ax.set_xlabel('Model')
    ax.set_ylabel('Validation Accuracy')
    ax.set_title(f'Validation Accuracy for Different Models (test_size={test_size})')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(f'{output_folder}/validation_accuracy_test_size_{test_size}.png')
    plt.close(fig)

In [None]:
# Function to test models
def test_models(models, X_test, df,test_size):
    """Reload each saved model, predict on the test set and export results.

    Side effects (all under ``output_folder``, suffixed with the test size):
    a CSV of per-model predictions next to the comments, a CSV of metrics
    (only when ``df`` has an ``actual_labels`` column), a CSV of predicted
    ham/spam counts, a stacked bar chart of those counts, and one
    confusion-matrix PNG per model when labels are available.

    Args:
        models: Mapping whose keys are the model display names; only the
            keys are used, to locate ``<name>_model.pkl`` files on disk.
        X_test: Test features; a sparse matrix exposing ``toarray()`` or any
            array-like.
        df: DataFrame aligned row-for-row with ``X_test``; copied and
            extended with one prediction column per model. An optional
            ``actual_labels`` column enables metric computation.
        test_size: Split fraction; used in file names and the chart title.

    Returns:
        list[tuple]: ``(model_name, test_size, accuracy, precision, recall,
        f1)`` per model when ``actual_labels`` is present, else an empty list.
        Note the precision-before-recall order.
    """
    print("\nTesting Phase\n")
    test_results = []
    metrics_results = []

    # Densify once; the conversion is the same for every model.
    X_test_dense = X_test.toarray() if hasattr(X_test, 'toarray') else np.asarray(X_test)
    has_labels = 'actual_labels' in df.columns

    for model_name in models.keys():
        # Load the trained model.
        # NOTE: joblib.load unpickles arbitrary objects. That is acceptable
        # only for files this notebook wrote itself during training; never
        # point it at untrusted files.
        model_filename = f'{output_folder}/{model_name.replace(" ", "_")}_model.pkl'
        model = joblib.load(model_filename)
        print(f"Model {model_name} loaded from {model_filename}")

        # Predict the test set
        y_test_pred = model.predict(X_test_dense)

        # Store the predicted labels
        test_results.append((model_name, y_test_pred))

        # Display the predicted labels
        print(f"Model: {model_name}")
        print(f"Predicted Labels: {y_test_pred}\n")

        # Metrics are only meaningful when ground-truth labels are supplied.
        if has_labels:
            y_test_actual = df['actual_labels']

            accuracy = accuracy_score(y_test_actual, y_test_pred)
            recall = recall_score(y_test_actual, y_test_pred, average='weighted')
            precision = precision_score(y_test_actual, y_test_pred, average='weighted')
            f1 = f1_score(y_test_actual, y_test_pred, average='weighted')

            metrics_results.append((model_name, test_size, accuracy, precision, recall, f1))

            plot_confusion_matrix(y_test_actual, y_test_pred, model_name, 'Testing', test_size)
        else:
            print(f"Warning: Actual labels are not provided for meaningful evaluation of {model_name}")

    # Save test results to CSV including original and cleaned comments
    output_df = df.copy()
    for model_name, y_test_pred in test_results:
        output_df[model_name] = y_test_pred
    output_df.to_csv(f'{output_folder}/test_predictions_test_size_{test_size}.csv', index=False)
    print(f"Test predictions saved to '{output_folder}/test_predictions_test_size_{test_size}.csv'")

    # Save metrics results to CSV
    if metrics_results:
        metrics_df = pd.DataFrame(metrics_results, columns=['Model', 'Test_Size', 'Accuracy', 'Precision', 'Recall', 'F1'])
        metrics_df.to_csv(f'{output_folder}/test_metrics_results_test_size_{test_size}.csv', index=False)
        print(f"Test metrics results saved to '{output_folder}/test_metrics_results_test_size_{test_size}.csv'")

    # Display statistics of prediction results and save
    stats = []
    for model_name, _ in test_results:
        # Count once and read both classes from the same tally.
        counts = output_df[model_name].value_counts()
        count_0 = counts.get(0, 0)
        count_1 = counts.get(1, 0)
        stats.append({'Model': model_name, 'Predicted_Ham': count_0, 'Predicted_Spam': count_1})
        print(f"Model: {model_name}")
        print(f"Predicted 'ham' (0): {count_0}")
        print(f"Predicted 'spam' (1): {count_1}\n")

    # Save statistics to CSV
    stats_df = pd.DataFrame(stats)
    stats_df.to_csv(f'{output_folder}/prediction_statistics_test_size_{test_size}.csv', index=False)
    print(f"Prediction statistics saved to '{output_folder}/prediction_statistics_test_size_{test_size}.csv'")

    # Visualize the statistics as one stacked bar per model. Each series is
    # drawn in a single call so its label appears once, and the legend is
    # actually rendered (the colours were previously unexplained).
    stat_names = [entry['Model'] for entry in stats]
    ham_counts = [entry['Predicted_Ham'] for entry in stats]
    spam_counts = [entry['Predicted_Spam'] for entry in stats]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(stat_names, ham_counts, color='blue', alpha=0.6, label='Ham')
    ax.bar(stat_names, spam_counts, bottom=ham_counts, color='red', alpha=0.6, label='Spam')
    ax.set_xlabel('Model')
    ax.set_ylabel('Count of Predictions')
    ax.set_title(f'Prediction Counts for Each Model (test_size={test_size})')
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'{output_folder}/prediction_statistics_visualization_test_size_{test_size}.png')
    plt.close(fig)

    return metrics_results

In [102]:
# Main function to execute the process
def main():
    """Run the full experiment: load data, then train/validate/test per split.

    For each split fraction the models are refitted, validated, reloaded for
    testing, and a combined metrics CSV is written to
    ``{output_folder}/metrics_results_test_size_<test_size>.csv``.
    """
    X_train_val_raw, y_train_val = load_training_data()
    # NOTE(review): the TF-IDF vectoriser is fitted on all labelled rows
    # before they are split, so validation rows contribute to the vocabulary
    # and IDF weights; validation scores may be slightly optimistic.
    X_train_val, tfidf = feature_extraction(X_train_val_raw)
    X_test, df = load_testing_data(tfidf)
    models = initialize_models()

    # Define test sizes for splitting
    test_sizes = [0.2, 0.25, 0.3, 0.35]

    for test_size in test_sizes:
        print(f"\nUsing test_size = {test_size} for splitting the data\n")

        # Split the training and validation data
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=test_size, random_state=42)

        # Train and validate models
        validation_metrics = train_and_validate_models(models, X_train, X_val, y_train, y_val, test_size)

        # Test models
        test_metrics = test_models(models, X_test, df, test_size)

        # test_models yields (model, test_size, accuracy, precision, recall, f1)
        # whereas the validation rows are (model, phase, accuracy, recall,
        # precision, f1). Re-shape the test rows so that every value lands
        # under the right column header instead of swapping Precision/Recall
        # and writing the split fraction into 'Phase'.
        testing_rows = [
            (name, 'Testing', accuracy, recall, precision, f1)
            for name, _size, accuracy, precision, recall, f1 in test_metrics
        ]

        # Save metrics results to CSV
        metrics_df = pd.DataFrame(
            validation_metrics + testing_rows,
            columns=['Model', 'Phase', 'Accuracy', 'Recall', 'Precision', 'F1'],
        )
        metrics_df.to_csv(f'{output_folder}/metrics_results_test_size_{test_size}.csv', index=False)
        print(f"Metrics results saved to '{output_folder}/metrics_results_test_size_{test_size}.csv'")

if __name__ == "__main__":
    main()


Processed test dataset saved to 'awal/processed_test_dataset.csv' and 'awal/processed_test_dataset.xlsx'

Using test_size = 0.2 for splitting the data


Training and Validation Phase

Model Naive Bayes saved to awal/Naive_Bayes_model.pkl
Model: Naive Bayes
Validation Accuracy: 0.7423
Recall: 0.7423, Precision: 0.7586, F1 Score: 0.7423

Model SVM (linear kernel) saved to awal/SVM_(linear_kernel)_model.pkl
Model: SVM (linear kernel)
Validation Accuracy: 0.8980
Recall: 0.8980, Precision: 0.9076, F1 Score: 0.8982

Model SVM (poly kernel) saved to awal/SVM_(poly_kernel)_model.pkl
Model: SVM (poly kernel)
Validation Accuracy: 0.8699
Recall: 0.8699, Precision: 0.8703, F1 Score: 0.8694

Model SVM (rbf kernel) saved to awal/SVM_(rbf_kernel)_model.pkl
Model: SVM (rbf kernel)
Validation Accuracy: 0.9107
Recall: 0.9107, Precision: 0.9146, F1 Score: 0.9109

Model SVM (sigmoid kernel) saved to awal/SVM_(sigmoid_kernel)_model.pkl
Model: SVM (sigmoid kernel)
Validation Accuracy: 0.8801
Recall: 0.8801,

KeyboardInterrupt: 