In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

# Function to load the AG News dataset from CSV files or create a split from a single file
def _check_columns(df, text_col, label_col, source):
    """Raise ValueError unless *df* has distinct *text_col* and *label_col* columns."""
    if text_col == label_col:
        raise ValueError("the text column and the label column must be different")
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise ValueError(
            f"column(s) {missing} not found in {source}; "
            f"available columns: {df.columns.tolist()}"
        )


def load_ag_news_from_files():
    """Interactively load a train/test pair of DataFrames from CSV files.

    The user is prompted either for separate training and test CSV paths, or
    (by answering 'single') for one CSV that is split 80/20 with
    stratification on the label column. In both cases the user names the text
    and label columns, which are renamed to 'text' and 'class'; all other
    columns are kept as they are.

    The chosen column names are checked against every loaded frame before
    renaming. ``DataFrame.rename`` ignores unknown names, so without the check
    a typo would only surface later as a missing 'text' or 'class' column.

    Returns:
        (train_df, test_df) on success, or (None, None) if anything goes
        wrong (the error is printed rather than raised).
    """
    try:
        # Option 1: User provides separate train and test files
        train_path = input(
            "Enter path to AG News training CSV file (or enter 'single' if you have only one file): "
        ).strip()

        if train_path.lower() == 'single':
            # Option 2: User provides a single file that we'll split
            file_path = input("Enter path to your single AG News CSV file: ").strip()
            df = pd.read_csv(file_path)

            # Confirm column names
            print("Available columns in your file:", df.columns.tolist())
            text_col = input("Enter the name of the text column: ")
            label_col = input("Enter the name of the label/class column: ")
            _check_columns(df, text_col, label_col, "your file")

            # Create train/test split
            train_df, test_df = train_test_split(
                df, test_size=0.2, random_state=42, stratify=df[label_col]
            )
        else:
            # Load separate train and test files
            test_path = input("Enter path to AG News test CSV file: ").strip()

            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            # Confirm column names
            print("Available columns in your training file:", train_df.columns.tolist())
            text_col = input("Enter the name of the text column: ")
            label_col = input("Enter the name of the label/class column: ")

            # The same names are applied to both files, so both must have them.
            _check_columns(train_df, text_col, label_col, "the training file")
            _check_columns(test_df, text_col, label_col, "the test file")

        # Rename columns for consistency
        renames = {text_col: 'text', label_col: 'class'}
        train_df = train_df.rename(columns=renames)
        test_df = test_df.rename(columns=renames)

        print(f"Training data size: {len(train_df)}")
        print(f"Test data size: {len(test_df)}")

        return train_df, test_df
    except Exception as e:
        # Deliberately broad: this is the loader's boundary and the caller
        # relies on the (None, None) sentinel instead of an exception.
        print(f"Error loading dataset: {e}")
        return None, None

# Alternative option: Create a simulated dataset for testing
def create_sample_dataset():
    """Build a small 4-class train/test pair from the 20 Newsgroups corpus.

    Four newsgroups are fetched with headers, footers and quotes removed,
    wrapped in a DataFrame with 'text' and 'class' columns, and split 80/20
    with stratification on the class (random_state=42).

    Fetching relies on scikit-learn's ``fetch_20newsgroups``; per its
    documentation the corpus may be downloaded on first use.

    Returns:
        (train_df, test_df) DataFrames with 'text' and 'class' columns.
    """
    print("Creating a small sample dataset for testing...")

    # For testing: Creating a small sample dataset
    from sklearn.datasets import fetch_20newsgroups

    # Get subset of 20 newsgroups for 4 categories (similar to AG News)
    categories = ['comp.graphics', 'sci.med', 'rec.sport.baseball', 'talk.politics.misc']
    newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

    # Create dataframe
    df = pd.DataFrame({
        'text': newsgroups.data,
        'class': newsgroups.target
    })

    # Split into train and test
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])

    # The integer targets index into target_names, whose order is not the
    # order of the `categories` request list, so report target_names to give
    # the true class-id -> newsgroup mapping.
    class_names = list(newsgroups.target_names)

    print("Created sample dataset from 20 Newsgroups")
    print(f"Training data size: {len(train_df)}")
    print(f"Test data size: {len(test_df)}")
    print(f"This sample uses classes 0-{len(class_names) - 1}, representing: {class_names}")

    return train_df, test_df

# Feature extraction with TF-IDF
def extract_features(train_texts, test_texts):
    """Vectorise two collections of documents with a shared TF-IDF vocabulary.

    The vectorizer is fitted on *train_texts* only and then applied unchanged
    to *test_texts*, so both matrices have the same columns.

    Returns:
        (X_train, X_test, vectorizer) - the two feature matrices and the
        fitted TfidfVectorizer.
    """
    print("Extracting TF-IDF features...")

    tfidf_settings = {
        'max_features': 20000,
        'ngram_range': (1, 2),  # single words plus adjacent word pairs
        'stop_words': 'english',
        'min_df': 5,
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    for split_label, matrix in (("Training", X_train), ("Test", X_test)):
        print(f"{split_label} features shape: {matrix.shape}")

    return X_train, X_test, vectorizer

# Function to evaluate models and return metrics
def evaluate_model(y_true, y_pred):
    """Score predictions against the true labels.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three are computed with average='weighted'.
    """
    scores = {'accuracy': accuracy_score(y_true, y_pred)}

    # The fourth element of the result (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    scores.update(zip(('precision', 'recall', 'f1'), weighted[:3]))

    return scores

# Function to train and evaluate each model
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    """Fit *model*, predict on the test split, and print and return its scores.

    Args:
        model: object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training features and labels passed to ``fit``.
        X_test, y_test: test features passed to ``predict`` and the labels
            the predictions are scored against.
        model_name: display name used in the printed header and the result.

    Returns:
        dict with keys 'model_name', 'model' (the fitted object), 'accuracy',
        'precision', 'recall', 'f1', 'train_time', 'predict_time' (both in
        seconds) and 'predictions'.
    """
    print(f"\n===== Training {model_name} =====")

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can be coarse
    # or jump when the system clock is adjusted.

    # Training
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time
    print(f"Training time: {train_time:.2f} seconds")

    # Prediction
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    predict_time = time.perf_counter() - start_time
    print(f"Prediction time: {predict_time:.2f} seconds")

    # Evaluation
    metrics = evaluate_model(y_test, y_pred)

    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-score: {metrics['f1']:.4f}")

    # Detailed report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return {
        'model_name': model_name,
        'model': model,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1'],
        'train_time': train_time,
        'predict_time': predict_time,
        'predictions': y_pred
    }

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name, class_names=None):
    """Save a confusion-matrix heatmap for one model as a PNG file.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        model_name: used in the plot title and, lower-cased with spaces
            replaced by underscores, in the output file name
            ``confusion_matrix_<model_name>.png``.
        class_names: optional tick labels for both axes. When omitted the
            heatmap's automatic tick labels are used.
    """
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred)

    # seaborn's tick-label parameters accept "auto", a bool, an int or a
    # list-like; None is not one of them, so translate the default.
    tick_labels = "auto" if class_names is None else class_names

    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_").lower()}.png')
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

# Function to compare model performances
def compare_models(results):
    """Tabulate and chart the per-model results.

    Prints a comparison table, writes it to 'model_comparison.csv', and saves
    two bar charts: 'model_performance_comparison.png' (accuracy, precision,
    recall, f1) and 'model_time_comparison.png' (training and prediction
    times).

    Args:
        results: list of result dicts carrying 'model_name', 'accuracy',
            'precision', 'recall', 'f1', 'train_time' and 'predict_time'.

    Returns:
        The comparison DataFrame, one row per model.
    """
    score_keys = ['accuracy', 'precision', 'recall', 'f1']
    model_names = [entry['model_name'] for entry in results]

    def column(key):
        # One value per model, in the same order as `results`.
        return [entry[key] for entry in results]

    # Table column title -> key in each result dict.
    table_layout = {
        'Model': 'model_name',
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-score': 'f1',
        'Training Time (s)': 'train_time',
        'Prediction Time (s)': 'predict_time',
    }
    comparison_df = pd.DataFrame(
        {title: column(key) for title, key in table_layout.items()}
    )

    print("\n===== Model Comparison =====")
    print(comparison_df.to_string(index=False))

    comparison_df.to_csv('model_comparison.csv', index=False)

    positions = np.arange(len(model_names))

    # Chart 1: one group of four bars per model.
    score_fig, score_ax = plt.subplots(figsize=(12, 8))
    group_bar_width = 0.2
    for offset, key in enumerate(score_keys):
        score_ax.bar(
            positions + offset * group_bar_width,
            column(key),
            group_bar_width,
            label=key.capitalize(),
        )
    score_ax.set_xlabel('Models')
    score_ax.set_ylabel('Score')
    score_ax.set_title('Model Performance Comparison')
    # Centre each tick under its group of four bars.
    score_ax.set_xticks(positions + group_bar_width * 1.5)
    score_ax.set_xticklabels(model_names, rotation=45)
    score_ax.legend()
    score_fig.tight_layout()
    score_fig.savefig('model_performance_comparison.png')
    plt.close(score_fig)

    # Chart 2: training vs prediction time, side by side per model.
    time_fig, time_ax = plt.subplots(figsize=(12, 6))
    pair_bar_width = 0.35
    time_ax.bar(positions - pair_bar_width / 2, column('train_time'), pair_bar_width,
                label='Training Time (s)')
    time_ax.bar(positions + pair_bar_width / 2, column('predict_time'), pair_bar_width,
                label='Prediction Time (s)')
    time_ax.set_xlabel('Models')
    time_ax.set_ylabel('Time (seconds)')
    time_ax.set_title('Model Training and Prediction Times')
    time_ax.set_xticks(positions)
    time_ax.set_xticklabels(model_names, rotation=45)
    time_ax.legend()
    time_fig.tight_layout()
    time_fig.savefig('model_time_comparison.png')
    plt.close(time_fig)

    return comparison_df

def _build_models():
    """Return the named estimators to compare, in the order they are run."""
    return [
        {
            'name': 'Logistic Regression',
            'model': LogisticRegression(C=1.0, max_iter=100, solver='liblinear', random_state=42)
        },
        {
            'name': 'Multinomial Naive Bayes',
            'model': MultinomialNB(alpha=0.1)
        },
        {
            'name': 'Linear SVM',
            'model': LinearSVC(C=1.0, max_iter=1000, dual=False, random_state=42)
        },
        {
            'name': 'Random Forest',
            'model': RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=42)
        },
        {
            'name': 'K-Nearest Neighbors',
            'model': KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)
        }
    ]


def _drop_incomplete_rows(df, split_name):
    """Drop rows whose 'text' or 'class' is missing, reporting how many went."""
    cleaned = df.dropna(subset=['text', 'class'])
    dropped = len(df) - len(cleaned)
    if dropped:
        print(f"Dropped {dropped} {split_name} rows with missing text or label")
    return cleaned


def _class_names_for(labels, is_ag_news):
    """Return one display name per label value in *labels*."""
    if is_ag_news and len(labels) == 4:
        return ["World", "Sports", "Business", "Sci/Tech"]
    # Name classes after their actual label values rather than 0..n-1, so the
    # names stay correct for label sets that do not start at zero.
    return [f"Class {label}" for label in labels]


def _write_observations(path, dataset_name, comparison_df, results, best_model):
    """Write the plain-text summary report for a finished comparison run."""
    fastest_training = min(results, key=lambda x: x['train_time'])
    fastest_prediction = min(results, key=lambda x: x['predict_time'])

    model_notes = [
        ("Logistic Regression", [
            "Good balance between accuracy and training speed",
            "Works well with high-dimensional sparse data like TF-IDF",
        ]),
        ("Multinomial Naive Bayes", [
            "Very fast training and prediction",
            "Good performance for text classification with count-based features",
        ]),
        ("Linear SVM", [
            "Usually achieves high accuracy on text classification tasks",
            "Works well with high-dimensional data but can be slow to train on large datasets",
        ]),
        ("Random Forest", [
            "Robust to overfitting",
            "Can capture non-linear relationships but may not be ideal for high-dimensional sparse data",
        ]),
        ("K-Nearest Neighbors", [
            "Simple implementation but often slower for predictions",
            "Performance heavily dependent on feature scaling and neighborhood size",
        ]),
    ]
    sections = [
        f"### {name}\n" + "".join(f"- {note}\n" for note in notes)
        for name, notes in model_notes
    ]

    # Explicit encoding so the report does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"# Text Classification Model Comparison on {dataset_name} Dataset\n\n")
        f.write("## Performance Summary\n\n")
        f.write(f"{comparison_df.to_string(index=False)}\n")
        f.write("\n\n## Observations\n\n")
        f.write(f"- Best performing model: {best_model['model_name']} with F1-score of {best_model['f1']:.4f}\n")
        f.write(f"- Fastest training model: {fastest_training['model_name']} ({fastest_training['train_time']:.2f}s)\n")
        f.write(f"- Fastest prediction model: {fastest_prediction['model_name']} ({fastest_prediction['predict_time']:.2f}s)\n")
        f.write("\n## Model Strengths and Trade-offs\n\n")
        f.write("\n".join(sections))


def main():
    """Run the interactive model-comparison pipeline end to end.

    Asks whether to load AG News CSV files or build the 20 Newsgroups sample,
    extracts TF-IDF features, trains and evaluates every model from
    ``_build_models``, saves a confusion matrix per model, compares the
    results, and writes 'model_observations.txt'.

    The AG News class names are only used when the AG News option was chosen
    and four classes were found; any other data gets names built from its own
    label values. Rows with missing text or label are dropped before feature
    extraction.
    """
    print("AG News Text Classification with Multiple ML Models")
    print("=================================================")
    print("1. Load data from local CSV files")
    print("2. Create sample dataset for testing (using 20 Newsgroups)")
    choice = input("Enter your choice (1/2): ").strip()

    use_ag_news = choice == "1"
    if use_ag_news:
        # Load AG News dataset from files
        train_df, test_df = load_ag_news_from_files()
    else:
        # Create sample dataset
        train_df, test_df = create_sample_dataset()

    if train_df is None or test_df is None:
        print("Could not load or create dataset. Exiting.")
        return

    # Fail with a clear message instead of a KeyError further down.
    for split_name, frame in (("training", train_df), ("test", test_df)):
        missing = [col for col in ('text', 'class') if col not in frame.columns]
        if missing:
            print(f"The {split_name} data has no {missing} column(s). Exiting.")
            return

    train_df = _drop_incomplete_rows(train_df, "training")
    test_df = _drop_incomplete_rows(test_df, "test")

    # Extract features
    X_train, X_test, vectorizer = extract_features(train_df['text'], test_df['text'])
    y_train = train_df['class']
    y_test = test_df['class']

    # Determine number of classes and create class names
    labels = np.unique(y_train)
    num_classes = len(labels)
    class_names = _class_names_for(labels, use_ag_news)

    print(f"Detected {num_classes} classes: {class_names}")

    # Train and evaluate all models
    results = []
    for model_info in _build_models():
        result = train_and_evaluate(
            model_info['model'],
            X_train, y_train,
            X_test, y_test,
            model_info['name']
        )
        results.append(result)

        # Plot confusion matrix
        plot_confusion_matrix(y_test, result['predictions'], model_info['name'], class_names)

    # Compare models
    comparison_df = compare_models(results)

    # Identify best model
    best_model = max(results, key=lambda x: x['f1'])
    print(f"\nBest performing model: {best_model['model_name']}")
    print(f"F1-score: {best_model['f1']:.4f}")
    print(f"Accuracy: {best_model['accuracy']:.4f}")

    # Optional: Write summary observations
    dataset_name = "AG News" if use_ag_news else "20 Newsgroups Sample"
    _write_observations('model_observations.txt', dataset_name, comparison_df, results, best_model)

# Start the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

AG News Text Classification with Multiple ML Models
1. Load data from local CSV files
2. Create sample dataset for testing (using 20 Newsgroups)
Enter your choice (1/2): 1
Enter path to AG News training CSV file (or enter 'single' if you have only one file): /content/train.csv
Enter path to AG News test CSV file: /content/test.csv
Available columns in your training file: ['Class Index', 'Title', 'Description']
Enter the name of the text column: Description
Enter the name of the label/class column: Class Index
Training data size: 120000
Test data size: 7600
Extracting TF-IDF features...
Training features shape: (120000, 20000)
Test features shape: (7600, 20000)
Detected 4 classes: ['World', 'Sports', 'Business', 'Sci/Tech']

===== Training Logistic Regression =====
Training time: 6.39 seconds
Prediction time: 0.00 seconds
Accuracy: 0.9089
Precision: 0.9087
Recall: 0.9089
F1-score: 0.9087

Classification Report:
              precision    recall  f1-score   support

           1       0.