In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import time

# --- Import the models we want to compare ---
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# --- 1. LOAD AND PREPARE THE DATA ---
print("Loading and preparing the dataset...")

def generate_dummy_text_data(filename='civic_issues_dataset.csv'):
    """Generates a dummy CSV file if it doesn't exist."""
    if pd.io.common.file_exists(filename):
        print(f"Dataset '{filename}' already exists. Skipping generation.")
        return
    print(f"Creating dummy dataset '{filename}'...")
    data = {
        'Description': [
            "The traffic signal at main street is broken and causing jams.",
            "Huge pothole on the highway near the exit ramp.",
            "Graffiti spray painted on the park walls again.",
            "Garbage has not been collected from our neighborhood for a week.",
            "A streetlight on elm street is flickering constantly.",
            "Missed garbage pickup for the third time this month.",
            "Dangerous pothole reported on the corner of 5th and oak.",
            "The park swings are broken and unsafe for children.",
            "Illegal dumping of trash in the empty lot.",
            "The downtown traffic lights are out of sync."
        ] * 10, # Multiply data to get a larger dataset
        'Category': ['Traffic', 'Roads', 'Vandalism', 'Sanitation', 'Utilities', 'Sanitation', 'Roads', 'Parks', 'Sanitation', 'Traffic'] * 10,
        'Issue': ['Broken Signal', 'Pothole', 'Graffiti', 'Missed Pickup', 'Broken Streetlight', 'Missed Pickup', 'Pothole', 'Broken Equipment', 'Illegal Dumping', 'Broken Signal'] * 10,
        'Severity': ['High', 'Medium', 'Low', 'High', 'Medium', 'High', 'High', 'Medium', 'Medium', 'Low'] * 10
    }
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print("✅ Dummy dataset created.")

# Generate dummy data if the CSV is not present
generate_dummy_text_data()
df = pd.read_csv('civic_issues_dataset.csv')

# Define features (X) and targets (y)
X = df['Description']
y = df[['Category', 'Issue', 'Severity']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Dataset loaded. Training with {len(X_train)} samples, testing with {len(X_test)} samples.")


# --- 2. VECTORIZE THE TEXT DATA ---
print("\nVectorizing text descriptions using TF-IDF...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("Text vectorization complete.")


# --- 3. DEFINE AND COMPARE MODELS ---
print("\n--- Starting Model Comparison ---")

# Define the base classifiers you want to compare
models_to_compare = {
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "Linear SVM (LinearSVC)": LinearSVC(random_state=42, dual=True), # dual=True is often better for this scenario
    "Multinomial Naive Bayes": MultinomialNB()
}

results = {}
best_model_name = None
best_model_accuracy = 0.0
best_model_object = None

for name, model in models_to_compare.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    # Wrap each base estimator in a MultiOutputClassifier
    multi_output_model = MultiOutputClassifier(estimator=model, n_jobs=-1)
    multi_output_model.fit(X_train_tfidf, y_train)
    
    # Make predictions
    y_pred = multi_output_model.predict(X_test_tfidf)
    y_pred_df = pd.DataFrame(y_pred, columns=y_test.columns, index=y_test.index)
    
    # --- Calculate Average Accuracy ---
    # 'multiclass-multioutput' is not supported by a single accuracy_score call.
    # We calculate the accuracy for each output column and then average them.
    accuracies = []
    for column in y_test.columns:
        col_accuracy = accuracy_score(y_test[column], y_pred_df[column])
        accuracies.append(col_accuracy)
    
    # The main metric for comparison will be the average accuracy
    average_accuracy = sum(accuracies) / len(accuracies)
    
    end_time = time.time()
    training_time = end_time - start_time
    
    print(f"✅ {name} trained in {training_time:.2f} seconds.")
    print(f"   Average Accuracy (across all outputs): {average_accuracy:.4f}")
    
    # Store results
    results[name] = average_accuracy
    
    # --- Detailed Report ---
    # This report is still valuable for seeing per-class performance
    print("\n--- Detailed Classification Report ---")
    print(classification_report(y_test, y_pred_df, target_names=y_test.columns.tolist(), zero_division=0))

    # Check if this is the best model so far
    if average_accuracy > best_model_accuracy:
        best_model_accuracy = average_accuracy
        best_model_name = name
        best_model_object = multi_output_model

# --- 4. SAVE THE BEST MODEL ---
print("\n--- Comparison Summary ---")
for name, acc in results.items():
    print(f"- {name}: Average Accuracy = {acc:.4f}")

print(f"\n🏆 The best performing model is: **{best_model_name}** with an average accuracy of {best_model_accuracy:.4f}")

print("\nSaving the best model and the vectorizer to disk...")
joblib.dump(best_model_object, 'civic_issue_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("✅ Best model and vectorizer saved successfully!")



Loading and preparing the dataset...
Dataset 'civic_issues_dataset.csv' already exists. Skipping generation.
Dataset loaded. Training with 24000 samples, testing with 6000 samples.

Vectorizing text descriptions using TF-IDF...
Text vectorization complete.

--- Starting Model Comparison ---

Training Logistic Regression...
✅ Logistic Regression trained in 2.15 seconds.
   Average Accuracy (across all outputs): 0.7036

--- Detailed Classification Report ---


ValueError: multiclass-multioutput is not supported