In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# civic_issues_dataset_gpt2.csv

In [2]:
# --- 1. LOAD AND PREPARE THE DATA ---
# Reads the civic-issues CSV, discards incomplete rows, and produces the
# train/test split (X_train, X_test, y_train, y_test) used by later cells.
print("Loading dataset...")
try:
    raw_df = pd.read_csv('civic_issues_dataset.csv')
except FileNotFoundError:
    print("Error: 'civic_issues_dataset.csv' not found.")
    print("Please run the 'generate_issues.py' script first to create the dataset.")
    # Re-raise instead of calling exit(): exit() is a helper for interactive
    # sessions. In a notebook it takes the kernel down rather than failing
    # this cell cleanly, and in a script it ends the process with a success
    # status even though loading failed.
    raise

# Rows with a missing description or label cannot be vectorized or used as
# training targets, so drop them up front and report how many were removed.
df = raw_df.dropna(subset=['Description', 'Category', 'Issue', 'Severity'])
n_dropped = len(raw_df) - len(df)
if n_dropped:
    print(f"Warning: dropped {n_dropped} rows with missing values.")

# Define features (X) and targets (y): free-text description in, three labels out.
X = df['Description']
y = df[['Category', 'Issue', 'Severity']]

# Split the data into training and testing sets (80% train, 20% test).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Dataset loaded. Training with {len(X_train)} samples, testing with {len(X_test)} samples.")

Loading dataset...
Dataset loaded. Training with 24000 samples, testing with 6000 samples.


In [3]:
# --- 2. VECTORIZE THE TEXT DATA ---
print("\nVectorizing text descriptions using TF-IDF...")

# TF-IDF turns each description into a sparse row of term weights.
# English stop words are removed and the vocabulary is limited to 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vocabulary and IDF weights are learned from the training split only.
# The test split is then mapped onto that already-fitted vocabulary, so the
# held-out texts have no influence on the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Text vectorization complete.")


Vectorizing text descriptions using TF-IDF...
Text vectorization complete.


In [4]:
# --- 3. TRAIN THE MULTI-OUTPUT CLASSIFICATION MODEL ---
print("\nTraining the Multi-Output Classifier...")

# Base estimator: L1-penalised logistic regression fitted with the SAGA solver.
# The seed is fixed so repeated runs give the same model.
base_classifier = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    random_state=42,
)

# MultiOutputClassifier fits one copy of the base estimator per target column
# ('Category', 'Issue', 'Severity'); n_jobs=-1 lets those fits run in parallel.
multi_output_classifier = MultiOutputClassifier(base_classifier, n_jobs=-1)

# Fit on the TF-IDF features of the training split.
multi_output_classifier.fit(X_train_tfidf, y_train)
print("Model training complete.")


Training the Multi-Output Classifier...
Model training complete.


In [5]:
# --- 4. EVALUATE THE MODEL ---
print("\nEvaluating the model on the test set...")
y_pred = multi_output_classifier.predict(X_test_tfidf)

# Re-wrap the raw prediction array with y_test's column names and row index
# so each target can be selected by name and lines up with the true labels.
y_pred_df = pd.DataFrame(
    y_pred,
    columns=y_test.columns,
    index=y_test.index,
)

# One classification report per target, in the same order as the label columns.
for target in ('Category', 'Issue', 'Severity'):
    print(f"\n--- Classification Report for '{target}' ---")
    print(classification_report(y_test[target], y_pred_df[target]))


Evaluating the model on the test set...

--- Classification Report for 'Category' ---
                            precision    recall  f1-score   support

     Public Infrastructure       1.00      1.00      1.00       969
             Public Safety       1.00      1.00      1.00      1033
            Road & Traffic       1.00      1.00      1.00       978
Streetlights & Electricity       1.00      1.00      1.00      1024
        Waste & Sanitation       1.00      1.00      1.00       950
   Water Supply & Drainage       1.00      1.00      1.00      1046

                  accuracy                           1.00      6000
                 macro avg       1.00      1.00      1.00      6000
              weighted avg       1.00      1.00      1.00      6000


--- Classification Report for 'Issue' ---
                             precision    recall  f1-score   support

        broken streetlights       1.00      1.00      1.00       235
     broken traffic signals       1.00      1.00

In [6]:
# --- 5. SAVE THE MODEL AND VECTORIZER ---
print("\nSaving the trained model and vectorizer to disk...")

# The classifier and the fitted vectorizer are persisted as separate files;
# both are needed later to turn new text into predictions.
for _artifact, _path in (
    (multi_output_classifier, 'civic_issue_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(_artifact, _path)

print("Model and vectorizer saved successfully as 'civic_issue_model.pkl' and 'tfidf_vectorizer.pkl'.")


Saving the trained model and vectorizer to disk...
✅ Model and vectorizer saved successfully as 'civic_issue_model.pkl' and 'tfidf_vectorizer.pkl'.


In [7]:
# --- 6. EXAMPLE PREDICTION ON NEW DATA ---
print("\n--- Example Prediction ---")

# Reload both artifacts from disk, as a separate application would,
# instead of reusing the in-memory objects from the training cells.
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
loaded_model = joblib.load('civic_issue_model.pkl')

# A single unseen complaint to classify.
new_description = "Large potholes on the highway intersection have already caused multiple accidents."

# transform() expects an iterable of documents, hence the one-element list.
new_description_tfidf = loaded_vectorizer.transform([new_description])
prediction = loaded_model.predict(new_description_tfidf)

# The single result row holds the three predicted labels in target order.
predicted_category, predicted_issue, predicted_severity = prediction[0]

print(f"Input Description: '{new_description}'")
print("\nPredicted Output:")
print(f"  - Category: {predicted_category}")
print(f"  - Issue: {predicted_issue}")
print(f"  - Severity: {predicted_severity}")


--- Example Prediction ---
Input Description: 'Large potholes on the highway intersection have already caused multiple accidents.'

Predicted Output:
  - Category: Road & Traffic
  - Issue: potholes on the road
  - Severity: significant
