In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib

# Read the raw complaints export and parse the filing timestamp so its
# calendar components can be used as numeric features.
df = pd.read_csv('final_complaints.csv')
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Keep a standalone copy of the parsed filing dates; the column itself is
# removed from the feature frame just below.
original_filing_dates = df['filing_date'].copy()

# Replace the raw timestamp with year / month / day-of-month columns.
df = df.assign(
    filing_year=df['filing_date'].dt.year,
    filing_month=df['filing_date'].dt.month,
    filing_day=df['filing_date'].dt.day,
).drop(columns='filing_date')

# One-hot encode the categorical columns, dropping the first level of each
# so every category is represented against a baseline.
df = pd.get_dummies(df, drop_first=True)

# Separate the regression target from the predictor columns.
y = df['resolved_days_new']
X = df.drop(columns='resolved_days_new')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training rows only, then
# apply the same transform to both partitions (optional for Random Forest).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 100-tree random forest regressor with a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

# Persist the fitted model to disk.
joblib.dump(model, 'resolution_time_predictor.pkl')

# ========== Predict on Full Dataset ==========

# Re-read the raw file so scoring starts from the on-disk data rather than
# the already-encoded training frame.
new_data_df = pd.read_csv('final_complaints.csv')
new_data_df['filing_date'] = pd.to_datetime(new_data_df['filing_date'])

# Keep the parsed filing dates; they are needed later to turn predicted
# durations into calendar dates.
original_filing_dates = new_data_df['filing_date'].copy()

# Derive the same calendar features that were used for training.
new_data_df['filing_year'] = new_data_df['filing_date'].dt.year
new_data_df['filing_month'] = new_data_df['filing_date'].dt.month
new_data_df['filing_day'] = new_data_df['filing_date'].dt.day

# The raw timestamp is not a model input.
new_data_df = new_data_df.drop(columns='filing_date')

# One-hot encode WITHOUT drop_first. Which level drop_first removes depends
# on the categories present in the frame being encoded, so encoding new data
# independently with drop_first=True can drop a different level than training
# did and silently mis-encode rows as the training baseline category.
# Emitting every level here and letting the reindex below keep only the
# training columns reproduces the training encoding exactly.
new_data_df = pd.get_dummies(new_data_df, drop_first=False)

# Align to the training feature set and order: columns the model never saw
# (including the training baseline levels and the target) are discarded, and
# training columns absent from this data are filled with 0.
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Apply the scaler fitted on the training rows.
new_data_scaled = scaler.transform(new_data_df)

# Predict resolution time (in days) for every row.
predicted_resolution_times = model.predict(new_data_scaled)

# ========== Display Results ==========

# Show the first few predictions. The preview is capped at the number of
# rows actually available so a dataset shorter than `num_display` does not
# raise IndexError.
num_display = 10
rows_to_show = min(num_display, len(predicted_resolution_times))
print(f"\nShowing first {rows_to_show} predictions:\n")

preview_rows = zip(
    original_filing_dates.iloc[:rows_to_show],
    predicted_resolution_times[:rows_to_show],
)
for row_number, (filing_date, predicted_days) in enumerate(preview_rows, start=1):
    # Predicted duration is in (fractional) days; add it to the filing date.
    expected_completion_date = filing_date + pd.to_timedelta(predicted_days, unit='D')
    print(f"Predicted Resolution Time for row {row_number}: {predicted_days:.2f} days")
    print(f"Expected Completion Date for row {row_number}: {expected_completion_date.strftime('%d %B %Y')}\n")


Mean Absolute Error: 1.51

Showing first 10 predictions:

Predicted Resolution Time for row 1: 15.91 days
Expected Completion Date for row 1: 28 January 2024

Predicted Resolution Time for row 2: 0.00 days
Expected Completion Date for row 2: 28 February 2024

Predicted Resolution Time for row 3: 14.32 days
Expected Completion Date for row 3: 17 February 2024

Predicted Resolution Time for row 4: 0.00 days
Expected Completion Date for row 4: 09 April 2024

Predicted Resolution Time for row 5: 0.00 days
Expected Completion Date for row 5: 20 January 2024

Predicted Resolution Time for row 6: 0.00 days
Expected Completion Date for row 6: 28 March 2024

Predicted Resolution Time for row 7: 0.00 days
Expected Completion Date for row 7: 12 February 2024

Predicted Resolution Time for row 8: 23.35 days
Expected Completion Date for row 8: 29 April 2024

Predicted Resolution Time for row 9: 14.58 days
Expected Completion Date for row 9: 17 January 2024

Predicted Resolution Time for row 10: 0.00 days

In [5]:
from tqdm import tqdm

# ========== Display Results with Progress Bar ==========
# `tqdm` is already imported at the top of this cell, so it is not imported again here.

# Cap the preview at the number of rows actually available so a dataset
# shorter than `num_display` does not raise IndexError.
num_display = 10
rows_to_show = min(num_display, len(predicted_resolution_times))
print(f"\nShowing first {rows_to_show} predictions:\n")

for i in tqdm(range(rows_to_show), desc="Generating Predictions"):
    filing_date = original_filing_dates.iloc[i]
    predicted_days = predicted_resolution_times[i]
    # Predicted duration is in (fractional) days; add it to the filing date.
    expected_completion_date = filing_date + pd.to_timedelta(predicted_days, unit='D')
    print(f"Predicted Resolution Time for row {i+1}: {predicted_days:.2f} days")
    print(f"Expected Completion Date for row {i+1}: {expected_completion_date.strftime('%d %B %Y')}\n")



Showing first 10 predictions:



Generating Predictions: 100%|████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 6365.62it/s]

Predicted Resolution Time for row 1: 15.91 days
Expected Completion Date for row 1: 28 January 2024

Predicted Resolution Time for row 2: 0.00 days
Expected Completion Date for row 2: 28 February 2024

Predicted Resolution Time for row 3: 14.32 days
Expected Completion Date for row 3: 17 February 2024

Predicted Resolution Time for row 4: 0.00 days
Expected Completion Date for row 4: 09 April 2024

Predicted Resolution Time for row 5: 0.00 days
Expected Completion Date for row 5: 20 January 2024

Predicted Resolution Time for row 6: 0.00 days
Expected Completion Date for row 6: 28 March 2024

Predicted Resolution Time for row 7: 0.00 days
Expected Completion Date for row 7: 12 February 2024

Predicted Resolution Time for row 8: 23.35 days
Expected Completion Date for row 8: 29 April 2024

Predicted Resolution Time for row 9: 14.58 days
Expected Completion Date for row 9: 17 January 2024

Predicted Resolution Time for row 10: 0.00 days
Expected Completion Date for row 10: 16 January 2024




In [6]:
# Persist the fitted model, the fitted scaler, and the training column order
# (in that sequence), each to its own pickle file.
artifacts_to_save = (
    (model, 'resolution_time_predictor.pkl'),
    (scaler, 'scaler.pkl'),
    (list(X.columns), 'x_columns.pkl'),  # Save column order
)
for artifact, artifact_path in artifacts_to_save:
    written_files = joblib.dump(artifact, artifact_path)

# Echo the result of the final dump as the cell output.
written_files


['x_columns.pkl']

In [7]:
# Write the fitted scaler and the training column order to disk, in that
# sequence. The same two files are also written by the neighbouring cells.
artifacts_to_save = (
    (scaler, "scaler.pkl"),
    (list(X.columns), "x_columns.pkl"),
)
for artifact, artifact_path in artifacts_to_save:
    written_files = joblib.dump(artifact, artifact_path)

# Echo the result of the final dump as the cell output.
written_files


['x_columns.pkl']

In [8]:
# Write the fitted model, the fitted scaler, and the training column order to
# disk, in that sequence. The same three files are also written by an earlier cell.
artifacts_to_save = (
    (model, 'resolution_time_predictor.pkl'),
    (scaler, 'scaler.pkl'),
    (list(X.columns), 'x_columns.pkl'),
)
for artifact, artifact_path in artifacts_to_save:
    written_files = joblib.dump(artifact, artifact_path)

# Echo the result of the final dump as the cell output.
written_files


['x_columns.pkl']

In [11]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Department names the encoder should know about (sample data).
departments = [
    "Municipal Waste Management Department",
    "Electricity Distribution Department",
    "Traffic and Transportation Department",
    "Urban Forestry and Landscaping Department",
    "Public Works Department",
    "Animal Control and Welfare Department",
    "Urban Lighting and Infrastructure Department",
    "Environmental Protection Department",
    "Water Supply and Sanitation Department",
    "Sewerage and Drainage Department"
]

# Build the name <-> integer-id mapping in one step; `fit` returns the
# fitted encoder itself.
dept_label_encoder = LabelEncoder().fit(departments)

# NOTE: despite the file name, the object written here is only the fitted
# label encoder, not a trained classifier.
joblib.dump(dept_label_encoder, 'department_classifier.pkl')


['department_classifier.pkl']

In [12]:
# Fit an encoder over the three priority labels; `fit` returns the fitted
# encoder itself.
# NOTE: LabelEncoder orders classes lexicographically, so the codes are
# High=0, Low=1, Medium=2 -- they do not follow severity order.
priority_label_encoder = LabelEncoder().fit(['Low', 'Medium', 'High'])

# NOTE: despite the file name, the object written here is only the fitted
# label encoder, not a trained predictor.
joblib.dump(priority_label_encoder, 'priority_predictor.pkl')


['priority_predictor.pkl']