In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pickle

# Load the dataset
df = pd.read_csv('formatted_fiscal_logs.csv')

# Data Preparation
# Handle 'month' column if it's a full month name like 'January 2024'
if df['month'].dtype == 'object':  # Check if 'month' is a string
    # Split into month name and year if the format is 'January 2024'
    df[['month_name', 'year']] = df['month'].str.split(' ', expand=True)
    
    # Convert month name to numeric month (1 = January, 2 = February, etc.)
    df['month'] = pd.to_datetime(df['month_name'], format='%B').dt.month
    
    # Drop 'month_name' column
    df = df.drop(columns=['month_name'])

# Label Encoding for 'year' (if year is categorical, otherwise it's numeric)
le_year = LabelEncoder()
df['year_encoded'] = le_year.fit_transform(df['year'])

# Label Encoding for 'department'
le_dept = LabelEncoder()
df['department_encoded'] = le_dept.fit_transform(df['department'])

# Feature selection (we'll use encoded department, encoded year, and other numeric features)
X = df[['department_encoded', 'year_encoded', 'total_spent', 'complaints_handled', 'avg_cost', 'month']]
y = df['performance']  # Assuming 'performance' is the target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training - Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
with open('trained_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the label encoders to pickle files for later use
with open('label_encoder_dept.pkl', 'wb') as f:
    pickle.dump(le_dept, f)

with open('label_encoder_year.pkl', 'wb') as f:
    pickle.dump(le_year, f)

# Prediction & Insights
# Predict on test data
y_pred = model.predict(X_test)

# Calculate performance (accuracy, etc.)
accuracy = model.score(X_test, y_test)
print(f'Accuracy of the model: {accuracy * 100:.2f}%')

# Rank departments by predicted performance
df_test = X_test.copy()
df_test['predicted_performance'] = y_pred
df_test['department'] = le_dept.inverse_transform(df_test['department_encoded'])

# Convert 'Good' and 'Bad' to numeric values
performance_mapping = {'Good': 1, 'Bad': 0}
df_test['predicted_performance_numeric'] = df_test['predicted_performance'].map(performance_mapping)

# Rank departments by predicted performance
ranked_departments = df_test.groupby('department')['predicted_performance_numeric'].mean().sort_values(ascending=False)

# Display the ranked departments
print("Departments ranked by predicted performance:")
print(ranked_departments)

# Identify departments with consistently low performance (e.g., below average prediction)
threshold = ranked_departments.median()
low_performance_departments = ranked_departments[ranked_departments < threshold]
print(f"Departments with consistently low performance: {low_performance_departments}")


Accuracy of the model: 100.00%
Departments ranked by predicted performance:
department
Sewerage and Drainage Department             0.543860
Municipal Waste Management Department        0.529412
Water Supply and Sanitation Department       0.520408
Urban Forestry and Landscaping Department    0.512821
Electricity Distribution Department          0.491071
Traffic and Transportation Department        0.490385
Environmental Protection Department          0.466102
Animal Control and Welfare Department        0.454545
Public Works Department                      0.403846
Name: predicted_performance_numeric, dtype: float64
Departments with consistently low performance: department
Traffic and Transportation Department    0.490385
Environmental Protection Department      0.466102
Animal Control and Welfare Department    0.454545
Public Works Department                  0.403846
Name: predicted_performance_numeric, dtype: float64


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import pickle

# Load dataset
df = pd.read_csv('formatted_fiscal_logs.csv')

# Handle month column if needed
if df['month'].dtype == 'object':
    try:
        df[['month_name', 'year']] = df['month'].str.split(' ', expand=True)
        df['month'] = pd.to_datetime(df['month_name'], format='%B').dt.month
        df.drop(columns=['month_name'], inplace=True)
    except:
        df['month'] = pd.to_datetime(df['month'], errors='coerce').dt.month

# Convert year to integer if present
try:
    df['year'] = df['year'].astype(int)
except:
    df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Encode department (for reverse mapping later)
le_dept = LabelEncoder()
df['department_encoded'] = le_dept.fit_transform(df['department'])

with open('label_encoder_dept.pkl', 'wb') as f:
    pickle.dump(le_dept, f)

# ------ Features & Target ------
features = ['month', 'total_spent', 'complaints_handled', 'avg_cost']
X = df[features]
y = df['performance_percentage']  # Ensure this column has numeric values (0–100)

# ------ Train/Test Split ------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ------ Model Training ------
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save trained model
with open('trained_model_regressor.pkl', 'wb') as f:
    pickle.dump(model, f)

# ------ Predict on test set ------
y_pred = model.predict(X_test)

# Attach predictions to test set
df_test = X_test.copy()
df_test['predicted_performance'] = y_pred
df_test['department_encoded'] = df.loc[df_test.index, 'department_encoded']
df_test['department'] = le_dept.inverse_transform(df_test['department_encoded'])

# ------ Visual: Average predicted performance per department ------
department_perf = df_test.groupby('department')['predicted_performance'].mean().sort_values(ascending=False)

print("Average Predicted Performance by Department:")
print(department_perf)

# ------ Plot Bar Chart ------
plt.figure(figsize=(10,6))
department_perf.plot(kind='bar', color='skyblue')
plt.title('Department Performance Comparison (Predicted %)')
plt.xlabel('Department')
plt.ylabel('Performance (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('formatted_fiscal_logs.csv')

# Example logic for performance: If total_spent is high and complaints_handled are low, mark as 'Good', else 'Bad'
df['performance'] = df.apply(lambda row: 'Good' if row['total_spent'] > 1000 and row['complaints_handled'] < 50 else 'Bad', axis=1)

# Save the updated DataFrame back to CSV
df.to_csv('formatted_fiscal_logs.csv', index=False)


In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle

# Load the saved model and label encoders
with open('trained_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('label_encoder_dept.pkl', 'rb') as f:
    le_dept = pickle.load(f)

with open('label_encoder_year.pkl', 'rb') as f:
    le_year = pickle.load(f)

# Function to predict performance for a given department, month, and year
def predict_performance(department, month, year, total_spent, complaints_handled, avg_cost):
    # Label encode the department and year
    department_encoded = le_dept.transform([department])[0]
    year_encoded = le_year.transform([year])[0]
    
    # Prepare the input features for prediction
    X_input = np.array([[department_encoded, year_encoded, total_spent, complaints_handled, avg_cost, month]])

    # Make the prediction using the trained model
    prediction = model.predict(X_input)
    
    # Convert the numeric prediction back to the original 'Good' or 'Bad' performance
    performance_mapping = {1: 'Good', 0: 'Bad'}
    predicted_performance = performance_mapping[prediction[0]]

    return predicted_performance

# Example usage:
department_input = input("Enter the department: ")
month_input = int(input("Enter the month (1-12): "))
year_input = int(input("Enter the year (e.g., 2023): "))
total_spent_input = float(input("Enter the total amount spent: "))
complaints_handled_input = int(input("Enter the number of complaints handled: "))
avg_cost_input = float(input("Enter the average cost: "))

# Predict the performance for the given inputs
predicted_performance = predict_performance(department_input, month_input, year_input, total_spent_input, complaints_handled_input, avg_cost_input)

print(f"The predicted performance for the {department_input} in {month_input}/{year_input} is: {predicted_performance}")


Enter the department:  Sewerage and Drainage Department 
Enter the month (1-12):  4
Enter the year (e.g., 2023):  2025
Enter the total amount spent:  5000
Enter the number of complaints handled:  58
Enter the average cost:  888


ValueError: y contains previously unseen labels: 'Sewerage and Drainage Department '

import pandas as pd
import numpy as np
import pickle

# Load the saved model and label encoders
with open('trained_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('label_encoder_dept.pkl', 'rb') as f:
    le_dept = pickle.load(f)

with open('label_encoder_year.pkl', 'rb') as f:
    le_year = pickle.load(f)

# Dummy data for the other features (you can modify this part or take inputs from the user)
# For the purpose of this example, I will assume fixed values for these.
# Ideally, you'd also want to ask the user for inputs for `total_spent`, `complaints_handled`, and `avg_cost`
default_total_spent = 10000.0
default_complaints_handled = 50
default_avg_cost = 100.0

# Function to predict performance for a given department, month, and year
def predict_performance(department, month, year):
    # Check if the department exists in the data
    if department not in df['department'].values:
        print(f"Sorry, the department '{department}' is not available.")
        return
    
    # Label encode the department and year
    department_encoded = le_dept.transform([department])[0]
    year_encoded = le_year.transform([year])[0]
    
    # Prepare the input features for prediction (including default values for other features)
    X_input = np.array([[department_encoded, year_encoded, default_total_spent, default_complaints_handled, default_avg_cost, month]])

    # Make the prediction using the trained model
    prediction = model.predict(X_input)
    
    # Convert the numeric prediction back to the original 'Good' or 'Bad' performance
    performance_mapping = {1: 'Good', 0: 'Bad'}
    predicted_performance = performance_mapping[prediction[0]]

    print(f"Predicted Performance for {department} in {month}/{year}: {predicted_performance}")

# Example usage:

month_input = int(input("Enter the month (1-12): "))
year_input = int(input("Enter the year (e.g., 2023): "))

# Predict the performance for the given inputs
predict_performance(department_input, month_input, year_input)


In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the saved model and label encoders
with open('trained_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('label_encoder_dept.pkl', 'rb') as f:
    le_dept = pickle.load(f)

with open('label_encoder_year.pkl', 'rb') as f:
    le_year = pickle.load(f)

# Dummy data for the other features (you can modify this part or take inputs from the user)
# For the purpose of this example, I will assume fixed values for these.
# Ideally, you'd also want to ask the user for inputs for `total_spent`, `complaints_handled`, and `avg_cost`
default_total_spent = 10000.0
default_complaints_handled = 50
default_avg_cost = 100.0

# Function to predict performance for a given department, month, and year
def predict_performance(department, month, year):
    # Check if the department exists in the data
    if department not in df['department'].values:
        print(f"Sorry, the department '{department}' is not available.")
        return
    
    # Label encode the department and year
    department_encoded = le_dept.transform([department])[0]
    year_encoded = le_year.transform([year])[0]
    
    # Prepare the input features for prediction (including default values for other features)
    X_input = np.array([[department_encoded, year_encoded, default_total_spent, default_complaints_handled, default_avg_cost, month]])

    # Make the prediction using the trained model
    prediction = model.predict(X_input)
    
    # Convert the numeric prediction back to the original 'Good' or 'Bad' performance
    performance_mapping = {1: 'Good', 0: 'Bad'}
    predicted_performance = performance_mapping[prediction[0]]

    print(f"Predicted Performance for {department} in {month}/{year}: {predicted_performance}")

# Example usage:
month_input = int(input("Enter the month (1-12): "))
year_input = int(input("Enter the year (e.g., 2023): "))

# Predict the performance for the given inputs
predict_performance(department_input, month_input, year_input)


In [None]:
# Feature selection (use only 'month' for prediction)
X = df[['month']]  # Use only month as a feature
y = df['performance']  # The target variable

# Train-test split (same as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model and label encoder
with open('trained_model_month_only.pkl', 'wb') as f:
    pickle.dump(model, f)

# Make predictions
y_pred = model.predict(X_test)

# Calculate performance
accuracy = model.score(X_test, y_test)
print(f'Accuracy of the model: {accuracy * 100:.2f}%')

# Rank departments by predicted performance (with just month as input)
df_test = X_test.copy()
df_test['predicted_performance'] = y_pred
df_test['department'] = le_dept.inverse_transform(df_test['department_encoded'])

# Convert 'Good' and 'Bad' to numeric values
performance_mapping = {'Good': 1, 'Bad': 0}
df_test['predicted_performance_numeric'] = df_test['predicted_performance'].map(performance_mapping)

# Rank departments by predicted performance
ranked_departments = df_test.groupby('department')['predicted_performance_numeric'].mean().sort_values(ascending=False)

print("Departments ranked by predicted performance (based on month):")
print(ranked_departments)
