In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pickle

# Load the dataset
df = pd.read_csv('formatted_fiscal_logs.csv')

# Data Preparation
# Handle 'month' column if it's a full month name like 'January 2024'
if df['month'].dtype == 'object':  # Check if 'month' is a string
    # Split into month name and year if the format is 'January 2024'
    df[['month_name', 'year']] = df['month'].str.split(' ', expand=True)
    
    # Convert month name to numeric month (1 = January, 2 = February, etc.)
    df['month'] = pd.to_datetime(df['month_name'], format='%B').dt.month
    
    # Drop 'month_name' column
    df = df.drop(columns=['month_name'])

# Label Encoding for 'year' (if year is categorical, otherwise it's numeric)
le_year = LabelEncoder()
df['year_encoded'] = le_year.fit_transform(df['year'])

# Label Encoding for 'department'
le_dept = LabelEncoder()
df['department_encoded'] = le_dept.fit_transform(df['department'])

# Feature selection (we'll use encoded department, encoded year, and other numeric features)
X = df[['department_encoded', 'year_encoded', 'total_spent', 'complaints_handled', 'avg_cost', 'month']]
y = df['performance']  # Assuming 'performance' is the target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training - Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
with open('trained_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the label encoders to pickle files for later use
with open('label_encoder_dept.pkl', 'wb') as f:
    pickle.dump(le_dept, f)

with open('label_encoder_year.pkl', 'wb') as f:
    pickle.dump(le_year, f)

# Prediction & Insights
# Predict on test data
y_pred = model.predict(X_test)

# Calculate performance (accuracy, etc.)
accuracy = model.score(X_test, y_test)
print(f'Accuracy of the model: {accuracy * 100:.2f}%')

# Rank departments by predicted performance
df_test = X_test.copy()
df_test['predicted_performance'] = y_pred
df_test['department'] = le_dept.inverse_transform(df_test['department_encoded'])

ranked_departments = df_test.groupby('department')['predicted_performance'].mean().sort_values(ascending=False)
print("Departments ranked by predicted performance:")
print(ranked_departments)

# Identify departments with consistently low performance (e.g., below average prediction)
threshold = ranked_departments.median()
low_performance_departments = ranked_departments[ranked_departments < threshold]
print(f"Departments with consistently low performance: {low_performance_departments}") 



import pandas as pd

# Assuming df_test is already created and contains 'predicted_performance'

# Convert 'Good' and 'Bad' to numeric values
performance_mapping = {'Good': 1, 'Bad': 0}
df_test['predicted_performance_numeric'] = df_test['predicted_performance'].map(performance_mapping)

# Now you can group by 'department' and calculate the mean
ranked_departments = df_test.groupby('department')['predicted_performance_numeric'].mean().sort_values(ascending=False)

# Display the ranked departments
print("Departments ranked by predicted performance:")
print(ranked_departments)



Accuracy of the model: 100.00%


TypeError: agg function failed [how->mean,dtype->object]

In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('formatted_fiscal_logs.csv')

# Example logic for performance: If total_spent is high and complaints_handled are low, mark as 'Good', else 'Bad'
df['performance'] = df.apply(lambda row: 'Good' if row['total_spent'] > 1000 and row['complaints_handled'] < 50 else 'Bad', axis=1)

# Save the updated DataFrame back to CSV
df.to_csv('formatted_fiscal_logs.csv', index=False)
