In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import joblib

# Sample Data (CSV)
data = {
    "department": ["Sanitation Department", "Sanitation Department", "Sanitation Department", "Sanitation Department", "Sanitation Department", "Sanitation Department", "Sanitation Department"],
    "area": ["Hadapsar", "Shivajinagar", "Kothrud", "Aundh", "Baner", "Kharadi", "Wagholi"],
    "total_budget": [80649, 87717, 63143, 93297, 99275, 79298, 79075],
    "used_budget": [16738, 22886, 32444, 42809, 31482, 24672, 25513],
    "remaining_budget": [63911, 64831, 30699, 50488, 67793, 54626, 53562],
    "garbage_fund": [20000, None, None, None, None, None, None]  # Only Hadapsar has a garbage fund value in this example
}

# Create DataFrame
df = pd.DataFrame(data)

# Initialize LabelEncoder for department and area
label_encoder_department = LabelEncoder()
label_encoder_area = LabelEncoder()

# Encode department and area columns
df['department_encoded'] = label_encoder_department.fit_transform(df['department'])
df['area_encoded'] = label_encoder_area.fit_transform(df['area'])

# Features (independent variables)
X = df[['department_encoded', 'area_encoded', 'total_budget', 'used_budget']]

# Target (dependent variable)
y = df['garbage_fund']

# Filter out rows where garbage_fund is NaN (so we only train on available data)
X_train = X[~y.isna()]
y_train = y.dropna()

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Save the trained model and label encoders
joblib.dump(model, "garbage_fund_predictor_model.pkl")
joblib.dump(label_encoder_department, "label_encoder_department.pkl")
joblib.dump(label_encoder_area, "label_encoder_area.pkl")

# Evaluate the model
print(f"Model trained. R^2 score: {model.score(X_train, y_train)}")


Model trained. R^2 score: nan


