In [5]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Read the processed CSV into a DataFrame. The path is assembled with pathlib so the
# same relative location (data/processed/...) resolves on Windows and POSIX systems
# alike, instead of relying on backslash separators.
data_path = Path('data') / 'processed' / 'filtered_pomegranate_mango.csv'
df = pd.read_csv(data_path)

# Show the first rows as the cell's output for a quick sanity check of the columns.
df.head()

Unnamed: 0,district,crop,season,sum_insured_per_hectare,actuarial_rate_percent,farmer_share_percent,total_premium,farmer_premium,govt_premium
0,Ahmadnagar,Pomegranate,Kharif,130000.0,5.0,5.0,6500.0,6500.0,0.0
1,Amravati,Pomegranate,Kharif,130000.0,5.0,5.0,6500.0,6500.0,0.0
2,Aurangabad,Pomegranate,Kharif,130000.0,10.0,5.0,13000.0,6500.0,6500.0
3,Bid,Pomegranate,Kharif,130000.0,15.0,5.0,19500.0,6500.0,13000.0
4,Buldana,Pomegranate,Kharif,130000.0,16.0,5.0,20800.0,6500.0,14300.0


In [8]:
# Column roles: three categorical inputs and six numeric targets predicted jointly.
categorical_features = ['district', 'crop', 'season']
target_columns = [
    'sum_insured_per_hectare', 'actuarial_rate_percent',
    'farmer_share_percent', 'total_premium',
    'farmer_premium', 'govt_premium',
]
required_columns = categorical_features + target_columns

# Keep only the columns used for modelling and discard rows with any missing value.
df = df[required_columns]
df = df.dropna()

# Feature matrix and multi-column target frame.
X = df[categorical_features]
y = df[target_columns]

# One-hot encode every categorical input; categories unseen during fitting are
# ignored at transform time rather than raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_features)]
)

# MultiOutputRegressor wraps the random forest so it is applied to each target column.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(forest)),
])


In [10]:
# Train/test split: hold out 20% of the rows for evaluation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing + regression pipeline on the training rows only.
model_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows. Both scores are single numbers computed over all
# target columns together, so they summarise the whole multi-output fit.
y_pred = model_pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Mean Absolute Error (average): {mae:.2f}")
print(f"📈 R2 Score (overall fit): {r2:.2f}")

# Save the fitted pipeline. Building the path with pathlib avoids backslash escape
# sequences inside the string literal and works on any OS; the output directory is
# created first so the dump does not fail on a fresh checkout.
model_path = Path('models') / 'sum_insured_predictor.pkl'
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_pipeline, model_path)
print("✅ Model saved as 'sum_insured_predictor.pkl'")

📊 Mean Absolute Error (average): 2100.67
📈 R2 Score (overall fit): 0.84
✅ Model saved as 'sum_insured_predictor.pkl'
