In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# 1. Load the generated dataset
df = pd.read_csv('/content/ml_project_borewell.csv')

# 2. Setup Inputs and Outputs
# Inputs: location, belong_village
X = df[['location', 'belonging_location']]

# Outputs: water_depth_m, borewell_success_rate, soil_quality, status_prediction
y_depth = df['water_depth_ft']
y_rate = df['borewell_success_rate']
y_soil = df['soil_quality']
y_status = df['status_prediction']

# 3. Encoding Categorical Data
le_loc = LabelEncoder()
le_belong = LabelEncoder()
le_soil = LabelEncoder()
le_status = LabelEncoder()

X_enc = pd.DataFrame()
X_enc['location'] = le_loc.fit_transform(X['location'])
X_enc['belonging_location'] = le_belong.fit_transform(X['belonging_location'])

y_soil_enc = le_soil.fit_transform(y_soil)
y_status_enc = le_status.fit_transform(y_status)

# 4. Training the Models
# Regressors for numerical data
model_depth = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_enc, y_depth)
model_rate = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_enc, y_rate)

# Classifiers for categorical data
model_soil = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_enc, y_soil_enc)
model_status = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_enc, y_status_enc)

# 5. Prediction Function
def get_borewell_report(village_name, hub_name):
    try:
        # Encode user input
        v_enc = le_loc.transform([village_name])[0]
        h_enc = le_belong.transform([hub_name])[0]

        # Predict
        depth = model_depth.predict([[v_enc, h_enc]])[0]
        rate = model_rate.predict([[v_enc, h_enc]])[0]
        soil_idx = model_soil.predict([[v_enc, h_enc]])[0]
        status_idx = model_status.predict([[v_enc, h_enc]])[0]

        # Decode results
        return {
            "Water Depth": f"{depth:.2f} feets",
            "Success Rate": f"{rate:.2f}%",
            "Soil Type": le_soil.inverse_transform([soil_idx])[0],
            "Prediction": le_status.inverse_transform([status_idx])[0]
        }
    except Exception as e:
        return "Village or Hub not found in training data."

In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

print("\n--- Model Evaluation ---")

# 1. Evaluate Water Depth Model (Regression)
print("\nWater Depth Model (RandomForestRegressor):")
y_depth_pred = model_depth.predict(X_enc)
print(f"  R-squared: {r2_score(y_depth, y_depth_pred):.2f}")
print(f"  MAE: {mean_absolute_error(y_depth, y_depth_pred):.2f}")
print(f"  MSE: {mean_squared_error(y_depth, y_depth_pred):.2f}")

# 2. Evaluate Borewell Success Rate Model (Regression)
print("\nBorewell Success Rate Model (RandomForestRegressor):")
y_rate_pred = model_rate.predict(X_enc)
print(f"  R-squared: {r2_score(y_rate, y_rate_pred):.2f}")
print(f"  MAE: {mean_absolute_error(y_rate, y_rate_pred):.2f}")
print(f"  MSE: {mean_squared_error(y_rate, y_rate_pred):.2f}")

# 3. Evaluate Soil Quality Model (Classification)
print("\nSoil Quality Model (RandomForestClassifier):")
y_soil_pred = model_soil.predict(X_enc)
print(f"  Accuracy: {accuracy_score(y_soil_enc, y_soil_pred):.2f}")
# For multi-class classification, precision, recall, f1-score need average parameter
print(f"  Precision (macro avg): {precision_score(y_soil_enc, y_soil_pred, average='macro', zero_division=0):.2f}")
print(f"  Recall (macro avg): {recall_score(y_soil_enc, y_soil_pred, average='macro', zero_division=0):.2f}")
print(f"  F1-Score (macro avg): {f1_score(y_soil_enc, y_soil_pred, average='macro', zero_division=0):.2f}")

# 4. Evaluate Status Prediction Model (Classification)
print("\nStatus Prediction Model (RandomForestClassifier):")
y_status_pred = model_status.predict(X_enc)
print(f"  Accuracy: {accuracy_score(y_status_enc, y_status_pred):.2f}")
# For binary classification, default average is 'binary', but 'weighted' or 'macro' is safer for general use.
print(f"  Precision: {precision_score(y_status_enc, y_status_pred, average='weighted', zero_division=0):.2f}")
print(f"  Recall: {recall_score(y_status_enc, y_status_pred, average='weighted', zero_division=0):.2f}")
print(f"  F1-Score: {f1_score(y_status_enc, y_status_pred, average='weighted', zero_division=0):.2f}")


--- Model Evaluation ---

Water Depth Model (RandomForestRegressor):
  R-squared: 0.99
  MAE: 3.82
  MSE: 39.97

Borewell Success Rate Model (RandomForestRegressor):
  R-squared: 0.99
  MAE: 1.19
  MSE: 3.18

Soil Quality Model (RandomForestClassifier):
  Accuracy: 0.99
  Precision (macro avg): 0.99
  Recall (macro avg): 1.00
  F1-Score (macro avg): 0.99

Status Prediction Model (RandomForestClassifier):
  Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00


In [4]:
import pickle

# Create a dictionary to hold all necessary models and encoders
model_deployment_package = {
    'model_depth': model_depth,
    'model_rate': model_rate,
    'model_soil': model_soil,
    'model_status': model_status,
    'le_loc': le_loc,
    'le_belong': le_belong,
    'le_soil': le_soil,
    'le_status': le_status
}

# Save the dictionary to a .pkl file
with open('borewell_prediction_model.pkl', 'wb') as f:
    pickle.dump(model_deployment_package, f)

print("Models and encoders successfully saved to 'borewell_prediction_model.pkl'")

Models and encoders successfully saved to 'borewell_prediction_model.pkl'
