In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import joblib
import sklearn
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


In [2]:
print(" AGRISIGHT ANALYTICS - MACHINE LEARNING MODELS")


 AGRISIGHT ANALYTICS - MACHINE LEARNING MODELS


In [4]:
df = pd.read_csv(r"C:\Users\radha\AgriSight-Analytics-Platform\notebook\crop_data_with_features.csv")

In [5]:
df.shape

(2200, 14)

In [6]:
df['label'].unique(),df['label'].nunique()

(array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
        'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
        'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
        'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
       dtype=object),
 22)

In [7]:
print(df.dtypes)

N                           float64
P                           float64
K                           float64
temperature                 float64
humidity                    float64
ph                          float64
rainfall                    float64
label                        object
npk_total                   float64
npk_balance_score           float64
climate_zone                 object
ph_level                     object
rainfall_category            object
growing_conditions_score    float64
dtype: object


In [8]:
print(df['label'].value_counts())

label
rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: count, dtype: int64


In [9]:
original_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
engineered_features = ['npk_total', 'npk_balance_score', 'growing_conditions_score']
categorical_features = ['climate_zone', 'ph_level', 'rainfall_category']
all_numeric_features = original_features + engineered_features

In [10]:
df_ml = df.copy()
# Prepare the dataset for machine learning

In [11]:
categorical_encoders = {}
for col in categorical_features:
       le = LabelEncoder()
       df_ml[col + '_encoded'] = le.fit_transform(df_ml[col].astype(str)) 
       categorical_encoders[col] = le
       print(f"   ✅ Encoded {col}: {len(le.classes_)} categories")

   ✅ Encoded climate_zone: 4 categories
   ✅ Encoded ph_level: 4 categories
   ✅ Encoded rainfall_category: 3 categories


In [12]:
feature_sets = {
    'original_only': original_features,
    'engineered_only': original_features + engineered_features,
    'with_categorical': original_features + engineered_features + [f"{col}_encoded" for col in categorical_features],
    'all_features': all_numeric_features + [f"{col}_encoded" for col in categorical_features]
}

In [13]:
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df_ml['label'])

In [14]:
print(f"   Encoded range: 0 to {max(y)}")

   Encoded range: 0 to 21


In [15]:
# Train- Test Split

In [16]:
main_features = feature_sets['all_features']
X = df_ml[main_features]

In [17]:
print(f"📊 Feature matrix shape: {X.shape}")
print(f"🎯 Target vector shape: {y.shape}")

📊 Feature matrix shape: (2200, 13)
🎯 Target vector shape: (2200,)


In [18]:
missing_features = X.isnull().sum()
print(missing_features)

N                            0
P                            0
K                            0
temperature                  0
humidity                     0
ph                           0
rainfall                     0
npk_total                    0
npk_balance_score            0
growing_conditions_score     0
climate_zone_encoded         0
ph_level_encoded             0
rainfall_category_encoded    0
dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
print(f" MODEL 1: RANDOM FOREST CLASSIFIER")
rf_model = RandomForestClassifier()

 MODEL 1: RANDOM FOREST CLASSIFIER


In [22]:
rf_model.fit(X_train, y_train)

In [23]:
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)
rf_train_acc = accuracy_score(y_train, rf_train_pred)
rf_test_acc = accuracy_score(y_test, rf_test_pred)

In [24]:
rf_train_acc, round(rf_test_acc * 100, 2)

(1.0, 98.41)

In [25]:
#Overfitting gap

round(rf_train_acc-rf_test_acc,2)

0.02

In [26]:
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print(rf_cv_scores)
print(f"Cross Validation:  {rf_cv_scores.mean():.4f} ± {rf_cv_scores.std()*2:.4f}")

[0.98863636 0.98863636 0.99715909 0.98863636 0.98295455]
Cross Validation:  0.9892 ± 0.0091


In [27]:
rf_model.feature_importances_

array([0.0740083 , 0.11967885, 0.13259131, 0.05021272, 0.16290314,
       0.02798828, 0.18435947, 0.09134636, 0.04194025, 0.03595273,
       0.04311728, 0.01200583, 0.02389547])

In [28]:
rf_importance = pd.DataFrame({
    'feature': main_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

In [29]:
rf_importance.head(10).sort_values('feature')

Unnamed: 0,feature,importance
2,K,0.132591
0,N,0.074008
1,P,0.119679
10,climate_zone_encoded,0.043117
9,growing_conditions_score,0.035953
4,humidity,0.162903
8,npk_balance_score,0.04194
7,npk_total,0.091346
6,rainfall,0.184359
3,temperature,0.050213


In [30]:
# Gradient Boosting Classifier

In [31]:
gb_model = GradientBoostingClassifier( n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

In [32]:
gb_model.fit(X_train, y_train)

In [33]:
gb_train_pred = gb_model.predict(X_train)
gb_test_pred = gb_model.predict(X_test)
gb_train_acc = accuracy_score(y_train, gb_train_pred)
gb_test_acc = accuracy_score(y_test, gb_test_pred)

In [34]:
print(f"   Training Accuracy: {gb_train_acc:} ({gb_train_acc*100:.2f}%)")
print(f"   Test Accuracy: {gb_test_acc:} ({gb_test_acc*100:.2f}%)")
print(f"   Overfitting Gap: {(gb_train_acc - gb_test_acc):.2f}")

   Training Accuracy: 1.0 (100.00%)
   Test Accuracy: 0.9727272727272728 (97.27%)
   Overfitting Gap: 0.03


In [35]:
gb_cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='accuracy')


In [36]:
print(f"Cross-Validation: {gb_cv_scores.mean():} ± {gb_cv_scores.std()*2:}")


Cross-Validation: 0.9732954545454545 ± 0.019878245096063524


In [37]:
gb_importance = pd.DataFrame({
    'feature': main_features,
    'importance': gb_model.feature_importances_
}).sort_values('importance', ascending=False)

In [38]:
# Support Vector Machine

In [39]:
svm_model = SVC()

In [40]:
svm_model.fit(X_train_scaled,y_train)

In [41]:
svm_train_pred = svm_model.predict(X_train_scaled)
svm_test_pred = svm_model.predict(X_test_scaled)
svm_train_acc = accuracy_score(y_train, svm_train_pred)
svm_test_acc = accuracy_score(y_test, svm_test_pred)

In [42]:
print(f"   Training Accuracy: {svm_train_acc:} ({svm_train_acc*100:.2f}%)")
print(f"   Test Accuracy: {svm_test_acc:} ({svm_test_acc*100:.2f}%)")
print(f"   Overfitting Gap: {(svm_train_acc - svm_test_acc):.2f}")

   Training Accuracy: 0.9744318181818182 (97.44%)
   Test Accuracy: 0.9409090909090909 (94.09%)
   Overfitting Gap: 0.03


In [43]:
svm_cv_scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=3, scoring='accuracy')

In [44]:
print(f"Cross-Validation: {svm_cv_scores.mean():.4f} ± {svm_cv_scores.std()*2:.4f}")


Cross-Validation: 0.9449 ± 0.0016


In [45]:
print(f"📝 Support Vectors: {svm_model.n_support_.sum()}")

📝 Support Vectors: 1015


In [46]:
models_results = {
    'Random Forest': {
        'model': rf_model,
        'train_acc': rf_train_acc,
        'test_acc': rf_test_acc,
        'cv_mean': rf_cv_scores.mean(),
        'cv_std': rf_cv_scores.std(),
        'predictions': rf_test_pred,
        'scaled_data': False
    },
    'Gradient Boosting': {
        'model': gb_model,
        'train_acc': gb_train_acc,
        'test_acc': gb_test_acc,
        'cv_mean': gb_cv_scores.mean(),
        'cv_std': gb_cv_scores.std(),
        'predictions': gb_test_pred,
        'scaled_data': False
    },
    'SVM': {
        'model': svm_model,
        'train_acc': svm_train_acc,
        'test_acc': svm_test_acc,
        'cv_mean': svm_cv_scores.mean(),
        'cv_std': svm_cv_scores.std(),
        'predictions': svm_test_pred,
        'scaled_data': True
    }
}

In [47]:
comparison_df = pd.DataFrame({
    'Model': list(models_results.keys()),
    'Training_Accuracy': [models_results[m]['train_acc'] for m in models_results.keys()],
    'Test_Accuracy': [models_results[m]['test_acc'] for m in models_results.keys()],
    'CV_Mean': [models_results[m]['cv_mean'] for m in models_results.keys()],
    'CV_Std': [models_results[m]['cv_std'] for m in models_results.keys()],
    'Overfitting_Gap': [models_results[m]['train_acc'] - models_results[m]['test_acc'] for m in models_results.keys()]
})

In [48]:
print(comparison_df.round(4).to_string(index=False))


            Model  Training_Accuracy  Test_Accuracy  CV_Mean  CV_Std  Overfitting_Gap
    Random Forest             1.0000         0.9841   0.9892  0.0045           0.0159
Gradient Boosting             1.0000         0.9727   0.9733  0.0099           0.0273
              SVM             0.9744         0.9409   0.9449  0.0008           0.0335


In [49]:
best_model_idx = comparison_df['Test_Accuracy'].idxmax()
best_model_name = comparison_df.iloc[best_model_idx]['Model']
best_test_acc = comparison_df.iloc[best_model_idx]['Test_Accuracy']

In [50]:
print(f"WINNER : {best_model_name}")
print(f"Test Accuracy: {best_test_acc:.4f} ({best_test_acc*100:.2f}%)")
print(f"CV Score: {models_results[best_model_name]['cv_mean']:.4f}")


WINNER : Random Forest
Test Accuracy: 0.9841 (98.41%)
CV Score: 0.9892


In [51]:
best_predictions = models_results[best_model_name]['predictions']

In [52]:
class_report = classification_report(y_test, best_predictions, 
                                   target_names=target_encoder.classes_, 
                                   output_dict=True)

In [53]:
crop_performance = []
for crop in target_encoder.classes_:
    if crop in class_report:
        metrics = class_report[crop]
        crop_performance.append({
            'crop': crop,
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'f1_score': metrics['f1-score'],
            'support': metrics['support']
        })

In [54]:
perf_df = pd.DataFrame(crop_performance)
perf_df = perf_df.sort_values('f1_score', ascending=False)

In [55]:
perf_df.head(5)

Unnamed: 0,crop,precision,recall,f1_score,support
0,apple,1.0,1.0,1.0,20.0
13,mothbeans,1.0,1.0,1.0,20.0
18,pigeonpeas,1.0,1.0,1.0,20.0
3,chickpea,1.0,1.0,1.0,20.0
17,papaya,1.0,1.0,1.0,20.0


In [66]:
def predict_crop_recommendation(N, P, K, temperature, humidity, ph, rainfall, 
                              climate_zone='Cool_Humid', ph_level='Neutral', 
                              rainfall_category='Medium', return_probabilities=False):
    input_data = pd.DataFrame({
        'N': [N],
        'P': [P], 
        'K': [K],
        'temperature': [temperature],
        'humidity': [humidity],
        'ph': [ph],
        'rainfall': [rainfall],
        'npk_total': [N + P + K],
        'npk_balance_score': [1 - (np.std([N, P, K]) / np.mean([N, P, K]))],
        'growing_conditions_score': [(N/140*25) + (P/140*25) + (K/140*25) + (temperature/40*25)],
        'climate_zone': [climate_zone],
        'ph_level': [ph_level],
        'rainfall_category': [rainfall_category]
    })
    for col in categorical_features:
        try:
            input_data[col + '_encoded'] = categorical_encoders[col].transform(input_data[col])
        except:
            # Handle unseen categories
            input_data[col + '_encoded'] = 0
    
    # Select features
    input_features = input_data[main_features]
    
    # Get the best model
    best_model = models_results[best_model_name]['model']
    use_scaled = models_results[best_model_name]['scaled_data']
    
    # Scale if needed
    if use_scaled:
        input_features = scaler.transform(input_features)
    
    # Make prediction
    prediction = best_model.predict(input_features)[0]
    predicted_crop = target_encoder.inverse_transform([prediction])[0]
    
    if return_probabilities and hasattr(best_model, 'predict_proba'):
        probabilities = best_model.predict_proba(input_features)[0]
        prob_dict = dict(zip(target_encoder.classes_, probabilities))
        prob_dict = dict(sorted(prob_dict.items(), key=lambda x: x[1], reverse=True))
        top_3 = list(prob_dict.items())[:3]
        return predicted_crop, top_3
    
    return predicted_crop


In [67]:
print("🧪 Testing Prediction Function:")

# Test case 1: High nitrogen for leafy crops
test1 = predict_crop_recommendation(90, 42, 43, 21, 82, 6.5, 203, return_probabilities=True)
print(f"\n Test 1 - High N conditions:")
print(f"   Predicted: {test1[0]}")
print(f"   Top 3 probabilities:")
for i, (crop, prob) in enumerate(test1[1]):
    print(f"      {i+1}. {crop}: {prob:.3f}")

# Test case 2: Balanced nutrients, warm climate
test2 = predict_crop_recommendation(50, 50, 50, 25, 65, 7.0, 150, 
                                  climate_zone='Hot_Dry', rainfall_category='Medium')
print(f"\n Test 2 - Balanced nutrients, warm climate:")
print(f"   Predicted: {test2}")

# Test case 3: Low nutrients, cool climate
test3 = predict_crop_recommendation(20, 15, 25, 18, 70, 6.0, 180,
                                  climate_zone='Cool_Humid', rainfall_category='High')
print(f"\n Test 3 - Low nutrients, cool climate:")
print(f"   Predicted: {test3}")

🧪 Testing Prediction Function:

 Test 1 - High N conditions:
   Predicted: rice
   Top 3 probabilities:
      1. rice: 0.680
      2. papaya: 0.080
      3. coffee: 0.070

 Test 2 - Balanced nutrients, warm climate:
   Predicted: papaya

 Test 3 - Low nutrients, cool climate:
   Predicted: orange


In [68]:
# Saving Model and components

In [69]:
joblib.dump(rf_model, 'random_forest_crop_model.pkl')


['random_forest_crop_model.pkl']

In [70]:
joblib.dump(gb_model, 'gradient_boosting_crop_model.pkl')


['gradient_boosting_crop_model.pkl']

In [71]:
joblib.dump(svm_model, 'svm_crop_model.pkl')

['svm_crop_model.pkl']

In [72]:
model_package = {
    'best_model_name': best_model_name,
    'best_model': models_results[best_model_name]['model'],
    'scaler': scaler,
    'target_encoder': target_encoder,
    'categorical_encoders': categorical_encoders,
    'feature_names': main_features,
    'performance_comparison': comparison_df.to_dict('records'),
    'prediction_function': predict_crop_recommendation
}

In [73]:
joblib.dump(model_package, 'complete_crop_model_package.pkl')


['complete_crop_model_package.pkl']

In [74]:
print(f"\n📋 Model Performance by Algorithm:")
for model_name, results in models_results.items():
    print(f"   {model_name:18}: {results['test_acc']*100:.4f} accuracy")


📋 Model Performance by Algorithm:
   Random Forest     : 98.4091 accuracy
   Gradient Boosting : 97.2727 accuracy
   SVM               : 94.0909 accuracy
