**IMPORTING ALL NECESSARY LIBRARIES FOR THE MODEL**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from


**BASIC EDA**

In [None]:
# Read the crop-recommendation CSV into a DataFrame and preview the first rows.
csv_path = "/content/Crop_recommendation.csv"
df = pd.read_csv(csv_path)
print(df.head())

    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice


In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
None


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = df.describe()
print(summary_stats)

                 N            P            K  temperature     humidity  \
count  2200.000000  2200.000000  2200.000000  2200.000000  2200.000000   
mean     50.551818    53.362727    48.149091    25.616244    71.481779   
std      36.917334    32.985883    50.647931     5.063749    22.263812   
min       0.000000     5.000000     5.000000     8.825675    14.258040   
25%      21.000000    28.000000    20.000000    22.769375    60.261953   
50%      37.000000    51.000000    32.000000    25.598693    80.473146   
75%      84.250000    68.000000    49.000000    28.561654    89.948771   
max     140.000000   145.000000   205.000000    43.675493    99.981876   

                ph     rainfall  
count  2200.000000  2200.000000  
mean      6.469480   103.463655  
std       0.773938    54.958389  
min       3.504752    20.211267  
25%       5.971693    64.551686  
50%       6.425045    94.867624  
75%       6.923643   124.267508  
max       9.935091   298.560117  


In [None]:
# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64


**ENCODING**

In [None]:
# Separate the target column (y) from the feature columns (X).
y = df["label"]
X = df.drop(columns="label")

# Map every crop name to an integer class id.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)
print("Classes:", le.classes_)  # position in this array == encoded id


Classes: ['apple' 'banana' 'blackgram' 'chickpea' 'coconut' 'coffee' 'cotton'
 'grapes' 'jute' 'kidneybeans' 'lentil' 'maize' 'mango' 'mothbeans'
 'mungbean' 'muskmelon' 'orange' 'papaya' 'pigeonpeas' 'pomegranate'
 'rice' 'watermelon']


In [None]:
# List the integer id assigned to each crop name.
for code, crop in enumerate(le.classes_):
    print(f"{code} → {crop}")

0 → apple
1 → banana
2 → blackgram
3 → chickpea
4 → coconut
5 → coffee
6 → cotton
7 → grapes
8 → jute
9 → kidneybeans
10 → lentil
11 → maize
12 → mango
13 → mothbeans
14 → mungbean
15 → muskmelon
16 → orange
17 → papaya
18 → pigeonpeas
19 → pomegranate
20 → rice
21 → watermelon


**TRAIN TEST SPLIT**

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# label and seeded so that it is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

In [None]:
# Shapes of the two feature partitions.
for caption, part in (("Training set size:", X_train), ("Test set size:", X_test)):
    print(caption, part.shape)

Training set size: (1760, 7)
Test set size: (440, 7)


In [None]:
# Shapes of the two encoded-label partitions.
print(f"Training set size: {y_train.shape}")
print(f"Test set size: {y_test.shape}")

Training set size: (1760,)
Test set size: (440,)


**DECISION TREE MODEL**

In [None]:
# Baseline decision tree: no growth limits, seeded for repeatability.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Encoded class id predicted for every held-out row.
y_pred_dt = dt_clf.predict(X_test)
print("Decision Tree model trained.")
print("Sample predictions (first 5):", y_pred_dt[:5])

Decision Tree model trained.
Sample predictions (first 5): [16  1  6 11 16]


**CHECKING OVERFITTING**

In [None]:
# Accuracy of the baseline decision tree on the data it was fitted on and on
# the held-out data.
train_accuracy = dt_clf.score(X_train, y_train)
test_accuracy = dt_clf.score(X_test, y_test)

banner = "=" * 50
print(banner)
print("OVERFITTING CHECK")
print(banner)
print(f"Training Accuracy:   {train_accuracy:.2%}")
print(f"Test Accuracy:       {test_accuracy:.2%}")
print(f"Gap:                 {abs(train_accuracy - test_accuracy):.2%}")

# Turn the train/test gap into a one-line verdict.
if train_accuracy > test_accuracy + 0.10:        # train leads by more than 10 points
    verdict = "RED FLAG: Severe overfitting!"
elif train_accuracy > test_accuracy + 0.05:      # train leads by 5-10 points
    verdict = "WARNING: Moderate overfitting"
elif abs(train_accuracy - test_accuracy) < 0.02:  # scores within 2 points of each other
    verdict = "EXCELLENT: Well-generalized model!"
else:
    verdict = "ACCEPTABLE: Minor overfitting"
print(verdict)

OVERFITTING CHECK
Training Accuracy:   100.00%
Test Accuracy:       97.95%
Gap:                 2.05%
ACCEPTABLE: Minor overfitting


**PRUNING THE TREE**

In [None]:
# Control tree growth to prevent overfitting:
dt_better = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,           # Don't let tree get too deep
    min_samples_split=20,   # Need at least 20 samples to split
    min_samples_leaf=10,    # Leaf must have at least 10 samples
    max_features='sqrt'     # Consider only sqrt(features) at each split
)

# The estimator used to be constructed and then never trained or evaluated,
# which left this cell without any effect. Fit it and report its accuracy so
# the constrained tree can be compared with the unconstrained one.
dt_better.fit(X_train, y_train)
print(
    f"Constrained tree - Train: {dt_better.score(X_train, y_train):.2%}, "
    f"Test: {dt_better.score(X_test, y_test):.2%}"
)

In [None]:
# Show the first five baseline-tree predictions next to the true labels,
# decoded from class ids back to crop names.
print("First 5 test samples predictions:")

# Positional accessor that works whether y_test is a pandas Series or an ndarray.
true_codes = y_test.iloc if hasattr(y_test, 'iloc') else y_test

for i in range(5):
    pred_encoded, actual_encoded = y_pred_dt[i], true_codes[i]
    pred_crop, actual_crop = le.inverse_transform([pred_encoded, actual_encoded])
    correct = "✓" if pred_encoded == actual_encoded else "✗"
    print(f"Sample {i}: Predicted={pred_crop}, Actual={actual_crop} {correct}")

First 5 test samples predictions:
Sample 0: Predicted=orange, Actual=orange ✓
Sample 1: Predicted=banana, Actual=banana ✓
Sample 2: Predicted=cotton, Actual=cotton ✓
Sample 3: Predicted=maize, Actual=maize ✓
Sample 4: Predicted=orange, Actual=orange ✓


**PRUNED TREE BELOW**

In [None]:
#Pruning the tree for preventing overfitting
from sklearn.tree import DecisionTreeClassifier

# Growth limits and pruning settings for the tuned tree.
tuned_params = dict(
    random_state=42,
    max_depth=8,                  # Limit tree depth
    min_samples_split=20,         # Need at least 20 samples to split
    min_samples_leaf=10,          # Leaf must have at least 10 samples
    max_features='sqrt',          # Consider √n features at each split
    min_impurity_decrease=0.001,  # Split only if impurity drops by at least this much
    ccp_alpha=0.01,               # Cost complexity pruning
)
dt_tuned = DecisionTreeClassifier(**tuned_params)
dt_tuned.fit(X_train, y_train)

# Train/test accuracy of the unconstrained tree versus the tuned tree.
print("Original vs Tuned Model:")
for tag, estimator in (("Original", dt_clf), ("Tuned   ", dt_tuned)):
    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)
    print(f"{tag} - Train: {train_score:.2%}, Test: {test_score:.2%}")

Original vs Tuned Model:
Original - Train: 100.00%, Test: 97.95%
Tuned    - Train: 72.05%, Test: 71.82%


In [None]:
# Accuracy of the tuned (pruned) tree on both partitions.
train_accuracy = dt_tuned.score(X_train, y_train)
test_accuracy = dt_tuned.score(X_test, y_test)

# Test accuracy of the unpruned tree, used as the reference point. A small
# train/test gap alone does not show that pruning helped: a tree that is
# equally poor on both partitions also has a small gap.
baseline_test_accuracy = dt_clf.score(X_test, y_test)

print("="*50)
print("OVERFITTING CHECK")
print("="*50)
print(f"Training Accuracy:   {train_accuracy:.2%}")
print(f"Test Accuracy:       {test_accuracy:.2%}")
print(f"Gap:                 {abs(train_accuracy - test_accuracy):.2%}")
print(f"Baseline Test Acc.:  {baseline_test_accuracy:.2%}")

# INTERPRETATION:
if test_accuracy < baseline_test_accuracy - 0.05:  # Lost more than 5 points vs. the unpruned tree
    print("WARNING: Underfitting - pruning removed too much of the tree")
elif train_accuracy > test_accuracy + 0.10:  # More than 10% gap
    print("RED FLAG: Severe overfitting!")
elif train_accuracy > test_accuracy + 0.05:  # 5-10% gap
    print("WARNING: Moderate overfitting")
elif abs(train_accuracy - test_accuracy) < 0.02:  # Less than 2% gap
    print("EXCELLENT: Well-generalized model!")
else:
    print("ACCEPTABLE: Minor overfitting")

OVERFITTING CHECK
Training Accuracy:   72.05%
Test Accuracy:       71.82%
Gap:                 0.23%
EXCELLENT: Well-generalized model!


**RANDOM FOREST MODEL**

In [None]:
# Random forest with 200 trees, seeded for repeatability.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Encoded class id predicted for every held-out row.
y_pred_rf = rf_clf.predict(X_test)
print("Random Forest model trained.")
print("Sample predictions (first 5):", y_pred_rf[:5])



Random Forest model trained.
Sample predictions (first 5): [16  1  6 11 16]


In [None]:
# Accuracy of the random forest on the data it was fitted on and on the
# held-out data.
train_accuracy = rf_clf.score(X_train, y_train)
test_accuracy = rf_clf.score(X_test, y_test)

banner = "=" * 50
print(banner)
print("OVERFITTING CHECK")
print(banner)
print(f"Training Accuracy:   {train_accuracy:.2%}")
print(f"Test Accuracy:       {test_accuracy:.2%}")
print(f"Gap:                 {abs(train_accuracy - test_accuracy):.2%}")

# Turn the train/test gap into a one-line verdict.
if train_accuracy > test_accuracy + 0.10:        # train leads by more than 10 points
    verdict = "RED FLAG: Severe overfitting!"
elif train_accuracy > test_accuracy + 0.05:      # train leads by 5-10 points
    verdict = "WARNING: Moderate overfitting"
elif abs(train_accuracy - test_accuracy) < 0.02:  # scores within 2 points of each other
    verdict = "EXCELLENT: Well-generalized model!"
else:
    verdict = "ACCEPTABLE: Minor overfitting"
print(verdict)

OVERFITTING CHECK
Training Accuracy:   100.00%
Test Accuracy:       99.55%
Gap:                 0.45%
EXCELLENT: Well-generalized model!


**USING K-FOLD CROSS-VALIDATION TO CHECK WHETHER THE RANDOM FOREST MODEL IS MEMORIZING THE TRAINING DATA**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest over the full dataset.
scores = cross_val_score(estimator=rf_clf, X=X, y=y_encoded, cv=10)
print("Cross-validation scores:", scores)
print("Mean CV:", np.mean(scores))


Cross-validation scores: [1.         0.99545455 1.         0.98636364 0.99545455 0.99545455
 0.99545455 0.99545455 0.97727273 0.99545455]
Mean CV: 0.9936363636363638


**CLASSIFICATION METRICS**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# 1. Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(df["label"])

# 2. Train–test split (same seed and stratification as the earlier split)
X = df.drop("label", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 3. Train tuned decision tree
dt_tuned.fit(X_train, y_train)

# 4. Predictions
y_pred_dt_tuned = dt_tuned.predict(X_test)

# 5. Evaluation (BOTH numeric)
# Passing the full list of class ids keeps the report rows and the confusion
# matrix aligned with le.classes_ even if a class is never predicted.
class_ids = np.arange(len(le.classes_))

print("Accuracy:", accuracy_score(y_test, y_pred_dt_tuned))
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_dt_tuned,
        labels=class_ids,
        target_names=le.classes_,  # show crop names instead of bare ids
        # The tuned tree never predicts some classes, so their precision is
        # undefined. Report it as 0 explicitly instead of emitting an
        # UndefinedMetricWarning for each affected metric.
        zero_division=0,
    )
)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt_tuned, labels=class_ids))


Accuracy: 0.7181818181818181

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20
           2       0.53      0.95      0.68        20
           3       1.00      1.00      1.00        20
           4       0.36      1.00      0.53        20
           5       0.00      0.00      0.00        20
           6       0.38      1.00      0.55        20
           7       1.00      1.00      1.00        20
           8       0.48      1.00      0.65        20
           9       1.00      1.00      1.00        20
          10       0.80      1.00      0.89        20
          11       0.60      0.30      0.40        20
          12       0.75      0.30      0.43        20
          13       1.00      0.45      0.62        20
          14       1.00      1.00      1.00        20
          15       1.00      1.00      1.00        20
          16       1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Score the random forest on the held-out partition.
y_pred_rf = rf_clf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("===== Random Forest =====")
print("Accuracy:", rf_accuracy)
print("\nClassification Report:", rf_report, sep="\n")
print("\nConfusion Matrix:", rf_confusion, sep="\n")


===== Random Forest =====
Accuracy: 0.9954545454545455

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20
           2       1.00      0.95      0.97        20
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        20
           5       1.00      1.00      1.00        20
           6       1.00      1.00      1.00        20
           7       1.00      1.00      1.00        20
           8       0.95      1.00      0.98        20
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        20
          11       0.95      1.00      0.98        20
          12       1.00      1.00      1.00        20
          13       1.00      1.00      1.00        20
          14       1.00      1.00      1.00        20
          15       1.00      1.00      1.00        20
  

**ADDITIONAL ANALYSIS**

In [None]:
# Pair each feature name with the importance reported by the fitted forest,
# then display the features from most to least important.
importances = pd.Series(
    data=rf_clf.feature_importances_,
    index=X.columns,
)
importances.sort_values(ascending=False)

Unnamed: 0,0
rainfall,0.219641
humidity,0.217058
K,0.180813
P,0.151342
N,0.103356
temperature,0.075485
ph,0.052305


In [None]:
# Build the comparison table in this cell. `comparison_df` was not defined
# anywhere else in the notebook, so the cell only worked with leftover kernel
# state and failed on Restart & Run All.
comparison_df = pd.DataFrame(
    [
        {
            "Model": "Decision Tree (Tuned)",
            "Train Accuracy": dt_tuned.score(X_train, y_train),
            "Test Accuracy": dt_tuned.score(X_test, y_test),
            "CV Mean Accuracy": np.nan,  # cross-validation was only run for the forest
        },
        {
            "Model": "Random Forest",
            "Train Accuracy": rf_clf.score(X_train, y_train),
            "Test Accuracy": rf_clf.score(X_test, y_test),
            "CV Mean Accuracy": scores.mean(),
        },
    ]
)

# Centre the header and body cells and hide the numeric row index.
comparison_df.style.set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]},
     {'selector': 'td', 'props': [('text-align', 'center')]}]
).hide(axis="index")


Model,Train Accuracy,Test Accuracy,CV Mean Accuracy
Decision Tree (Tuned),0.720455,0.718182,
Random Forest,1.0,0.995455,0.993636


In [None]:
# Read one value per feature from the user.
N = float(input("Enter Nitrogen (N): "))
P = float(input("Enter Phosphorus (P): "))
K = float(input("Enter Potassium (K): "))
temperature = float(input("Enter Temperature: "))
humidity = float(input("Enter Humidity: "))
ph = float(input("Enter pH value: "))
rainfall = float(input("Enter Rainfall: "))

# Build the sample as a one-row DataFrame that carries the training column
# names. The forest was fitted on a DataFrame, so passing a bare NumPy array
# makes scikit-learn warn that the input has no valid feature names.
sample = pd.DataFrame(
    [[N, P, K, temperature, humidity, ph, rainfall]],
    columns=X.columns,
)

# Predict the encoded class id and translate it back to the crop name.
pred_encoded = rf_clf.predict(sample)[0]
pred_crop = le.inverse_transform([pred_encoded])[0]

print("Recommended Crop:", pred_crop)


Enter Nitrogen (N): 85
Enter Phosphorus (P): 52
Enter Potassium (K): 48
Enter Temperature: 24
Enter Humidity: 85
Enter pH value: 6.4
Enter Rainfall: 180
Recommended Crop: jute




In [None]:
import joblib

# Write the fitted random forest and the label encoder to disk.
model_path = "crop_model.pkl"
encoder_path = "label_encoder.pkl"

joblib.dump(rf_clf, model_path)
joblib.dump(le, encoder_path)


['label_encoder.pkl']