# Imports

In [7]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset

In [8]:
# Read the crop-recommendation CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Crop_recommendation.csv")

# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


# Encoding

In [9]:
# Replace each crop name in `label` with an integer class id. The fitted
# encoder is kept because inference later uses it to turn predicted ids back
# into crop names.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])

df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,20
1,85,58,41,21.770462,80.319644,7.038096,226.655537,20
2,60,55,44,23.004459,82.320763,7.840207,263.964248,20
3,74,35,40,26.491096,80.158363,6.980401,242.864034,20
4,78,42,42,20.130175,81.604873,7.628473,262.71734,20


# Scaling: Normalization

In [10]:

# Select numeric columns to normalize
numeric_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph','rainfall']

# Fit the scaler on the training rows only. Fitting on the whole frame would
# let the held-out rows influence the min/max used for scaling (data leakage).
# train_test_split chooses rows from the sample count, test_size and
# random_state alone, so using the same test_size/random_state as the
# "Train Test Split" cell selects the same training rows here.
train_positions, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Initialize scaler and learn min/max from the training rows
scaler = MinMaxScaler()
scaler.fit(df.iloc[train_positions][numeric_cols])

# Apply the train-fitted scaling to every row; held-out values may therefore
# fall slightly outside [0, 1].
df[numeric_cols] = scaler.transform(df[numeric_cols])

df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,0.642857,0.264286,0.19,0.345886,0.790267,0.466264,0.656458,20
1,0.607143,0.378571,0.18,0.371445,0.770633,0.54948,0.741675,20
2,0.428571,0.357143,0.195,0.406854,0.793977,0.674219,0.87571,20
3,0.528571,0.214286,0.175,0.506901,0.768751,0.540508,0.799905,20
4,0.557143,0.264286,0.185,0.324378,0.785626,0.641291,0.871231,20


# Train Test Split

In [12]:
# Separate the target column from the feature columns.
y = df['label']
X = df.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) of each feature split.
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (1760, 7)
Test shape: (440, 7)


# Training, Testing Multiple Classifiers

In [15]:
# Define models. The tree ensembles are stochastic, so they are seeded to make
# the reported metrics reproducible from one run to the next.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
}


# Train and evaluate each model on the same train/test split
for name, model in models.items():
    print("="*50)
    print("Model:", name)

    # Train the model
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate metrics (class ids are the integers produced by the label encoder)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print metrics
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)
    
    

Model: Logistic Regression
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       0.89      0.85      0.87        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       0.94      1.00      0.97        17
           6       0.94      1.00      0.97        17
           7       1.00      1.00      1.00        14
           8       0.84      0.70      0.76        23
           9       0.91      1.00      0.95        20
          10       0.69      1.00      0.81        11
          11       1.00      0.95      0.98        21
          12       0.66      1.00      0.79        19
          13       1.00      0.54      0.70        24
          14       0.86      1.00      0.93        19
          15       1.00      1.00      1.00        17
          16       1.00      1

# Selecting Best Model

In [16]:
# Train the gradient-boosting model that is saved and used for inference below.
# Seeded so the persisted model (and its metrics) can be reproduced.
model_gbc = GradientBoostingClassifier(random_state=42)
model_gbc.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model_gbc.predict(X_test)

# Labels end with ":" followed by a newline so the matrix/report start on their
# own line (the colon previously landed on the line after the label).
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

confusion matrix 
:  [[23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 26  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0 23  0  0  0  0  0  0  0  0]
 [ 0  0  0  0

# Saving Model, Encoder, Scaler for production

In [20]:
import os
import pickle

# Create the output directory first; open(..., 'wb') raises FileNotFoundError
# when "models/" does not exist yet.
os.makedirs("models", exist_ok=True)

# Persist the fitted encoder, classifier and scaler. Each file is written inside
# a `with` block so the handle is flushed and closed instead of being leaked.
# NOTE: pickled scikit-learn objects should be loaded with the same
# scikit-learn version that wrote them.
artifacts = {
    "models/encoder.pkl": encoder,
    "models/model_gbc.pkl": model_gbc,
    "models/scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as fh:
        pickle.dump(artifact, fh)

# Inference (Prediction on new data)

In [21]:
import pickle

import numpy as np


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Load encoder, scaler, and model (all three files must already exist under models/)
encoder = _load_pickle("models/encoder.pkl")
scaler = _load_pickle("models/scaler.pkl")
model_gbc = _load_pickle("models/model_gbc.pkl")

def predict_crop(N, P, K, temperature, humidity, ph, rainfall):
    """Recommend a crop for one set of readings.

    The values are scaled with the module-level ``scaler``, classified with
    ``model_gbc``, and the predicted class id is decoded with ``encoder``.

    Parameters
    ----------
    N, P, K, temperature, humidity, ph, rainfall
        One value per feature, in the same column order used for training.
        Values are expected on the raw (unscaled) scale of the training CSV.

    Returns
    -------
    The decoded label for the single input row (e.g. ``"jute"``).
    """
    feature_names = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

    # Create a one-row DataFrame from the input values
    input_df = pd.DataFrame([[N, P, K, temperature, humidity, ph, rainfall]],
                            columns=feature_names)

    # Scale the data. scaler.transform returns a bare array, so the column names
    # are restored: the model was fitted on a DataFrame with named columns, and
    # predicting on unnamed features makes scikit-learn emit a feature-name
    # UserWarning on every call.
    input_scaled = pd.DataFrame(scaler.transform(input_df),
                                columns=feature_names,
                                index=input_df.index)

    # Predict and decode
    prediction_encoded = model_gbc.predict(input_scaled)
    prediction = encoder.inverse_transform(prediction_encoded)

    return prediction[0]

# Smoke-test the loaded pipeline with one hand-picked reading.
result = predict_crop(
    N=90,
    P=40,
    K=40,
    temperature=25.0,
    humidity=80.0,
    ph=6.5,
    rainfall=100.0,
)
print("Recommended Crop:", result)


Recommended Crop: jute




In [22]:
# Ten hand-written readings, each ordered as
# (N, P, K, temperature, humidity, ph, rainfall).
test_inputs = [
    (90, 40, 40, 25.0, 80.0, 6.5, 100.0),
    (60, 30, 20, 22.5, 75.0, 6.0, 120.0),
    (80, 60, 50, 27.0, 82.0, 6.8, 95.0),
    (100, 45, 45, 30.0, 85.0, 7.0, 110.0),
    (70, 55, 65, 28.0, 78.0, 6.4, 105.0),
    (65, 40, 50, 26.0, 70.0, 5.8, 90.0),
    (55, 20, 25, 24.0, 72.0, 6.3, 130.0),
    (85, 65, 60, 29.0, 90.0, 6.7, 115.0),
    (95, 50, 70, 31.0, 88.0, 6.9, 102.0),
    (50, 25, 30, 23.0, 68.0, 6.1, 85.0),
]

# Run every reading through predict_crop and print one numbered line per result.
for i, values in enumerate(test_inputs, start=1):
    result = predict_crop(*values)
    print(f"{i}. Input: {values} -> Recommended Crop: {result}")



1. Input: (90, 40, 40, 25.0, 80.0, 6.5, 100.0) -> Recommended Crop: jute
2. Input: (60, 30, 20, 22.5, 75.0, 6.0, 120.0) -> Recommended Crop: maize
3. Input: (80, 60, 50, 27.0, 82.0, 6.8, 95.0) -> Recommended Crop: jute
4. Input: (100, 45, 45, 30.0, 85.0, 7.0, 110.0) -> Recommended Crop: cotton
5. Input: (70, 55, 65, 28.0, 78.0, 6.4, 105.0) -> Recommended Crop: jute
6. Input: (65, 40, 50, 26.0, 70.0, 5.8, 90.0) -> Recommended Crop: maize
7. Input: (55, 20, 25, 24.0, 72.0, 6.3, 130.0) -> Recommended Crop: jute
8. Input: (85, 65, 60, 29.0, 90.0, 6.7, 115.0) -> Recommended Crop: banana
9. Input: (95, 50, 70, 31.0, 88.0, 6.9, 102.0) -> Recommended Crop: jute
10. Input: (50, 25, 30, 23.0, 68.0, 6.1, 85.0) -> Recommended Crop: jute




In [23]:
# Record the installed scikit-learn version in the notebook output, next to the
# pickled artifacts it produced.
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)

1.4.2
