### Exploring the dataset

In [53]:
import pandas as pd
import numpy as np

In [54]:
# Read the crop data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='crop_db.csv')

In [55]:
# Peek at the top five rows to check the columns and value formats
print(df.head(n=5))

    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice


In [56]:
# Get a summary of the dataset (column dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
None


In [57]:
# Count the missing entries in each column (all zeros means no gaps)
print(df.isna().sum(axis=0))


N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64


In [58]:

# Descriptive statistics for the numeric columns:
# count, mean, std, min, quartiles and max
print(df.describe(percentiles=[0.25, 0.5, 0.75]))


                 N            P            K  temperature     humidity  \
count  2200.000000  2200.000000  2200.000000  2200.000000  2200.000000   
mean     50.551818    53.362727    48.149091    25.616244    71.481779   
std      36.917334    32.985883    50.647931     5.063749    22.263812   
min       0.000000     5.000000     5.000000     8.825675    14.258040   
25%      21.000000    28.000000    20.000000    22.769375    60.261953   
50%      37.000000    51.000000    32.000000    25.598693    80.473146   
75%      84.250000    68.000000    49.000000    28.561654    89.948771   
max     140.000000   145.000000   205.000000    43.675493    99.981876   

                ph     rainfall  
count  2200.000000  2200.000000  
mean      6.469480   103.463655  
std       0.773938    54.958389  
min       3.504752    20.211267  
25%       5.971693    64.551686  
50%       6.425045    94.867624  
75%       6.923643   124.267508  
max       9.935091   298.560117  


In [59]:
# How many samples does each crop name (the 'label' column) have?
print(df.loc[:, 'label'].value_counts())

label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64


### Processing the data

In [60]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Target (y) is the crop name; features (X) are all remaining columns
y = df['label']
X = df.drop(columns=['label'])

In [61]:
# Convert categorical target to numerical (Label Encoding)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit on the original crop-name column rather than on `y` itself. With
# `y = le.fit_transform(y)` a second run of this cell would refit the encoder on
# the already-encoded integers, and le.inverse_transform() would then return
# integers instead of crop names. Reading from df['label'] keeps the cell
# idempotent while producing the same `y` on the first run.
y = le.fit_transform(df['label'])



In [62]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [63]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transformation to both the training and the test features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Building the model for training

In [64]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Try Gradient Boosting
# random_state is pinned (same seed as the train/test split) so that a fresh
# "Restart & Run All" reproduces the same fitted model and the same metrics.
gbc = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbc.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split
y_pred_gbc = gbc.predict(X_test_scaled)
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, y_pred_gbc):.4f}")
print("Gradient Boosting Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gbc))



Gradient Boosting Accuracy: 0.9773
Gradient Boosting Confusion Matrix:
 [[23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 26  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0

In [69]:

# Set a confidence threshold (e.g., 0.7 for 70% confidence): a crop is only
# recommended when the model's top class probability reaches this value
confidence_threshold = 0.7

# Sample input for testing: one row, in the same column order as the training
# features (N, P, K, temperature, humidity, ph, rainfall)
test_input = np.array([[10, 3, 0, 25, 90, 6, 0]])

# Scaling the test input using the same scaler. The scaler was fitted on a
# DataFrame, so give it the same column names; a bare array makes scikit-learn
# warn that the input has no valid feature names.
test_input_df = pd.DataFrame(test_input, columns=scaler.feature_names_in_)
test_input_scaled = scaler.transform(test_input_df)

# Predicting the probabilities for each class using Gradient Boosting Classifier
predicted_probs = gbc.predict_proba(test_input_scaled)

# Position of the most likely class in each row, and the probability of that
# class for the (single) sample in row 0
best_column = np.argmax(predicted_probs, axis=1)
max_prob = predicted_probs[0, best_column[0]]

# predict_proba columns are ordered like gbc.classes_, so map the column
# position to the encoded label explicitly instead of assuming they coincide
predicted_crop_index = gbc.classes_[best_column]

# Check if the highest probability is above the confidence threshold
if max_prob >= confidence_threshold:
    # Convert numerical prediction back to the original crop label
    predicted_crop = le.inverse_transform(predicted_crop_index)
    print(f"The recommended crop for the given input is: {predicted_crop[0]} with confidence {max_prob:.2f}")
else:
    # If the confidence is too low, output "No crop found"
    print("No crop found")


No crop found




In [68]:
import pickle

# Persist the fitted objects so they can be reloaded without retraining.
# Each file is written inside a `with` block so the handle is flushed and
# closed deterministically instead of being left open.

# Save the trained Random Forest model
# (`rf` is defined in a cell that is not part of this section)
with open('RFmodel.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

# Save the trained Gradient Boosting model
with open('GBmodel.pkl', 'wb') as model_file:
    pickle.dump(gbc, model_file)

# Save the fitted scaler (needed to transform new inputs the same way)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save the LabelEncoder (needed to turn predictions back into crop names)
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

print("Models and scaler saved successfully!")


Models and scaler saved successfully!


In [77]:
# Reload the raw data and look at the extremes of one column.
# Other columns that can be inspected the same way:
# temperature, humidity, ph, rainfall, label
df = pd.read_csv('crop_db.csv')

# For the text column 'label', min/max give the alphabetically first and last
# crop names rather than a numeric range
min_value, max_value = df['label'].min(), df['label'].max()

print('Min: ', min_value)
print('Max: ', max_value)


Min:  apple
Max:  watermelon
