In [62]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [63]:
# Load the crop-recommendation dataset (path is relative to the working directory).
df = pd.read_csv("Crop_recommendation.csv")

In [64]:
# Show the column names to confirm which fields were loaded.
df.columns

Index(['temperature', 'humidity', 'ph', 'water availability', 'season',
       'label'],
      dtype='object')

In [88]:
# Number of rows for each distinct value of the 'season' column.
df.groupby('season').size()

season
rainy     600
spring    100
summer    300
winter    400
dtype: int64

In [66]:
# Count of missing values in each column.
df.isna().sum()

temperature           0
humidity              0
ph                    0
water availability    0
season                0
label                 0
dtype: int64

In [67]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,temperature,humidity,ph,water availability,season,label
0,20.879744,82.002744,6.502985,202.935536,rainy,rice
1,21.770462,80.319644,7.038096,226.655537,rainy,rice
2,23.004459,82.320763,7.840207,263.964248,rainy,rice
3,26.491096,80.158363,6.980401,242.864034,rainy,rice
4,20.130175,81.604873,7.628473,262.71734,rainy,rice


In [85]:
# Column groups used when building the preprocessing step.
cat_cols = [
    'season',  # categorical
]
num_cols = [
    'temperature',
    'humidity',
    'ph',
    'water availability',
]  # numerical



In [69]:
# Feature matrix and target.
# The selection reuses num_cols + cat_cols (the same five columns, in the same
# order as before) so the feature list is defined in one place only.
# `.copy()` makes X an independent frame: later column assignments on X then
# do not raise SettingWithCopyWarning and cannot depend on whether pandas
# returned a view or a copy of df.
X = df[num_cols + cat_cols].copy()
y = df['label']

In [83]:
# Normalise the season labels: trim surrounding whitespace and lower-case them,
# so formatting differences do not create separate categories.
# A single strip+lower is equivalent to the previous lower followed by
# strip+lower. `assign` returns a new frame instead of writing into a slice
# of df, and re-running the cell gives the same result.
X = X.assign(season=X['season'].str.strip().str.lower())

In [71]:
# Encode the crop names as integer class ids.
# The encoder is fitted on the raw label column rather than on `y`: if this
# cell is run twice, `y` already holds integers, and refitting on it would
# replace the crop names in le.classes_ with those integers, so the saved
# encoder could no longer map predictions back to crop names.
le = LabelEncoder()
y = le.fit_transform(df['label'])

In [86]:
# One-hot encode the categorical columns and pass the numeric columns through
# unchanged.
# handle_unknown='ignore' matches the encoder parameters reported by the
# fitted pipeline in this notebook's recorded output. With it, a season value
# not seen during fit is encoded as all zeros (scikit-learn emits a warning;
# with drop='first' this coincides with the dropped category) instead of
# raising an error at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

In [73]:
# Full model: preprocessing followed by a random-forest classifier.
# The forest is seeded (same seed as the train/test split) so that a fresh
# Restart & Run All trains the same model and reports the same metrics.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])


In [74]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [75]:
# Fit the preprocessing + classifier pipeline on the training split.
model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [76]:
# Score the fitted pipeline on the held-out split and report the result.
accuracy = model.score(X=X_test, y=y_test)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.9892857142857143


In [77]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted
# class). `confusion_matrix` is imported in the notebook's import cell so all
# dependencies are declared in one place.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[22  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 14  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 18  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 39  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 22  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0 19  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 20]]


In [78]:
# Persist the fitted pipeline and the label encoder next to the notebook.
# Both files are written to the current working directory.
joblib.dump(value=model, filename="crop_model.pkl")
joblib.dump(value=le, filename="label_encoder.pkl")

['label_encoder.pkl']