In [None]:
import xgboost as xgb
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
crop = pl.read_csv(r"Crop_recommendation.csv")

print(crop.head())

shape: (5, 8)
┌─────┬─────┬─────┬─────────────┬───────────┬──────────┬────────────┬───────┐
│ N   ┆ P   ┆ K   ┆ temperature ┆ humidity  ┆ ph       ┆ rainfall   ┆ label │
│ --- ┆ --- ┆ --- ┆ ---         ┆ ---       ┆ ---      ┆ ---        ┆ ---   │
│ i64 ┆ i64 ┆ i64 ┆ f64         ┆ f64       ┆ f64      ┆ f64        ┆ str   │
╞═════╪═════╪═════╪═════════════╪═══════════╪══════════╪════════════╪═══════╡
│ 90  ┆ 42  ┆ 43  ┆ 20.879744   ┆ 82.002744 ┆ 6.502985 ┆ 202.935536 ┆ rice  │
│ 85  ┆ 58  ┆ 41  ┆ 21.770462   ┆ 80.319644 ┆ 7.038096 ┆ 226.655537 ┆ rice  │
│ 60  ┆ 55  ┆ 44  ┆ 23.004459   ┆ 82.320763 ┆ 7.840207 ┆ 263.964248 ┆ rice  │
│ 74  ┆ 35  ┆ 40  ┆ 26.491096   ┆ 80.158363 ┆ 6.980401 ┆ 242.864034 ┆ rice  │
│ 78  ┆ 42  ┆ 42  ┆ 20.130175   ┆ 81.604873 ┆ 7.628473 ┆ 262.71734  ┆ rice  │
└─────┴─────┴─────┴─────────────┴───────────┴──────────┴────────────┴───────┘


In [4]:
# crop.select(pl.last()) = The last column of the crop df.
output_column: np.ndarray = crop.select(pl.last()) .to_series() .to_numpy()

le = LabelEncoder().fit(output_column)

X = crop.select(pl.all().exclude('label')) .to_numpy()
y = le.transform(output_column)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [21]:
pl.DataFrame({'crop_no' : y, 'crop_name' : crop.select(pl.col('label'))}).unique(subset = 'crop_no').sort('crop_no')

crop_no,crop_name
i32,str
0,"""apple"""
1,"""banana"""
2,"""blackgram"""
3,"""chickpea"""
4,"""coconut"""
…,…
17,"""papaya"""
18,"""pigeonpeas"""
19,"""pomegranate"""
20,"""rice"""


In [9]:
parameters = {'colsample_bytree': [0.4, 0.5, 0.8], 
              'learning_rate': [0.1, 0.2, 0.01, 0.001],
              'max_depth': [3, 5, 7, 9, 12],
              'n_estimators': [100, 150, 200, 300, 500],
              'subsample': [0.4, 0.5, 0.7, 0.8],}

search = GridSearchCV(estimator = xgb.XGBClassifier(),
                      param_grid = parameters,
                      cv = 5,
                      n_jobs = -1)

search.fit(X = X, y = y) # The main dataset (X, y), not training dataset since GridSearchCV creates training and testing dataset
                         # from its own.
print(f"Best Accuracy Score = {search.best_score_}.")
print(f"\nBest Hyperparameters values : \n{search.best_params_}.")

Best Accuracy Score = 0.9981818181818181.

Best Hyperparameters values : 
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}.


In [17]:
clf = xgb.XGBClassifier(colsample_bytree = 0.5, # The fraction of features used for each tree.
                        learning_rate = 0.1,
                        max_depth = 5,
                        n_estimators = 100,
                        subsample = 0.7, # The fraction of samples used for each tree.
                        random_state = 62) .fit(X_train, y_train)

pred = clf.predict(np.array([[85, 40, 50, 23, 80, 6, 200]])) # Output : [20].
print("Prediction =", pred) 

print(le.inverse_transform(pred)) # Output : ['rice'] because 20 is rice.

joblib.dump(value = clf, filename = r"D:\VS CODE\ML Projects\crop_app\app\trained_models\xgb_model.pkl")
joblib.dump(value = le, filename = r"D:\VS CODE\ML Projects\crop_app\app\trained_models\labelEncoder_model.pkl")

Prediction = [20]
['rice']


['D:\\VS CODE\\ML Projects\\crop_app\\app\\trained_models\\labelEncoder_model.pkl']

In [10]:
crop.group_by(crop_name = 'label').agg(pl.all().first()).drop('label').write_csv("unique_crops_sample.csv")