In [24]:
import pandas as pd
import numpy as np
from semiq_ml.baseline_model import BaselineModel
import seaborn as sns
import matplotlib.pyplot as plt

# Name of the label column; it is split off from the features before fitting.
TARGET: str = "Fertilizer Name"
# One shared seed, passed to both the row subsampling and the model wrapper.
SEED: int = 42

In [25]:
# Read each split from the working directory and drop its 'id' column in the
# same step, leaving only the feature columns (plus the label in `train`).
# NOTE(review): the test ids are not kept anywhere in this cell — confirm they
# are not needed later (e.g. to key predictions back to rows).
train = pd.read_csv('train.csv').drop(columns='id')
test = pd.read_csv('test.csv').drop(columns='id')

In [26]:
# Schema / null-count overview of the training frame *before* it is shrunk.
train.info()

# Keep a seeded 0.1% random subset of the rows and renumber the index from 0.
# NOTE(review): `train` is rebound from itself, so re-running this cell samples
# the already-reduced frame again; reload the data before re-executing.
train = (
    train
    .sample(frac=0.001, random_state=SEED)
    .reset_index(drop=True)
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      750000 non-null  int64 
 1   Humidity         750000 non-null  int64 
 2   Moisture         750000 non-null  int64 
 3   Soil Type        750000 non-null  object
 4   Crop Type        750000 non-null  object
 5   Nitrogen         750000 non-null  int64 
 6   Potassium        750000 non-null  int64 
 7   Phosphorous      750000 non-null  int64 
 8   Fertilizer Name  750000 non-null  object
dtypes: int64(6), object(3)
memory usage: 51.5+ MB


In [27]:
# Summary statistics, transposed so each described column becomes a row.
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temparature,750.0,31.482667,3.950845,25.0,28.0,32.0,35.0,38.0
Humidity,750.0,60.834667,6.711238,50.0,55.0,61.0,67.0,72.0
Moisture,750.0,44.784,11.710288,25.0,35.0,44.5,55.0,65.0
Nitrogen,750.0,22.805333,11.172251,4.0,13.0,23.0,32.0,42.0
Potassium,750.0,9.593333,5.77816,0.0,4.25,10.0,15.0,19.0
Phosphorous,750.0,21.732,12.854279,0.0,10.0,22.0,33.0,42.0


In [28]:
# Columns stored with dtype 'object' (string-valued columns in this frame).
obj_cols = train.select_dtypes(include=['object']).columns

# For each of them, print the number of distinct values followed by the
# frequency table of its (at most) ten most common values.
for col in obj_cols:
    print(
        f"Unique values in {col}: {train[col].nunique()}",
        train[col].value_counts().iloc[:10],
        sep='\n',
    )

Unique values in Soil Type: 5
Soil Type
Sandy     166
Black     155
Loamy     145
Clayey    142
Red       142
Name: count, dtype: int64
Unique values in Crop Type: 11
Crop Type
Paddy          90
Pulses         90
Tobacco        69
Maize          69
Sugarcane      68
Millets        66
Barley         62
Ground Nuts    61
Cotton         61
Wheat          61
Name: count, dtype: int64
Unique values in Fertilizer Name: 7
Fertilizer Name
10-26-26    126
14-35-14    116
DAP         116
17-17-17    104
20-20       104
28-28        92
Urea         92
Name: count, dtype: int64


In [32]:
# Configure the baseline sweep: a classification task scored by accuracy and
# seeded with SEED. `models='all'` presumably selects every estimator the
# library ships — TODO confirm against the semiq_ml documentation.
model = BaselineModel(
    task_type='classification',
    metric='accuracy',
    models='all',
    random_state=SEED,
)

In [33]:
# Fit on every column except the label. The call is deliberately left as the
# cell's final expression, so whatever `fit` returns is echoed as the output.
model.fit(X=train.drop(columns=TARGET), y=train[TARGET])

2025-06-09 18:13:02,948 - INFO - Target labels are non-numeric. Applying label encoding.
2025-06-09 18:13:02,951 - INFO - Starting BaselineModel training for classification with metric: accuracy (Maximize: True)
2025-06-09 18:13:02,951 - INFO - Validation set size: 20%
2025-06-09 18:13:02,952 - INFO - Preprocessor type: general_ohe
2025-06-09 18:13:02,952 - INFO - Numeric columns: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
2025-06-09 18:13:02,952 - INFO - Categorical columns: ['Soil Type', 'Crop Type']
2025-06-09 18:13:02,965 - INFO -   Logistic Regression accuracy: 0.1133 (Training Time: 0.01s)
2025-06-09 18:13:02,965 - INFO -   --> NEW BEST model: Logistic Regression with accuracy: 0.1133
2025-06-09 18:13:02,966 - INFO - Preprocessor type: distance_kernel
2025-06-09 18:13:02,966 - INFO - Numeric columns: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
2025-06-09 18:13:02,966 - INFO - Categorical columns: ['Soil Type

<catboost.core.CatBoostClassifier at 0x797eb54be450>

In [34]:
# Display the results table from the fitted wrapper: one row per model with
# its score, time, preprocessor used, status and any error message.
model.get_results()

Unnamed: 0,model,score,time,preprocessor_used,status,error_message
0,CatBoost,0.18,2.940398,catboost_internal,Success,
1,Random Forest,0.166667,0.105663,general_ohe,Success,
2,LGBM,0.16,0.119076,general_ohe,Success,
3,Decision Tree,0.14,0.003382,general_ohe,Success,
4,XGBoost,0.14,0.214416,general_ohe,Success,
5,Logistic Regression,0.113333,0.013621,general_ohe,Success,
6,KNN,0.113333,0.004992,distance_kernel,Success,
7,SVC,0.1,0.077516,distance_kernel,Success,
