In [1]:
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

plt.rcParams['figure.constrained_layout.use'] = True

In [2]:
RandomStateNumber = 13

In [3]:
le = joblib.load("fertilizer_classification/le_classes.pkl")

In [4]:
train = pd.read_csv("fertilizer_classification/fertilizer_train.csv")
test = pd.read_csv("fertilizer_classification/fertilizer_test.csv")
train.head()

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler, LabelEncoder
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, precision_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import xgboost as xgb

import time

set_config(transform_output="pandas")


In [6]:
train.head()
X = train.drop(["Fertilizer Name"], axis = 1)
y = train["Fertilizer Name"]

# Load model

In [7]:
# Feature engineering: add Power (W) feature in the data
def feature_engineer(df):
    df = df.copy()
    df["temp_humidity"] = df["Temparature"] * df["Humidity"]
    df["temp_moisture"] = df["Temparature"] * df["Moisture"]
    df["humidity_moisture"] = df["Humidity"] * df["Moisture"]
    df["NPK"] = df["Nitrogen"] + df["Potassium"] + df["Phosphorous"]
    df["Nitrogen_ratio"] = df["Nitrogen"] / df["NPK"]
    df["Potassium_ratio"] = df["Potassium"] / df["NPK"]
    df["Phosphorous_ratio"] = df["Phosphorous"] / df["NPK"]

    return df

In [8]:
feature_transformer = FunctionTransformer(feature_engineer)

numeric_features = [
    "Temparature", "Humidity", "Moisture",
    "Nitrogen", "Potassium", "Phosphorous",
    "temp_humidity", "temp_moisture", "humidity_moisture",
    "Nitrogen_ratio", "Potassium_ratio", "Phosphorous_ratio"
]

categorical_features = [
    "Soil Type", "Crop Type"
]

# Column Transformer for the different transformation
numericColumns = ColumnTransformer(
    transformers = [
        ('num', MinMaxScaler(), numeric_features),
    ],
    remainder = "passthrough",
    verbose_feature_names_out= False
)


transformColumns = ColumnTransformer(
    transformers =[
        ('num', MinMaxScaler(), numeric_features),
         ('cat', OneHotEncoder(sparse_output= False), categorical_features)
         ],
    verbose_feature_names_out= False)

chartprocessing = Pipeline(
    [
        ('feature_engineering', feature_transformer),
        ('numeric_Columns', numericColumns)
        ]
    )

preprocessing = Pipeline(
    [
        ('feature_engineering', feature_transformer),
        ('transformColumns', transformColumns)
        ]
    )

# # Create pipeline
pipe = Pipeline([
    ('classifier', RandomForestClassifier(random_state= RandomStateNumber)) # Placeholder model
    ,
])

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.2, random_state= RandomStateNumber, stratify= y)


In [10]:
# preprocess train & test data
# X_train_scaled = preprocessing.fit_transform(X_train)
# test_scaled = preprocessing.transform(test)

In [11]:
preprocessing = joblib.load("fertilizer_classification/preprocessing.pkl")
test_scaled = preprocessing.transform(test)

In [12]:
model = joblib.load("fertilizer_classification/best_model.pkl")

In [13]:
y_pred_proba = model.predict_proba(test_scaled)
y_pred_proba

array([[0.15302137, 0.10168125, 0.13534158, ..., 0.18721712, 0.1691439 ,
        0.1218495 ],
       [0.16817959, 0.08149341, 0.24705814, ..., 0.1049432 , 0.0983545 ,
        0.15420109],
       [0.18400468, 0.11482628, 0.11718237, ..., 0.1123246 , 0.13350767,
        0.13309936],
       ...,
       [0.12504932, 0.14692225, 0.1394007 , ..., 0.09650507, 0.19523191,
        0.15509555],
       [0.21272847, 0.12741487, 0.1605332 , ..., 0.14213422, 0.19221094,
        0.09174139],
       [0.15001568, 0.25904343, 0.20452116, ..., 0.10876306, 0.05555468,
        0.08534177]], dtype=float32)

In [14]:
top3_indices_test = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]

In [15]:
classes = le.classes_[top3_indices_test]

In [23]:
submission = pd.read_csv("fertilizer_classification/sample_submission.csv")
submission["Fertilizer Name"] = [" ".join(row) for row in classes]
submission.to_csv("submit.csv", index=False)

In [17]:
result_df = pd.DataFrame()
result_df["id"] = test["id"]
result_df

Unnamed: 0,id
0,750000
1,750001
2,750002
3,750003
4,750004
...,...
249995,999995
249996,999996
249997,999997
249998,999998


In [18]:
classes = le.classes_[top3_indices_test]
classes.shape

(250000, 3)

In [19]:
result_df["fertilizer"] = [" ".join(row) for row in classes]
result_df

Unnamed: 0,id,fertilizer
0,750000,28-28 DAP 10-26-26
1,750001,17-17-17 10-26-26 Urea
2,750002,20-20 10-26-26 DAP
3,750003,14-35-14 17-17-17 DAP
4,750004,20-20 10-26-26 28-28
...,...,...
249995,999995,28-28 17-17-17 14-35-14
249996,999996,10-26-26 14-35-14 Urea
249997,999997,DAP Urea 14-35-14
249998,999998,10-26-26 DAP 17-17-17


In [20]:
classes.tolist()

[['28-28', 'DAP', '10-26-26'],
 ['17-17-17', '10-26-26', 'Urea'],
 ['20-20', '10-26-26', 'DAP'],
 ['14-35-14', '17-17-17', 'DAP'],
 ['20-20', '10-26-26', '28-28'],
 ['28-28', '20-20', '14-35-14'],
 ['28-28', '17-17-17', '14-35-14'],
 ['28-28', '17-17-17', '10-26-26'],
 ['10-26-26', '20-20', '14-35-14'],
 ['20-20', '10-26-26', '17-17-17'],
 ['17-17-17', '14-35-14', '10-26-26'],
 ['DAP', '28-28', '14-35-14'],
 ['20-20', '14-35-14', 'Urea'],
 ['28-28', '17-17-17', '20-20'],
 ['Urea', 'DAP', '28-28'],
 ['20-20', '17-17-17', '14-35-14'],
 ['10-26-26', '28-28', '20-20'],
 ['Urea', '10-26-26', '14-35-14'],
 ['17-17-17', '10-26-26', '28-28'],
 ['14-35-14', 'DAP', '10-26-26'],
 ['10-26-26', '14-35-14', '17-17-17'],
 ['28-28', '10-26-26', '14-35-14'],
 ['10-26-26', '17-17-17', '28-28'],
 ['20-20', '14-35-14', '10-26-26'],
 ['17-17-17', '14-35-14', '28-28'],
 ['14-35-14', '17-17-17', '10-26-26'],
 ['10-26-26', '20-20', '14-35-14'],
 ['10-26-26', 'DAP', 'Urea'],
 ['17-17-17', '28-28', '10-26-26'],

In [21]:
string_representation = " ".join([str(item) for item in classes.tolist()])
string_representation

"['28-28', 'DAP', '10-26-26'] ['17-17-17', '10-26-26', 'Urea'] ['20-20', '10-26-26', 'DAP'] ['14-35-14', '17-17-17', 'DAP'] ['20-20', '10-26-26', '28-28'] ['28-28', '20-20', '14-35-14'] ['28-28', '17-17-17', '14-35-14'] ['28-28', '17-17-17', '10-26-26'] ['10-26-26', '20-20', '14-35-14'] ['20-20', '10-26-26', '17-17-17'] ['17-17-17', '14-35-14', '10-26-26'] ['DAP', '28-28', '14-35-14'] ['20-20', '14-35-14', 'Urea'] ['28-28', '17-17-17', '20-20'] ['Urea', 'DAP', '28-28'] ['20-20', '17-17-17', '14-35-14'] ['10-26-26', '28-28', '20-20'] ['Urea', '10-26-26', '14-35-14'] ['17-17-17', '10-26-26', '28-28'] ['14-35-14', 'DAP', '10-26-26'] ['10-26-26', '14-35-14', '17-17-17'] ['28-28', '10-26-26', '14-35-14'] ['10-26-26', '17-17-17', '28-28'] ['20-20', '14-35-14', '10-26-26'] ['17-17-17', '14-35-14', '28-28'] ['14-35-14', '17-17-17', '10-26-26'] ['10-26-26', '20-20', '14-35-14'] ['10-26-26', 'DAP', 'Urea'] ['17-17-17', '28-28', '10-26-26'] ['20-20', '17-17-17', '14-35-14'] ['10-26-26', '20-20', 

In [22]:
test.head()

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14
