<a href="https://colab.research.google.com/github/raghav-fr/Models-for-hackathon/blob/main/FertilizerPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Fertilizer prediction**

# **Load Data**

In [73]:
import pandas as pd
df = pd.read_csv('Fertilizer Prediction.csv')
df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,32,51,41,Red,Ground Nuts,7,3,19,14-35-14
1,35,58,35,Black,Cotton,4,14,16,Urea
2,27,55,43,Sandy,Sugarcane,28,0,17,20-20
3,33,56,56,Loamy,Ground Nuts,37,5,24,28-28
4,32,70,60,Red,Ground Nuts,4,6,9,14-35-14


In [74]:
df['Crop Type'].unique()

array(['Ground Nuts', 'Cotton', 'Sugarcane', 'Wheat', 'Tobacco', 'Barley',
       'Millets', 'Pulses', 'Oil seeds', 'Maize', 'Paddy'], dtype=object)

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      100000 non-null  int64 
 1   Humidity         100000 non-null  int64 
 2   Moisture         100000 non-null  int64 
 3   Soil Type        100000 non-null  object
 4   Crop Type        100000 non-null  object
 5   Nitrogen         100000 non-null  int64 
 6   Potassium        100000 non-null  int64 
 7   Phosphorous      100000 non-null  int64 
 8   Fertilizer Name  100000 non-null  object
dtypes: int64(6), object(3)
memory usage: 6.9+ MB


# **Data Preparation**

## **Spiliting Data into x and y**

In [76]:
X = df.drop(columns=["Fertilizer Name"])
y = df["Fertilizer Name"]

## **Encoding y values**

In [77]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
label_encoder_y = LabelEncoder()

# Drop rows with missing target values before encoding
df['Fertilizer Name'] = label_encoder_y.fit_transform(y)

## **Spliting columns numerical and categorical**

In [78]:
categorical_cols = ["Soil Type", "Crop Type"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]


## **Encoding x values**

In [79]:
soil_type_label_encoder = LabelEncoder()
df["Soil Type"] = soil_type_label_encoder.fit_transform(df["Soil Type"])
crop_type_label_encoder = LabelEncoder()
df["Crop Type"] = crop_type_label_encoder.fit_transform(df["Crop Type"])

In [80]:
croptype_dict = {}
for i in range(len(df["Crop Type"].unique())):
    croptype_dict[i] = crop_type_label_encoder.inverse_transform([i])[0]
print(croptype_dict)

soiltype_dict = {}
for i in range(len(df["Soil Type"].unique())):
    soiltype_dict[i] = soil_type_label_encoder.inverse_transform([i])[0]
print(soiltype_dict)

{0: 'Barley', 1: 'Cotton', 2: 'Ground Nuts', 3: 'Maize', 4: 'Millets', 5: 'Oil seeds', 6: 'Paddy', 7: 'Pulses', 8: 'Sugarcane', 9: 'Tobacco', 10: 'Wheat'}
{0: 'Black', 1: 'Clayey', 2: 'Loamy', 3: 'Red', 4: 'Sandy'}


In [81]:
X = df.drop(columns=["Fertilizer Name"])
y = df["Fertilizer Name"]

In [82]:
df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,32,51,41,3,2,7,3,19,1
1,35,58,35,0,1,4,14,16,6
2,27,55,43,4,8,28,0,17,3
3,33,56,56,2,2,37,5,24,4
4,32,70,60,3,2,4,6,9,1


In [83]:
from collections import Counter
counter = Counter(y)
counter

Counter({1: 14492, 6: 14325, 3: 14181, 4: 14232, 0: 14378, 5: 14220, 2: 14172})

In [84]:
y_cleaned = y.dropna()
x_cleaned = X.dropna()
x_cleaned.info()
y_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Temparature  100000 non-null  int64
 1   Humidity     100000 non-null  int64
 2   Moisture     100000 non-null  int64
 3   Soil Type    100000 non-null  int64
 4   Crop Type    100000 non-null  int64
 5   Nitrogen     100000 non-null  int64
 6   Potassium    100000 non-null  int64
 7   Phosphorous  100000 non-null  int64
dtypes: int64(8)
memory usage: 6.1 MB
<class 'pandas.core.series.Series'>
RangeIndex: 100000 entries, 0 to 99999
Series name: Fertilizer Name
Non-Null Count   Dtype
--------------   -----
100000 non-null  int64
dtypes: int64(1)
memory usage: 781.4 KB


In [85]:
from imblearn.over_sampling import SMOTE
from collections import Counter
upsample = SMOTE()
x_cleaned, y_cleaned = upsample.fit_resample(x_cleaned, y_cleaned)
counter = Counter(y_cleaned)
print(counter)

Counter({1: 14492, 6: 14492, 3: 14492, 4: 14492, 0: 14492, 5: 14492, 2: 14492})


## **Spliting dataset for training**

In [86]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x_cleaned, y_cleaned, test_size=0.2, random_state=42, stratify=y_cleaned
)

In [87]:
X_train

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
38747,36,57,59,4,10,30,17,20
19897,33,57,64,1,10,12,16,4
24458,28,70,57,2,4,20,17,29
10721,28,66,54,1,3,29,0,19
58520,31,54,41,2,10,14,5,26
...,...,...,...,...,...,...,...,...
58581,33,59,65,2,0,14,9,15
84894,36,55,25,3,9,12,10,15
63323,35,61,30,2,1,33,14,20
43460,38,64,39,2,8,15,14,33


# **Model Building**

## **Random Forest Classifier**

In [88]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=18,
    n_jobs=-1
)
rf.fit(X_train, y_train)

In [89]:
y_pred = rf.predict(X_test)

In [90]:
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder_y.classes_)

print("✅ Model Training Complete!")
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n")
print(report)

✅ Model Training Complete!
Model Accuracy: 15.91%
Classification Report:

              precision    recall  f1-score   support

    10-26-26       0.15      0.16      0.16      2898
    14-35-14       0.15      0.16      0.15      2898
    17-17-17       0.17      0.18      0.17      2898
       20-20       0.16      0.16      0.16      2899
       28-28       0.16      0.15      0.16      2899
         DAP       0.16      0.15      0.15      2898
        Urea       0.16      0.15      0.16      2899

    accuracy                           0.16     20289
   macro avg       0.16      0.16      0.16     20289
weighted avg       0.16      0.16      0.16     20289



In [91]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_train)
print(f"XGBoost Accuracy: {accuracy_score(y_train, y_pred_xgb) * 100:.2f}%")


XGBoost Accuracy: 44.64%


In [92]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_train)
print(f"Decision Tree Accuracy: {accuracy_score(y_train, y_pred_dt) * 100:.2f}%")

Decision Tree Accuracy: 100.00%


In [98]:
import numpy as np
probs = rf.predict_proba(X_test)
top3_idx = np.argsort(probs, axis=1)[:, -3:]

# Fix: Access y_test by positional index using .iloc or convert to a numpy array/list
top3_correct = [y_test.iloc[i] in top3_idx[i] for i in range(len(y_test))]

top3_accuracy = np.mean(top3_correct)

print(f"Top-3 Accuracy: {top3_accuracy * 100:.2f}%")

Top-3 Accuracy: 44.71%
