# Predictive Modeling for Agriculture
Predicting crop types using soil nutrient composition (N, P, K, pH).

## 1. Importing the necessary libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

## 2. Loading the dataset

In [2]:
# Load the soil measurements into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
SOIL_DATA_FILE = "soil_measures.csv"
crops = pd.read_csv(SOIL_DATA_FILE)


In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
first_rows = crops.head(n=5)
print(first_rows)

    N   P   K        ph  crop
0  90  42  43  6.502985  rice
1  85  58  41  7.038096  rice
2  60  55  44  7.840207  rice
3  74  35  40  6.980401  rice
4  78  42  42  7.628473  rice


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = crops.describe()
print(numeric_summary)

                 N            P            K           ph
count  2200.000000  2200.000000  2200.000000  2200.000000
mean     50.551818    53.362727    48.149091     6.469480
std      36.917334    32.985883    50.647931     0.773938
min       0.000000     5.000000     5.000000     3.504752
25%      21.000000    28.000000    20.000000     5.971693
50%      37.000000    51.000000    32.000000     6.425045
75%      84.250000    68.000000    49.000000     6.923643
max     140.000000   145.000000   205.000000     9.935091


In [5]:
# Count the samples per crop label to check how balanced the target classes are.
crop_counts = crops["crop"].value_counts()
print("Target distribution of crop", crop_counts, sep="\n")

Target distribution of crop
crop
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64


## 3. Exploring missing values

In [6]:
# Count missing values per column (isna() is the same check as isnull()).
missing_per_column = crops.isna().sum()
print(missing_per_column)

N       0
P       0
K       0
ph      0
crop    0
dtype: int64


## 4. Define the features (X) and the target (y)

In [7]:
# Target: the crop label. Features: every remaining column of the frame.
y = crops["crop"]
X = crops.drop(columns=["crop"])

## 5. Split into training and test sets

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 6. Create a dictionary for results

In [9]:
# Maps each feature name to the F1-score of the model trained on that feature alone.
feature_performance = dict()

## 7. Train a logistic regression for each feature

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Fit one single-feature logistic regression per soil measurement and record
# its weighted F1-score on the held-out test set.
# Iterating over the training columns (rather than a hard-coded list) keeps
# the loop in sync with whatever features X actually contains.
for feature in X_train.columns:
    # `multi_class` is intentionally not passed: the argument is deprecated in
    # recent scikit-learn releases, and with the default lbfgs solver a
    # multiclass target is already fitted as a multinomial model.
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])

    # Weighted F1: per-class F1-scores averaged by each class's support.
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    feature_performance[feature] = f1

    print(f"F1-score for {feature}: {f1}")

F1-score for N: 0.09675206283523911
F1-score for P: 0.11760989823570633
F1-score for K: 0.19764435585972828
F1-score for ph: 0.0458225366614312


## 8. Save the best feature

In [12]:
# Derive the winner from the recorded scores instead of hard-coding it, so
# this cell stays correct if the data, the split, or the model settings change.
best_feature = max(feature_performance, key=feature_performance.get)
best_predictive_feature = {best_feature: feature_performance[best_feature]}
best_predictive_feature

{'K': np.float64(0.19764435585972828)}

## Conclusion
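Among the four soil features tested individually, **K (potassium)** had the highest predictive power for crop classification, with a weighted F1-score of about 0.20. That score is still low in absolute terms, so no single soil measurement is a strong predictor of crop type on its own.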