In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load the dataset
crops = pd.read_csv("soil_measures.csv")

# Exploratory checks. A bare expression that is not the last statement of a
# notebook cell is evaluated and then discarded, so print the results
# explicitly to make them visible.
print(crops.isna().sum())       # count of missing values per column
print(crops["crop"].unique())   # distinct labels in the target column

# Features are every column except the target label.
X = crops.drop(columns="crop")
y = crops["crop"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)
# Score each soil measurement on its own: fit a single-feature classifier on
# the training split and record its weighted F1 on the held-out split.
feature_performance = {}

for feature in ["N", "P", "K", "ph"]:
    # `multi_class` is deprecated since scikit-learn 1.5 and emits a
    # FutureWarning when passed. With the default lbfgs solver, a target with
    # more than two classes is already fit as a multinomial model, so the
    # argument is omitted.
    log_reg = LogisticRegression(max_iter=200)
    # Double brackets keep the single column as a 2-D DataFrame, which is the
    # input shape the estimator expects.
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])

    # "weighted" averages the per-class F1 scores by each class's support.
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")

    feature_performance[feature] = f1
    print(f"F1-score for {feature}: {f1}")

# Find the best predictive feature after the loop
best_feature = max(feature_performance, key=feature_performance.get)
# Single-entry mapping {feature name: F1 score}; left as the final expression
# so the notebook echoes it as the cell result.
best_predictive_feature = {best_feature: feature_performance[best_feature]}
best_predictive_feature

F1-score for N: 0.09460822808525177
F1-score for P: 0.13596627225711835
F1-score for K: 0.25109003900054505
F1-score for ph: 0.04532731061152114


{'K': 0.25109003900054505}