In [54]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the dataset
crops = pd.read_csv("soil_measures.csv")

In [55]:
crops.shape #checking shape to know the number of rows and columns

(2200, 5)

In [56]:
print(crops.isna().sum().sort_values()) #checking for null values

N       0
P       0
K       0
ph      0
crop    0
dtype: int64


In [57]:
print(crops.head) #getting a general idea of the data given

<bound method NDFrame.head of         N   P   K        ph    crop
0      90  42  43  6.502985    rice
1      85  58  41  7.038096    rice
2      60  55  44  7.840207    rice
3      74  35  40  6.980401    rice
4      78  42  42  7.628473    rice
...   ...  ..  ..       ...     ...
2195  107  34  32  6.780064  coffee
2196   99  15  27  6.086922  coffee
2197  118  33  30  6.362608  coffee
2198  117  32  34  6.758793  coffee
2199  104  18  30  6.779833  coffee

[2200 rows x 5 columns]>


In [58]:
results = {}
features = ['N', 'P', 'K', 'ph']
#this for loop goes through each feature and fits a logistic regression model 
#and calculates the f1 score for that model as the metric and stores the results in a dictionary
for feature in features: 
    X = crops[feature].values.reshape(-1, 1)  # Reshape X to be 2D
    y = crops['crop']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    f1Score = metrics.f1_score(y_test, y_pred, average = "weighted")
    results[feature] = f1Score

In [59]:
best_predictive_feature = {}
#sorting the results array to put the best feature as the first element
sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
#adding the best feature to our best_predictive_feature array
best_predictive_feature[sorted_results[0][0]] = sorted_results[0][1]
#printing the final result
print("The best predictive feature is '{}' with an f1-score of '{}'".format(
    list(best_predictive_feature.keys())[0], list(best_predictive_feature.values())[0])) 

The best predictive feature is 'K' with an f1-score of '0.2584384395240381'
