Import the needed Python libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

Read the data for soybean harvest to start doing the Exploratory Analysis

In [6]:
data = pd.read_csv("data.csv")


In [3]:
data.head(10)

Unnamed: 0,Season,Cultivar,Repetition,PH,IFP,NLP,NGP,NGL,NS,MHG,GY
0,1,NEO 760 CE,1,58.8,15.2,98.2,177.8,1.81,5.2,152.2,3232.82
1,1,NEO 760 CE,2,58.6,13.4,102.0,195.0,1.85,7.2,141.69,3517.36
2,1,NEO 760 CE,3,63.4,17.2,100.4,203.0,2.02,6.8,148.81,3391.46
3,1,NEO 760 CE,4,60.27,15.27,100.2,191.93,1.89,6.4,148.5,3312.58
4,1,MANU IPRO,1,81.2,18.0,98.8,173.0,1.75,7.4,145.59,3230.99
5,1,MANU IPRO,2,75.8,20.8,69.2,128.0,1.85,7.2,154.87,3374.8
6,1,MANU IPRO,3,84.4,15.8,95.4,161.8,1.7,6.8,150.23,3182.76
7,1,MANU IPRO,4,80.47,18.2,87.8,154.27,1.77,7.13,149.9,3165.72
8,1,77HO111I2X - GUAPORÉ,1,52.2,14.4,64.8,148.8,2.3,7.2,180.25,3640.46
9,1,77HO111I2X - GUAPORÉ,2,55.8,15.8,72.0,188.8,2.62,6.2,176.75,3602.34


Check the correlation between the variables so we can proceed to answer a question

In this case we have choosen 2 variables with a strong correlation:

*   NLP. Number of Legumes per Plant
*   NS. Number of steams

In [10]:
fig = px.scatter(data, x="NLP", y="NS", color="Season",size='MHG', hover_data="Cultivar",trendline="ols", color_discrete_sequence=px.colors.qualitative.D3)
fig.show()

In [12]:
X = pd.DataFrame(data, columns=['NLP','NS'])
y = pd.Series(data['Season'], name='Season')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train Classification Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)

In [14]:
# Evaluate Model Performance
print("\nModel Performance:")
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}:")
    print(classification_report(y_test, y_pred))


Model Performance:
Logistic Regression:
              precision    recall  f1-score   support

           1       0.78      0.88      0.82        32
           2       0.86      0.75      0.80        32

    accuracy                           0.81        64
   macro avg       0.82      0.81      0.81        64
weighted avg       0.82      0.81      0.81        64

Decision Tree:
              precision    recall  f1-score   support

           1       0.68      0.84      0.75        32
           2       0.79      0.59      0.68        32

    accuracy                           0.72        64
   macro avg       0.73      0.72      0.71        64
weighted avg       0.73      0.72      0.71        64

Random Forest:
              precision    recall  f1-score   support

           1       0.74      0.81      0.78        32
           2       0.79      0.72      0.75        32

    accuracy                           0.77        64
   macro avg       0.77      0.77      0.77        64
wei

In [15]:
# Visualize Model Comparison
model_names = list(models.keys())
accuracies = [accuracy_score(y_test, model.predict(X_test)) for model in models.values()]
accuracy_df = pd.DataFrame({'Model': model_names, 'Accuracy': accuracies})

fig = px.bar(accuracy_df, x='Model', y='Accuracy', title='Model Comparison')
fig.show()