In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Read the apple-quality CSV from its hard-coded location into a DataFrame
csv_path = "/content/drive/MyDrive/Datasets/apple_quality.csv"
df = pd.read_csv(csv_path)

# Print a preview of the first rows, then the dtype pandas inferred per column
for summary in (df.head(), df.dtypes):
    print(summary)


   A_id      Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness  \
0   0.0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840   
1   1.0 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530   
2   2.0 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033   
3   3.0 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761   
4   4.0  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849   

        Acidity Quality  
0  -0.491590483    good  
1  -0.722809367    good  
2   2.621636473     bad  
3   0.790723217    good  
4   0.501984036    good  
A_id           float64
Size           float64
Weight         float64
Sweetness      float64
Crunchiness    float64
Juiciness      float64
Ripeness       float64
Acidity         object
Quality         object
dtype: object


In [None]:
# 'Acidity' is read as object dtype (see the dtypes output above), so convert
# it to numeric. errors='coerce' turns every entry that cannot be parsed into
# NaN, and the rows holding those NaNs are then dropped.
# A new frame is built and rebound to `df` instead of mutating the loaded
# frame with inplace=True, so the cell is a single, re-runnable expression.
df = (
    df
    .assign(Acidity=pd.to_numeric(df['Acidity'], errors='coerce'))
    .dropna(subset=['Acidity'])
)

In [None]:
# Target: the 'Quality' column (good / bad)
y = df['Quality']

# Features: everything except the target and 'A_id'
# (A_id is presumably just a row identifier -- confirm against the data source)
non_feature_cols = ['A_id', 'Quality']
X = df.drop(non_feature_cols, axis='columns')


In [None]:
# Encode the string target as integer class codes.
# Per the mapping printed by this cell, 'bad' becomes 0 and 'good' becomes 1.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Collect the three summaries first, then print them in the original order
original_labels = y.unique()
encoded_labels = sorted(set(y_encoded))
label_to_code = {
    name: code
    for name, code in zip(le.classes_, le.transform(le.classes_))
}

print("Original labels:", original_labels)
print("Encoded labels:", encoded_labels)
print("Mapping:", label_to_code)


Original labels: ['good' 'bad']
Encoded labels: [np.int64(0), np.int64(1)]
Mapping: {'bad': np.int64(0), 'good': np.int64(1)}


In [None]:
# Split settings:
#   test_size=0.2      -> hold out 20% of the rows for testing
#   random_state=42    -> fixed seed so the split is the same on every run
#   stratify=y_encoded -> keep the good/bad ratio the same in train and test
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_kwargs)

# Report the (rows, columns) shape of each feature split
for split_name, features in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} shape:", features.shape)


Train shape: (3200, 7)
Test shape: (800, 7)


In [None]:
# Hyperparameters for a shallow, regularised decision tree
tree_params = {
    'criterion': 'gini',      # split quality measured by Gini impurity
    'max_depth': 4,           # cap the depth of the tree
    'min_samples_split': 5,   # a node needs at least 5 samples to be split
    'min_samples_leaf': 3,    # every leaf keeps at least 3 samples
    'random_state': 42,       # fixed seed for reproducible fitting
}
dt_model = DecisionTreeClassifier(**tree_params)

# Fit on the training split; as the cell's last expression, the fitted
# estimator is also what the notebook displays.
dt_model.fit(X_train, y_train)


In [None]:
# Predict class codes for the held-out test features.
# The model was fitted on the label-encoded target, so the predictions use
# the same integer encoding as y_test and can be compared with it directly.
y_pred = dt_model.predict(X_test)


In [None]:
# Integer codes in the same order as le.classes_. They are passed explicitly
# as `labels` below so that the report rows and the confusion-matrix axes
# always line up with the class names, even if a class happens to be missing
# from y_test or y_pred.
class_codes = le.transform(le.classes_)

# Accuracy: fraction of test rows whose predicted class equals the true class
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Per-class precision / recall / F1, with the original string labels as row names
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_codes,
        target_names=le.classes_,
    )
)

# Confusion matrix: rows are the true classes and columns the predicted
# classes, both ordered as in class_codes
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_codes))


Accuracy: 0.7350

Classification Report:
              precision    recall  f1-score   support

         bad       0.84      0.58      0.69       399
        good       0.68      0.89      0.77       401

    accuracy                           0.73       800
   macro avg       0.76      0.73      0.73       800
weighted avg       0.76      0.73      0.73       800


Confusion Matrix:
[[232 167]
 [ 45 356]]
