In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the CSV file into a DataFrame.
data = pd.read_csv("/content/drive/MyDrive/Datasets/apple_quality.csv")

# Take a quick look at the first rows.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()


   A_id      Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness  \
0   0.0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840   
1   1.0 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530   
2   2.0 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033   
3   3.0 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761   
4   4.0  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849   

        Acidity Quality  
0  -0.491590483    good  
1  -0.722809367    good  
2   2.621636473     bad  
3   0.790723217    good  
4   0.501984036    good  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-nu

In [None]:
# Coerce Acidity to a numeric dtype (entries that cannot be parsed become NaN),
# then discard every row that contains a missing value.
data = data.assign(
    Acidity=lambda frame: pd.to_numeric(frame["Acidity"], errors="coerce")
).dropna()

# Target/label (y) is the Quality column; features (X) are everything else
# except the A_id identifier column.
y = data["Quality"]
X = data.drop(columns=["A_id", "Quality"])

print(X.head())
print(y.head())


       Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness   Acidity
0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840 -0.491590
1 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530 -0.722809
2 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033  2.621636
3 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761  0.790723
4  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849  0.501984
0    good
1    good
2     bad
3    good
4    good
Name: Quality, dtype: object


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the good/bad
# ratio the same in the train and test partitions; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (3200, 7)
Test shape: (800, 7)


In [None]:
# Gradient-boosted classifier: 300 boosting stages of depth-3 trees, each
# fitted on an 80% subsample of the training rows and scaled by a 0.05
# learning rate. random_state fixes the subsampling for repeatable results.
gb_model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.8,
)


In [None]:
# fit() returns the estimator itself, so rebinding the name keeps gb_model
# pointing at the same, now-trained, object.
gb_model = gb_model.fit(X_train, y_train)
print("Model training completed!")


Model training completed!


In [None]:
# Score the held-out test split with the fitted model and compute every
# metric up front, then report them in order.
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Overall fraction of correct predictions.
print("Test Accuracy:", accuracy)

# Per-class precision / recall / F1 and the raw confusion counts; sep="\n"
# puts each heading on its own line directly above its body.
print("\nClassification Report:", report_text, sep="\n")
print("\nConfusion Matrix:", confusion, sep="\n")


Test Accuracy: 0.84625

Classification Report:
              precision    recall  f1-score   support

         bad       0.87      0.82      0.84       399
        good       0.83      0.87      0.85       401

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800


Confusion Matrix:
[[327  72]
 [ 51 350]]
