In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# pandas is already imported as `pd` in the notebook's first (imports) cell.

# Change the path if needed
DATA_PATH = "/content/drive/MyDrive/Datasets/apple_quality.csv"

# Keep the frame exactly as loaded under its own name so that this cell is
# idempotent: re-running it always rebuilds `df` from `raw_df`.
raw_df = pd.read_csv(DATA_PATH)

# Clean without in-place mutation:
#   1. convert 'Acidity' to numeric, turning unparseable entries into NaN
#   2. drop every row that has a missing value in any column
df = (
    raw_df
    .assign(Acidity=pd.to_numeric(raw_df["Acidity"], errors="coerce"))
    .dropna()
)

# Take a quick look
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the summary.
df.info()

   A_id      Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness  \
0   0.0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840   
1   1.0 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530   
2   2.0 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033   
3   3.0 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761   
4   4.0  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849   

    Acidity Quality  
0 -0.491590    good  
1 -0.722809    good  
2  2.621636     bad  
3  0.790723    good  
4  0.501984    good  
<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   float64
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness  

In [None]:
# Columns that are not model inputs: the row identifier and the label itself.
non_feature_cols = ["A_id", "Quality"]

# Label vector: the Quality column.
y = df["Quality"]

# Feature matrix: every remaining column.
X = df.drop(non_feature_cols, axis=1)

print(X.head())
print(y.value_counts())

       Size    Weight  Sweetness  Crunchiness  Juiciness  Ripeness   Acidity
0 -3.970049 -2.512336   5.346330    -1.012009   1.844900  0.329840 -0.491590
1 -1.195217 -2.839257   3.664059     1.588232   0.853286  0.867530 -0.722809
2 -0.292024 -1.351282  -1.738429    -0.342616   2.838636 -0.038033  2.621636
3 -0.657196 -2.271627   1.324874    -0.097875   3.637970 -3.413761  0.790723
4  1.364217 -1.296612  -0.384658    -0.553006   3.030874 -1.303849  0.501984
Quality
good    2004
bad     1996
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows for testing (80% train). The fixed random_state
# gives the same split on every run, and stratifying on y keeps the good/bad
# ratio similar in the train and test parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each part.
for label, part in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, part.shape)

Train shape: (3200, 7)
Test shape: (800, 7)


In [None]:
# Two-stage model: standardise the features, then classify with an
# RBF-kernel support vector machine (C=10, gamma=0.1).
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("svm", SVC(C=10, gamma=0.1, kernel="rbf")),
]
svm_pipeline = Pipeline(steps=pipeline_steps)


In [None]:
# Fit the scaler and the SVM on the training split only.
svm_pipeline.fit(X=X_train, y=y_train)
print("Model training completed.")


Model training completed.


In [None]:
# Score the held-out test split with the fitted pipeline.
y_pred = svm_pipeline.predict(X_test)

# Compute every metric first, then print them together.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
# Rows are the true labels, columns the predicted labels.
matrix = confusion_matrix(y_test, y_pred)

print("Test Accuracy:", accuracy)

print("\nClassification Report:")
print(report_text)

print("\nConfusion Matrix:")
print(matrix)


Test Accuracy: 0.89625

Classification Report:
              precision    recall  f1-score   support

         bad       0.89      0.90      0.90       399
        good       0.90      0.89      0.90       401

    accuracy                           0.90       800
   macro avg       0.90      0.90      0.90       800
weighted avg       0.90      0.90      0.90       800


Confusion Matrix:
[[360  39]
 [ 44 357]]
