In [1]:
import pandas as pd

# Read the lemon dataset from a relative CSV path; the bare `df` on the last
# line makes the notebook render the frame as the cell output.
DATA_PATH = "dataset_lemon.csv"
df = pd.read_csv(DATA_PATH)
df


Unnamed: 0,diameter,berat,tebal_kulit,kadar_gula,asal_daerah,musim_panen,warna,kualitas
0,57.1,105,3.7,8.4,California,Puncak,Kuning cerah,Grade A
1,60.2,118,3.8,8.2,Malang,Puncak,Kuning cerah,Grade A
2,48.5,80,4.6,7.8,Medan,Akhir,Kuning kehijauan,Grade B
3,65.8,136,5.1,7.0,California,Akhir,Hijau pekat,Reject
4,55.9,101,3.5,8.5,Malang,Awal,Kuning cerah,Grade A
...,...,...,...,...,...,...,...,...
790,54.9,98,4.2,7.7,Medan,Akhir,Kuning kehijauan,Grade B
791,67.2,141,5.7,6.8,California,Akhir,Hijau pekat,Reject
792,57.0,105,3.5,8.4,Malang,Awal,Kuning cerah,Grade A
793,51.1,89,4.5,7.9,Medan,Puncak,Kuning kehijauan,Grade B


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# One seed shared by the train/test split and the random forest so that a
# Restart & Run All reproduces the same metrics.
RANDOM_STATE = 42

# Column groups, by the preprocessing each one receives.
numeric_columns = ["diameter", "berat", "tebal_kulit", "kadar_gula"]
categorical_columns = ["asal_daerah", "musim_panen"]
ordinal_columns = ["warna"]

# Explicit category order for `warna`; OrdinalEncoder assigns 0, 1, 2 in
# this order.
warna_order = ["Hijau pekat", "Kuning kehijauan", "Kuning cerah"]
ordinal_order = [warna_order]

# Feature matrix and target.
X = df[["diameter", "berat", "tebal_kulit", "kadar_gula", "asal_daerah", "warna", "musim_panen"]]
y = df["kualitas"]

# NOTE(review): in every row visible in the dataset preview, `warna` maps
# one-to-one onto `kualitas` (Kuning cerah -> Grade A, Kuning kehijauan ->
# Grade B, Hijau pekat -> Reject). If that holds for the whole file, a perfect
# score from this cell reflects the label being encoded in a feature rather
# than real predictive skill -- TODO confirm with
# pd.crosstab(df["warna"], df["kualitas"]) and consider dropping `warna`.

# Hold out 20% for testing. `stratify=y` keeps the class proportions of
# `kualitas` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

preprocessing = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), numeric_columns),
        # handle_unknown="ignore": a category missing from a training fold is
        # encoded as all zeros at predict time instead of raising.
        ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("oe", OrdinalEncoder(categories=ordinal_order), ordinal_columns),
    ]
)

# Preprocessing lives inside the pipeline so it is re-fitted on the training
# portion only, both here and inside every cross-validation fold.
model = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", RandomForestClassifier(random_state=RANDOM_STATE)),
    ]
)

# Fit on the training partition and evaluate on the held-out partition.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print("\nClassification Report :\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix :\n", confusion_matrix(y_test, y_pred))

# 5-fold cross-validation over the full data set; cross_val_score clones the
# pipeline for each fold, so the fit above does not carry over.
scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
print("\nScores :\n", scores)
print("\nMean Scores :", scores.mean())
             

Accuracy : 1.0

Classification Report :
               precision    recall  f1-score   support

     Grade A       1.00      1.00      1.00        71
     Grade B       1.00      1.00      1.00        53
      Reject       1.00      1.00      1.00        35

    accuracy                           1.00       159
   macro avg       1.00      1.00      1.00       159
weighted avg       1.00      1.00      1.00       159


Confusion Matrix :
 [[71  0  0]
 [ 0 53  0]
 [ 0  0 35]]

Scores :
 [1. 1. 1. 1. 1.]

Mean Scores : 1.0
