In [1]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
df=sns.load_dataset('diamonds')
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [3]:
X=df.drop('cut', axis=1)
y=df['cut']

In [4]:
X.shape, y.shape


((53940, 9), (53940,))

In [5]:
X_train, X_test, y_train, y_test=train_test_split(X, y, train_size=0.8, random_state=22)

In [6]:
# Define categorical and numerical features
categorical_features = X.select_dtypes(
   include=["object"]
).columns.tolist()

numerical_features = X.select_dtypes(
   include=["float64", "int64"]
).columns.tolist()

In [7]:
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

In [8]:
pipeline = Pipeline(
   [
       ("preprocessor", preprocessor),
       ("classifier", GradientBoostingClassifier(random_state=42)),
   ]
)


In [9]:
cv_scores=cross_val_score(pipeline, X_train, y_train, cv=5)
pipeline.fit(X_train, y_train)
y_pred=pipeline.predict(X_test)
report=classification_report(y_test, y_pred)

In [10]:
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}')
print('\n Classification Report:')
print(report)

Mean Cross-Validation Accuracy: 0.7653

 Classification Report:
              precision    recall  f1-score   support

        Fair       0.87      0.91      0.89       307
        Good       0.82      0.60      0.70       978
       Ideal       0.82      0.92      0.87      4454
     Premium       0.69      0.85      0.76      2703
   Very Good       0.66      0.40      0.50      2346

    accuracy                           0.76     10788
   macro avg       0.77      0.74      0.74     10788
weighted avg       0.75      0.76      0.75     10788

