In [1]:
from pathlib import Path
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# All files the notebook reads or writes are addressed relative to the
# current working directory.
ROOT_PATH = Path()
DATASET_PATH = ROOT_PATH.joinpath('wine_data.csv')

In [3]:
# Load the CSV at DATASET_PATH into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=DATASET_PATH)
data.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,11.6,0.58,0.66,2.2,0.074,10.0,47.0,1.0008,3.25,0.57,9.0,3
1,10.4,0.61,0.49,2.1,0.2,5.0,16.0,0.9994,3.16,0.63,8.4,3
2,7.4,1.185,0.0,4.25,0.097,5.0,14.0,0.9966,3.63,0.54,10.7,3
3,10.4,0.44,0.42,1.5,0.145,34.0,48.0,0.99832,3.38,0.86,9.9,3
4,8.3,1.02,0.02,3.4,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3


In [4]:
# Split the frame into the feature matrix X and the target vector y.
#
# NOTE(review): the preview of `data` shows an 'Unnamed: 0' column that looks
# like a saved row index. If it is a real column it must not be fed to the
# model as a feature (a row index carries no physical meaning and can leak the
# target when rows are ordered by it). errors='ignore' makes the extra drop a
# no-op when that column is absent; a missing 'quality' column still raises
# KeyError on the lookup below.
X = data.drop(columns=['quality', 'Unnamed: 0'], errors='ignore')
y = data['quality']

In [5]:
# Hold out 20% of the rows for evaluation. random_state is fixed so the split,
# and every metric computed from it, is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Feature preparation: standardize each column, then project onto 6 principal
# components. Steps are given as a list of (name, estimator) tuples, which is
# the documented type for Pipeline's `steps` argument.
preprocessing = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
])

# Full model: preprocessing followed by a random forest. random_state is fixed
# so the fitted forest (and therefore the reported scores) is reproducible on
# re-run; the forest is otherwise randomized.
model = Pipeline([
    ('preprocessing', preprocessing),
    ('clf', RandomForestClassifier(n_estimators=187, max_depth=31, random_state=42)),
])

# Trailing ';' suppresses the estimator repr in the cell output.
model.fit(X_train, y_train);

In [7]:
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

# Plain accuracy plus a macro-averaged F-beta score with beta=2.
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred)
f2_dt = fbeta_score(y_true=y_test, y_pred=y_pred, beta=2, average='macro')

# Report both metrics, one per line, rounded to four decimals.
print(
    f'Accuracy: {accuracy_dt:.4f}',
    f'F2 Score: {f2_dt:.4f}',
    sep='\n',
)

Accuracy: 0.6224
F2 Score: 0.6203


In [8]:
# Pickle the two pipeline stages into separate checkpoint files under
# ROOT_PATH: the fitted preprocessing sub-pipeline first, then the classifier.
for filename, step_name in (
    ('preproc.checkpoint', 'preprocessing'),
    ('model.checkpoint', 'clf'),
):
    with (ROOT_PATH / filename).open('wb') as file:
        pickle.dump(model[step_name], file)

In [18]:
# Apply the fitted preprocessing step to the first test row (kept as a
# one-row frame), then classify the transformed row with the classifier step.
first_test_row = X_test.iloc[:1]
processed_data = model['preprocessing'].transform(first_test_row)
model['clf'].predict(processed_data)

array([3])