In [19]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score
import numpy as np

In [20]:
# Fetch the wine dataset and pull out its feature matrix and class labels
wine = load_wine()
X = wine.data
y = wine.target

In [21]:
# Inspect the feature matrix taken from wine.data
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [22]:
# Inspect the target vector taken from wine.target (one class label per sample)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [23]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Search space that RandomizedSearchCV samples candidates from.
# scipy's randint(low, high) draws integers from low up to, but not including, high.
param_dist = dict(
    max_depth=randint(1, 10),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
    criterion=["gini", "entropy"],
)

In [25]:
# Base decision tree whose hyperparameters are tuned below
dt_clf = DecisionTreeClassifier(random_state=42)

# Randomized search: 100 sampled candidates, each scored by 5-fold cross-validated accuracy
random_search = RandomizedSearchCV(
    estimator=dt_clf,
    param_distributions=param_dist,
    n_iter=100,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [26]:
# Report the best parameter combination found by the search (header and dict on separate lines)
print("Best hyperparameters found:", random_search.best_params_, sep="\n")

Best hyperparameters found:
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 6}


In [27]:
# Score the best estimator from the search on the held-out test set
dt_best = random_search.best_estimator_
y_pred_dt = dt_best.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)

print("Decision Tree Accuracy:", accuracy_dt)

Decision Tree Accuracy: 0.9444444444444444


The tuned decision tree achieves an accuracy of 94.44% on the test set.

### Grow a random forest

In [28]:
# Splitter that yields 10 random shuffles of the rows it is given;
# each shuffle keeps 80% of the rows and leaves out the other 20% (test_size=0.2)
ss = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)


In [29]:
# Fit one decision tree per subset, reusing the best hyperparameters from the search
best_params = random_search.best_params_
forest = []

# Only the "train" indices of each shuffle are used; the left-out indices are ignored
for train_index, _ in ss.split(X_train):
    X_train_sub = X_train[train_index]
    y_train_sub = y_train[train_index]

    dt_sub = DecisionTreeClassifier(random_state=42, **best_params)
    # fit() returns the fitted estimator itself, so it can be appended directly
    forest.append(dt_sub.fit(X_train_sub, y_train_sub))

In [30]:
# Collect every tree's test-set predictions: one row per test sample, one column per tree
y_pred_rf = np.zeros((len(X_test), len(forest)))

# Iterating over the transpose yields writable views of the columns of y_pred_rf,
# so assigning into `column` fills the matching column in place
for column, dt_sub in zip(y_pred_rf.T, forest):
    column[:] = dt_sub.predict(X_test)

In [31]:
# Aggregate predictions from all trees by majority vote.
# Class labels are categorical, so averaging them is not meaningful: votes split
# between classes 0 and 2 would average to 1, a class that no tree predicted.
# Instead, count how many trees voted for each class and keep the most frequent one.

classes = np.unique(y_pred_rf)  # sorted distinct labels predicted by any tree
# vote_counts[i, k] = number of trees that predicted classes[k] for test sample i
vote_counts = np.stack([(y_pred_rf == label).sum(axis=1) for label in classes], axis=1)
# argmax returns the first maximum, so ties are resolved in favour of the smallest label
y_pred_rf_aggregate = classes[vote_counts.argmax(axis=1)]

accuracy_rf = accuracy_score(y_test, y_pred_rf_aggregate)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.9722222222222222


The ensemble of ten trees achieves an accuracy of 97.22% on the test set.