In [102]:
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [103]:
# Load the dataset (assuming it's already been loaded into df)
df = pd.read_csv('winequality-red.csv')


In [104]:
# Prepare the data
X = df.drop('quality', axis=1).values  # Features
y = df['quality'].values  # Target variable

In [105]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [106]:
# Create 10 subsets using ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
trees = []
scores = []

In [93]:
# Train 1 decision tree on each subset
for train_index, _ in ss.split(X_train):
    X_subset, y_subset = X_train[train_index], y_train[train_index]
    clf = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)
    clf.fit(X_subset, y_subset)
    trees.append(clf)

In [94]:
# Evaluate all the trees on the test dataset
for tree in trees:
    y_pred = tree.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

In [95]:
# Calculate the average score of the trees
average_score = np.mean(scores)
print(f'Average accuracy of the trees: {average_score}')


Average accuracy of the trees: 0.5540625


In [96]:
# Compare with the single decision tree model (assuming it's named clf_single and has been trained previously)
y_pred_single = clf.predict(X_test)
score_single = accuracy_score(y_test, y_pred_single)
print(f'Accuracy of the single decision tree: {score_single}')


Accuracy of the single decision tree: 0.540625
