In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load the wine dataset that ships with scikit-learn and pull out the
# feature matrix and the target vector.
data = load_wine()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline model: a single decision tree, seeded so the fitted tree is reproducible.
decision = DecisionTreeClassifier(random_state=42)
decision.fit(X_train, y_train)

# Predict the class of every held-out sample.
predicted_values = decision.predict(X_test)

# Score with the weighted F1 (per-class F1 averaged by class support) -- this is
# NOT plain accuracy, even though the two are often close on this dataset.
f1_score_decision = f1_score(y_test, predicted_values, average='weighted')
print(f"Decision Tree F1 score: {f1_score_decision:.4f}")

Decision Tree F1 score: 0.9440


In [5]:
# Ensemble model: a random forest with default hyperparameters, seeded for
# reproducibility. This estimator is also reused as the base of the grid search.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# NOTE: this rebinds `predicted_values`, replacing the decision-tree predictions
# from the previous cell with the random-forest predictions.
predicted_values = random_forest.predict(X_test)

# Score with the weighted F1 (per-class F1 averaged by class support), not accuracy.
f1_score_random = f1_score(y_test, predicted_values, average='weighted')
print(f"Random Forest Tree F1 score: {f1_score_random:.4f}")

Random Forest Tree F1 score: 1.0000


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination is evaluated (3 * 4 * 3 = 36 candidates).
hyperparameter_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 40],
    "min_samples_split": [2, 5, 10],
}

# Exhaustive search over the grid. `random_forest` only serves as a template:
# the search fits fresh clones of it for each candidate and fold.
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=hyperparameter_grid,
    cv=5,       # number of cross-validation folds
    n_jobs=-1,  # run fits in parallel on all available CPU cores (-1 = all cores)
    verbose=3,  # logging verbosity; higher values print more detail
    scoring="f1_weighted",
)

grid_search.fit(X_train, y_train)

# Best candidate and its mean cross-validated weighted F1 on the training split.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters {best_params}")
print(f"Best F1 score {best_score:.2f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 score 0.98


In [10]:
# Treat the same problem as a regression task: fit regressors on the class labels.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Decision tree regressor: fit on the training split, predict the held-out
# samples, and score the predictions with mean squared error.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)
predicted_dt = dt_regressor.predict(X_test)
dt_mse = mean_squared_error(y_test, predicted_dt)

# Random forest regressor: same fit / predict / score pipeline.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)
predicted_rf = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, predicted_rf)

# Report both errors side by side (lower is better).
print(f"The mean squared error of decision tree {dt_mse:.2f}")
print(f"The mean squared error of random forest {rf_mse:.2f}")

The mean squared error of decision tree 0.17
The mean squared error of random forest 0.06


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space for the randomized search. scipy's randint(low, high) draws
# integers from the half-open range [low, high), so the upper bounds below are
# exclusive. Plain lists are sampled uniformly.
param_grid = {
    "n_estimators": randint(50, 200),
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
}

# Randomized search: evaluates `n_iter` sampled candidates instead of a full grid.
# `rf_regressor` only serves as a template; the search fits clones of it.
random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=param_grid,
    n_jobs=-1,  # run fits in parallel on all available CPU cores
    cv=5,
    n_iter=100,
    scoring="neg_mean_squared_error",
    verbose=3,
    # Seed the candidate sampling; without it every run draws a different set
    # of 100 candidates and the reported best parameters are not reproducible.
    random_state=42,
)

random_search.fit(X_train, y_train)

# The score is the NEGATED mean squared error (scikit-learn maximizes scores),
# so values closer to zero are better.
best_score = random_search.best_score_
best_params = random_search.best_params_

print(f"Best params : {best_params}")
print(f"Best Negative MSE score : {best_score:.3f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best params : {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 167}
Best Negative MSE score : -0.047
