In [26]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing


In [2]:
# Load the wine dataset and pull the feature matrix and the class labels
# out of the returned container.
wine = load_wine()
X = wine.data
y = wine.target

In [3]:
# Display the dimensions of the feature matrix and of the label vector.
X.shape, y.shape

((178, 13), (178,))

In [4]:
# Number of distinct class labels present in the target.
len(np.unique(y))

3

In [5]:
# Preview the first five feature rows.
X[:5]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02]])

In [19]:
# Print a small slice of the labels, then the number of distinct classes,
# each on its own line.
print(y[5:10], np.unique(y).size, sep="\n")


[0 0 0 0 0]
3


This is a multi-class classification problem: the target contains three distinct classes.

In [7]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [8]:
# Show the first five labels of the training and the test split.
y_train[:5], y_test[:5]

(array([0, 0, 0, 0, 2]), array([0, 2, 0, 1, 1]))

In [9]:
# Baseline model: a single decision tree split on Gini impurity.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be broken
# differently on each run and the reported scores would not be reproducible.
tree = DecisionTreeClassifier(criterion="gini", random_state=42)
tree.fit(X_train, y_train)

# Class predictions for the held-out test split, scored in a later cell.
y_pred = tree.predict(X_test)


In [10]:
# Ensemble model: a random forest split on Gini impurity.
# random_state is pinned because the forest draws bootstrap samples and random
# feature subsets; without a seed every run trains a different ensemble and the
# reported scores cannot be reproduced.
forest = RandomForestClassifier(criterion="gini", random_state=42)
forest.fit(X_train, y_train)

# Class predictions for the held-out test split, scored in a later cell.
y_pred_forest = forest.predict(X_test)

In [11]:
# Score the decision tree on the held-out test split.
# The F1 score is macro-averaged, i.e. the unweighted mean over the classes.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")

print(f"f1 score of decision tree is: {f1}")
print(f"Accuracy of decision tree is: {accuracy* 100:.4f}")

f1 score of decision tree is: 0.9457411645054665
Accuracy of decision tree is: 94.4444


In [12]:
# Score the random forest on the held-out test split.
# The F1 score is macro-averaged, i.e. the unweighted mean over the classes.
# Note: `f1` and `accuracy` are rebound here and no longer hold the tree's scores.
accuracy = accuracy_score(y_test, y_pred_forest)
f1 = f1_score(y_test, y_pred_forest, average="macro")

print(f"f1 score of random forest is: {f1}")
print(f"Accuracy of random forest is: {accuracy* 100:.4f}")

f1 score of random forest is: 1.0
Accuracy of random forest is: 100.0000


In [13]:
# Hyper-parameter grid for tuning the random forest classifier.
param_grid = {
    # Number of trees in the forest.
    "n_estimators": [50, 100, 150],
    # Split-quality measure. "log_loss" is intentionally not listed: for tree
    # classifiers it is an alias of "entropy", so it would only refit identical
    # models, and it is rejected by scikit-learn releases older than 1.1.
    "criterion": ("gini", "entropy"),
    # Maximum tree depth; None lets each tree grow until its leaves are pure.
    "max_depth": [None, 10, 20, 30],
}


In [14]:
# Exhaustive search over param_grid with a seeded random forest as the base
# estimator. No cv or scoring argument is passed, so GridSearchCV falls back
# to its defaults for both.
rf = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
)


In [15]:
# Run the grid search on the training split only; the test split stays unseen
# until the final evaluation.
gridSearch.fit(X_train, y_train)

In [16]:
# Best parameters from the grid search: the combination that ranked first
# on the cross-validated score.
print("Best Parameters:", gridSearch.best_params_)

# Best score from the grid search: the mean cross-validated score achieved
# by that combination.
print("Best Cross-Validation Accuracy:", gridSearch.best_score_)

Best Parameters: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.9862068965517242


In [17]:
# Take the estimator selected by the grid search and measure how it scores
# on the held-out test split.
best_model = gridSearch.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 1.0000


For the regression task, scikit-learn's built-in California housing dataset is used.

In [21]:
# Fetch the California housing dataset for the regression part.
# NOTE: X and y are rebound here and no longer refer to the wine data, so the
# classification cells above must not be re-run after this point without
# reloading the wine dataset first.
data = fetch_california_housing()
X = data.data
y = data.target

In [22]:
# Step 2: hold out 20% of the housing samples for testing, with a fixed seed
# so the split is repeatable. This rebinds the train/test names used by the
# classification cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [23]:
# Step 3: create the seeded base regressors.
# `dt` is not used by any cell visible in this notebook section.
dt = DecisionTreeRegressor(random_state=42)
# `rf` previously held the classifier; from here on it is the regressor.
rf = RandomForestRegressor(random_state=42)

In [24]:
# Search space sampled by the randomized search for the random forest regressor.
rf_param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 means "consider every feature at each split". It replaces 'auto',
    # which meant the same thing for regressors but was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where candidates using it fail to fit.
    'max_features': [1.0, 'sqrt', 'log2'],
}

In [None]:
# Step 4: randomized hyper-parameter search for the random forest regressor.
# Ten parameter combinations are sampled from rf_param_dist and each is scored
# with 5-fold cross-validation on negated MSE (higher is better). The search is
# seeded and uses all available cores.
rf_random_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
rf_random_search.fit(X_train, y_train)

In [27]:
# Step 5: Evaluate the best models
# Take the estimator selected by the randomized search; it is scored on the
# test split in the following cells.
best_rf = rf_random_search.best_estimator_

In [28]:
# Predict the held-out test split with the tuned forest, then compute the
# coefficient of determination and the mean squared error of those predictions.
rf_predictions = best_rf.predict(X_test)
rf_r2 = r2_score(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

In [29]:
# Report the tuned random forest regressor: the selected hyper-parameters
# followed by its test-set error (MSE) and goodness of fit (R2).
print("Random Forest Regressor Performance:")
print("Best Params:", rf_random_search.best_params_)
print("MSE:", rf_mse)
print("R2 Score:", rf_r2)

Random Forest Regressor Performance:
Best Params: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20}
MSE: 0.24248206592874333
R2 Score: 0.8149569562212153
