In [25]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the Iris flower dataset
iris_data = pd.DataFrame({
    'sepal_length': [5.1, 4.9, 4.7, 7.0, 6.4, 6.9, 6.5, 6.2, 5.9],
    'sepal_width': [3.5, 3.0, 3.2, 3.2, 3.2, 3.1, 3.0, 3.4, 3.0],
    'petal_length': [1.4, 1.4, 1.3, 4.7, 4.5, 4.9, 5.2, 5.4, 5.1],
    'petal_width': [0.2, 0.2, 0.2, 1.4, 1.5, 1.5, 2.0, 2.3, 1.8],
    'species': ['Iris setosa', 'Iris setosa', 'Iris setosa', 'Iris versicolor', 'Iris versicolor', 'Iris versicolor', 'Iris virginica', 'Iris virginica', 'Iris virginica']
})

# Split the dataset into features (X) and target (y)
X = iris_data.iloc[:, :-1]
y = iris_data.iloc[:, -1]

# One-hot encode the target variable
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y.values.reshape(-1, 1))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest classifier with cross-validation
n_estimators = [10, 50, 100, 150, 200] # define the number of trees to test
max_depth = [2, 3, 4, 5, 6] # define the maximum depth of each tree to test
cv_scores = [] # list to store cross-validation scores
best_score = 0 # variable to store the best cross-validation score
best_params = None # variable to store the best parameters
for n in n_estimators:
    for d in max_depth:
        rf = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=42)
        scores = cross_val_score(rf, X_train, y_train, cv=5)
        mean_score = np.mean(scores)
        cv_scores.append(mean_score)
        if mean_score > best_score:
            best_score = mean_score
            best_params = (n, d)

# Train a Random Forest classifier on the full training set using the best parameters
rf = RandomForestClassifier(n_estimators=best_params[0], max_depth=best_params[1], random_state=42)
rf.fit(X_train, y_train)

# Evaluate the classifier on the testing set
accuracy = rf.score(X_test, y_test)
print(f'Accuracy of Random Forest classifier with n_estimators={best_params[0]}, max_depth={best_params[1]}: {best_score:.2f}')

# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")

# Define the sample data
sample_data = np.array([6.2, 2.8, 4.8, 1.8]).reshape(1, -1)

# Scale the sample data using the scaler we fit earlier
scaled_sample_data = scaler.transform(sample_data)

# Make a prediction on the scaled sample data using the trained Random Forest classifier
predicted_species = rf.predict(scaled_sample_data)

# Inverse transform the predicted species to get the original label
predicted_species = encoder.inverse_transform(predicted_species)

# Print the predicted species
print(predicted_species)







Accuracy of Random Forest classifier with n_estimators=50, max_depth=2: 0.80
[['Iris virginica']]
