****

1.Cross_Validation & Hyperparameter_tuning

2.Model_Selection

3.Model_Evaluation(Precision, Recall, Confusion_matrix, F1_score)

In [13]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Step 1: Problem Definition
# We'll build a classification model to predict the species of iris flowers based on their sepal and petal dimensions.

# Step 2: Data Collection
# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target

# Step 3: Data Preprocessing
# No preprocessing required for the Iris dataset as it's already clean and well-formatted.

# Step 4: Exploratory Data Analysis (EDA)
# No need for EDA in this example, but you could explore the dataset using tools like pandas and matplotlib.

# Step 5: Feature Selection
# No feature selection needed as the Iris dataset has only four informative features.

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Model Selection
# Define a list of classifiers to consider
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Step 7: Hyperparameter Tuning
# Define hyperparameters for grid search for each classifier
param_grid = {
    'Logistic Regression': {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},  # Parameter 'C' for Logistic Regression
    'Random Forest': {'clf__n_estimators': [10, 50, 100, 200], 'clf__max_depth': [None, 5, 10, 15, 20]},
    'SVM': {'clf__C': [0.1, 1, 10, 100], 'clf__gamma': [0.001, 0.01, 0.1, 1], 'clf__kernel': ['rbf', 'linear']}
}

# Perform model selection using GridSearchCV
best_models = {}
for name, clf in classifiers.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', clf)
    ])
    grid_search = GridSearchCV(pipeline, param_grid[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_

# Step 8: Model Evaluation
# Evaluate the best model with cross-validation
for name, model in best_models.items():
    y_pred = cross_val_predict(model, X_test, y_test, cv=5)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"{name} Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print()

# Step 9: Model Interpretation


Logistic Regression Evaluation Metrics:
Accuracy: 0.9000
Precision: 0.9033
Recall: 0.9000
F1-score: 0.9003
Confusion Matrix:
[[10  0  0]
 [ 0  8  1]
 [ 0  2  9]]

Random Forest Evaluation Metrics:
Accuracy: 0.9667
Precision: 0.9700
Recall: 0.9667
F1-score: 0.9668
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  1 10]]

SVM Evaluation Metrics:
Accuracy: 0.9667
Precision: 0.9700
Recall: 0.9667
F1-score: 0.9668
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  1 10]]

