# Step 1: Import necessary libraries

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer, load_iris
import pandas as pd
import joblib

# Step 2: Load and prepare data


In [2]:
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Step 3: Define feature engineering steps

In [3]:
# Example column names (assuming numerical features only for simplicity)
numerical_features = data.feature_names
categorical_features = []  # Add categorical feature names if any

# Define transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 4: Create and define the pipeline


In [4]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Step 5: Split data


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Fit the pipeline and evaluate using cross-validation


In [6]:
# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# Perform cross-validation on training data
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean():.2f}')

Cross-validation scores: [0.95833333 1.         0.83333333 1.         0.95833333]
Mean cross-validation score: 0.95


# Step 7: Evaluate on test data


In [7]:
# Predict and evaluate on test data
y_pred = pipeline.predict(X_test)
accuracy = pipeline.score(X_test, y_test)
print(f'Test accuracy: {accuracy:.2f}')

Test accuracy: 1.00


# Step 8: Save the trained pipeline


In [8]:
# Save the pipeline to a file
joblib_file = 'trained_pipeline.pkl'
joblib.dump(pipeline, joblib_file)
print(f'Model saved to {joblib_file}')

Model saved to trained_pipeline.pkl


# Step 9: Load the trained pipeline (when needed)


In [9]:
# Load the pipeline from the file
loaded_pipeline = joblib.load(joblib_file)

# Use the loaded pipeline to make predictions
loaded_pipeline_predictions = loaded_pipeline.predict(X_test)
loaded_pipeline_accuracy = loaded_pipeline.score(X_test, y_test)
print(f'Loaded pipeline test accuracy: {loaded_pipeline_accuracy:.2f}')

Loaded pipeline test accuracy: 1.00


# Step 10: Inference with custom input

In [10]:
# New data for prediction (replace this with your own data)
new_data = pd.DataFrame({
    'sepal length (cm)': [5.1, 6.2],
    'sepal width (cm)': [3.5, 3.4],
    'petal length (cm)': [1.4, 5.4],
    'petal width (cm)': [0.2, 2.3]
})

# Predict using the loaded pipeline
predictions = loaded_pipeline.predict(new_data)
print(f'Predictions for new data: {predictions}')


Predictions for new data: [0 2]
