In [None]:
# Q1: Feature Engineering Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel

# Assuming 'X_train', 'y_train', 'X_test', 'y_test' are your datasets.
# NOTE(review): 'numerical_features' and 'categorical_features' are not
# defined in this section -- presumably lists of column names of X_train;
# confirm they are defined before this code runs.

# Numerical Pipeline: fill missing values with the column mean, then
# standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical Pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# when the test set contains a category that was absent from the training
# set (the default, 'error', would make pipeline.score fail in that case).
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ])

# Feature Selection: keep the features a random forest ranks as important.
feature_selection = SelectFromModel(
    RandomForestClassifier(n_estimators=100, max_depth=10)
)

# Final Pipeline with Preprocessing, Feature Selection and Model.
# Preprocessing must come first: the selector's forest has to be fit on
# imputed, encoded data, and the ColumnTransformer must receive the original
# columns it selects from rather than the selector's reduced output.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10)),
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate on the test set (score() of a classifier pipeline is mean accuracy)
accuracy = pipeline.score(X_test, y_test)

# Interpretation and Suggestions for Improvement
# Add comments or feature importance analysis as needed.
# Q2: Ensemble Pipeline

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming 'X_train', 'y_train', 'X_test', 'y_test' are your datasets

# Base learners that will vote in the ensemble
rf_classifier = RandomForestClassifier(max_depth=10, n_estimators=100)
lr_classifier = LogisticRegression()

# Voting ensemble over the two base learners; voting='hard' combines their
# predicted class labels by majority rule.
ensemble_pipeline = VotingClassifier(
    estimators=[('rf', rf_classifier), ('lr', lr_classifier)],
    voting='hard',
)

# Fit every base learner on the training split
ensemble_pipeline.fit(X_train, y_train)

# Predict the test split, then measure the fraction of correct labels
y_pred = ensemble_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
