In [3]:
#Q1. You are working on a machine learning project where you have a dataset containing numerical and categorical features. You have identified that some of the features are highly correlated and there are missing values in some of the columns. You want to build a pipeline that automates the feature engineering process and handles the missing valuesD
#Ans.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset
df = pd.read_csv('dataset.csv')

# Separate the target variable from the features
X = df.drop('target', axis=1)
y = df['target']

# Define the column transformer for numerical and categorical features
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, X.select_dtypes(include=['float', 'int'])),
    ('cat', cat_transformer, X.select_dtypes(include=['object']))
])

# Define the feature selection method
selector = SelectKBest(score_func=f_classif, k=10)

# Define the classifier
clf = RandomForestClassifier()

# Define the pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector),
    ('clf', clf)
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Evaluate the pipeline on the test data
accuracy = pipe.score(X_test, y_test)
print("Accuracy:", accuracy)

In [6]:
#Q2. Build a pipeline that includes a random forest classifier and a logistic regress#on classifier, and then use a voting classifier to combine the#r predictions. Train the pipeline on the iris dataset and evaluate its accuracy.
#Ans.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
# Assuming X is the feature matrix and y is the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the pipelines for the Random Forest Classifier and Logistic Regression Classifier
rf_pipeline = Pipeline([
    ('rf_imputer', SimpleImputer(strategy='mean')),
    ('rf_scaler', StandardScaler()),
    ('rf_classifier', RandomForestClassifier())
])

lr_pipeline = Pipeline([
    ('lr_imputer', SimpleImputer(strategy='most_frequent')),
    ('lr_onehot', OneHotEncoder(handle_unknown='ignore')),
    ('lr_classifier', LogisticRegression())
])

# Define the voting classifier
voting_classifier = VotingClassifier(
    estimators=[('rf', rf_pipeline), ('lr', lr_pipeline)],
    voting='soft' # using soft voting to take probabilities into account
)

# Train the voting classifier
voting_classifier.fit(X_train, y_train)

# Evaluate the accuracy of the voting classifier on the test dataset
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.8032786885245902
