In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset (replace 'data.csv' with the actual file path)
data = pd.read_csv('data.csv')

# Separate features (X) and target (y)
X = data.drop('target_column', axis=1)
y = data['target_column']

# Step 1: Feature Selection using SelectKBest with f_classif score
# Replace k_value with the desired number of top features to select
feature_selector = SelectKBest(score_func=f_classif, k=k_value)

# Step 2: Numerical Pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler()),  # Scale numerical columns using standardization
])

# Step 3: Categorical Pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('encoder', OneHotEncoder()),  # One-hot encode categorical columns
])

# Step 4: Combine Numerical and Categorical Pipelines using ColumnTransformer
# Specify which columns are numerical and which are categorical
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Step 5: Final Model using Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Create the main pipeline with preprocessor and classifier
pipeline = Pipeline([
    ('feature_selector', feature_selector),
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# Step 6: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Step 8: Evaluate the accuracy of the model on the test dataset
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

Interpretation and Possible Improvements:

The pipeline automates the feature engineering process by selecting important features, imputing missing values, scaling numerical features, and one-hot encoding categorical features.
The Random Forest Classifier is used as the final model for classification.
The accuracy of the model on the test dataset provides an estimate of its performance.
Possible improvements include hyperparameter tuning for the Random Forest Classifier, trying other feature selection methods, experimenting with different imputation strategies, or using more advanced preprocessing techniques like feature engineering.

In [8]:
## Ans 2) from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Step 1: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Create individual classifiers
rf_classifier = RandomForestClassifier(random_state=42)
lr_classifier = LogisticRegression(random_state=42)

# Step 3: Create the Voting Classifier
voting_classifier = VotingClassifier(
    estimators=[('rf', rf_classifier), ('lr', lr_classifier)],
    voting='hard'  # Use majority voting
)

# Step 4: Train the Voting Classifier
voting_classifier.fit(X_train, y_train)

# Step 5: Evaluate the accuracy of the model on the test dataset
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


NameError: name 'VotingClassifier' is not defined