# ANSWER 1

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns 

In [46]:
# Load seaborn's bundled "tips" example dataset into a DataFrame.
df = sns.load_dataset('tips')
# Preview the first five rows as this cell's rich output.
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [47]:
# Target is the meal time; every remaining column is used as a feature.
y = df['time']
X = df.drop(columns=['time'])

In [48]:
# Column groups that are routed to separate preprocessing pipelines.
categorical_cols = [
    'sex',
    'smoker',
    'day',
]
numerical_cols = [
    'total_bill',
    'size',
    'tip',
]

In [49]:
# Numerical features: fill missing values with the column median, then
# standardize with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros for that feature instead of raising an error
# at predict time.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [50]:
# Send each column group through its own sub-pipeline and concatenate the
# transformed outputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [51]:
# Seed the forest so that Restart & Run All reproduces the same model and
# accuracy; 42 matches the seed used for the train/test split.
classifier = RandomForestClassifier(random_state=42)

# Chain preprocessing and the model into a single estimator.
final_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
)

In [52]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [53]:
# Fit the preprocessing steps and the classifier on the training split.
final_pipeline.fit(X=X_train, y=y_train)

In [54]:
# Score the fitted pipeline on the held-out split.
y_pred = final_pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.972972972972973


## Explanation of the pipeline steps:
1. Numerical Pipeline: For numerical features, we use a pipeline with an imputer that fills missing values with the median and a scaler that standardizes the numerical features.
2. Categorical Pipeline: For categorical features, we use a pipeline with an imputer that fills missing values with the most frequent value and an encoder that performs one-hot encoding on the categorical features.
3. ColumnTransformer: We combine the numerical and categorical pipelines using ColumnTransformer. It applies the respective preprocessing steps to the corresponding feature types.
4. Final Pipeline: The final pipeline combines the preprocessing steps using ColumnTransformer and includes the Random Forest Classifier.
5. Split Data: We split the dataset into a training set (70%) and a test set (30%) using train_test_split.
6. Train Model: We fit the final pipeline on the training data to train the Random Forest Classifier.
7. Evaluate Model: We predict the target labels on the test set and calculate the accuracy of the model using accuracy_score.

## Interpretation of the Results:
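The pipeline automates the feature engineering steps (imputing missing values, scaling numerical features, and one-hot encoding categorical features) and trains a Random Forest Classifier as a single estimator. Processing numerical and categorical columns in separate sub-pipelines lets each feature type receive the treatment appropriate to it. In the run shown above, the model reached an accuracy of about 0.97 on the held-out test set.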

## Possible Improvements for the Pipeline:
1. Hyperparameter Tuning: Perform hyperparameter tuning for the Random Forest Classifier to find the best set of hyperparameters.
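2. Feature Selection: The current pipeline performs no feature selection (SelectKBest and f_classif are imported but not used). If a selection step is added, then instead of keeping a fixed number of features (e.g., top 10), use methods like Recursive Feature Elimination (RFE) or L1 regularization to automatically select the optimal number of features.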
3. Model Evaluation: Besides accuracy, evaluate the model's performance using other metrics like precision, recall, F1-score, and area under the ROC curve (AUC) to get a more comprehensive assessment.
4. Feature Engineering: Experiment with different feature engineering techniques, such as creating interaction terms, polynomial features, or custom feature transformations to improve model performance.
5. Ensemble Methods: Try other ensemble methods like Gradient Boosting or XGBoost and compare their performance with the Random Forest model.
6. Cross-Validation: Use cross-validation to get a more robust estimate of the model's performance.
7. Handling Imbalanced Data: If the dataset is imbalanced, consider using techniques like oversampling, undersampling, or class weights to handle class imbalances.


# ANSWER 2

In [55]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


In [56]:
# Load the iris dataset and unpack its feature matrix and class labels.
iris = load_iris()
X = iris.data
y = iris.target

In [57]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [58]:
# Base estimators for the ensemble, both seeded with the same value.
# NOTE(review): LogisticRegression keeps its default solver settings and is
# later fitted on unscaled features - confirm it converges without a
# ConvergenceWarning.
Logistic_Regression = LogisticRegression(random_state=42)
Random_Forest_Classifier = RandomForestClassifier(random_state=42)

In [59]:
# Hard-voting ensemble over the two base estimators.
# NOTE(review): with only two voters, any disagreement is a tie; scikit-learn
# documents that ties are resolved by ascending class-label order - confirm
# that this is the intended behaviour.
voting_classifier = VotingClassifier(
    estimators=[
        ('Random_Forest_Classifier', Random_Forest_Classifier),
        ('Logistic_Regression', Logistic_Regression),
    ],
    voting='hard',
)

In [60]:
# Fit both base estimators of the ensemble on the training split.
voting_classifier.fit(X=X_train, y=y_train)

In [61]:
# Predict class labels for the held-out samples with the fitted ensemble.
y_pred = voting_classifier.predict(X=X_test)

In [62]:
# Fraction of held-out samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
