Q1. You are working on a machine learning project where you have a dataset containing numerical and
categorical features. You have identified that some of the features are highly correlated and there are
missing values in some of the columns. You want to build a pipeline that automates the feature
engineering process and handles the missing values.

Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then
use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its
accuracy.

In [48]:
import seaborn as sns


In [49]:
df = sns.load_dataset('tips')

In [50]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [52]:
df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [53]:
from sklearn.preprocessing import LabelEncoder

In [54]:
encoder =LabelEncoder()

In [55]:
df['time'] = encoder.fit_transform(df['time'])

In [56]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [57]:
df['time'].value_counts()

0    176
1     68
Name: time, dtype: int64

In [58]:
# independent and dependent feature
X = df.drop(labels=['time'],axis=1)
y = df['time']

In [59]:
# train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20 , random_state=42)

In [60]:
X_train.shape , X_test.shape

((195, 6), (49, 6))

In [61]:
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
228,13.28,2.72,Male,No,Sat,2
208,24.27,2.03,Male,Yes,Sat,2
96,27.28,4.0,Male,Yes,Fri,2
167,31.71,4.5,Male,No,Sun,4
84,15.98,2.03,Male,No,Thur,2


In [62]:
from sklearn.impute import SimpleImputer# handling missing value
from sklearn.preprocessing import OneHotEncoder# handling categorical features
from sklearn.preprocessing import StandardScaler# feature scaling
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [63]:
num_cols = ['total_bill' , 'tip' , 'size']
cat_cols = ['sex' , 'smoker' , 'day']

In [64]:
num_pipeline = Pipeline(
    
    steps = [
        ('imputer',SimpleImputer(strategy = 'mean')),  # handle missing values
        ('scaler' ,StandardScaler())  # feature scaling
    ]


)

cat_pipeline = Pipeline(

    steps = [
        ('imputer',SimpleImputer(strategy = 'most_frequent')), # handling missing values
        ('onehotencoder',OneHotEncoder()), # categorical to numerical
    ]
)

In [65]:
preprocessor = ColumnTransformer([
    ('numerical pipeline' ,  num_pipeline , num_cols),
    ('categorical pipeline', cat_pipeline , cat_cols)
])

In [66]:
X_train = preprocessor.fit_transform(X_train)


In [67]:
X_test = preprocessor.transform(X_test)
X_test

array([[-0.04546101,  0.06468811, -0.61214068,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.30860871, -0.76316144, -0.61214068,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.4952348 , -0.76316144,  1.51942062,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.64841289,  1.45379161,  1.51942062,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.82506891, -0.76316144, -0.61214068,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.26608319, -0.76316144, -0.61214068,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.   

In [76]:
from sklearn.ensemble import RandomForestClassifier , VotingClassifier
from sklearn.linear_model import LogisticRegression

In [74]:
models = {
    'random forest' : RandomForestClassifier(),
    'logistic regression' : LogisticRegression()
}

In [70]:
from sklearn.metrics import accuracy_score

In [83]:
def evaluate_model(X_train,X_test,y_train,y_test,models):
    
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        # train model
        model.fit(X_train,y_train)
        
        # predict
        y_test_pred = model.predict(X_test)
        
        
        
        # performance analysis
        test_model_score = accuracy_score(y_test,y_test_pred)
        report[list(models.keys())[i]] = test_model_score
        
    return report

In [84]:
evaluate_model(X_train,X_test,y_train,y_test,models)

{'random forest': 0.9591836734693877, 'logistic regression': 1.0}

In [79]:
def voting(X_train,X_test,y_train,y_test):
    clf1 = LogisticRegression( random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    
    eclf1 = VotingClassifier(
        
        estimators=[
            ('lr', clf1), ('rf', clf2)], voting='hard')
    
    eclf1 = eclf1.fit(X_train, y_train)
    return eclf1.predict(X_test)

In [80]:
voting(X_train,X_test,y_train,y_test)


array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1])