In [34]:
import pandas as pd
import seaborn as sns

# Load seaborn's 'tips' dataset, dropping the 'sex' and 'tip' columns
df = sns.load_dataset('tips').drop(columns=['sex','tip'])
# Keep only the Friday/Saturday rows; drop=True renumbers them from 0 without
# copying the old row labels into an extra 'index' column
df = df[df['day'].isin(['Fri','Sat'])].reset_index(drop=True)

In [37]:
# Keep only the columns used below, in this order; any other column is discarded.
# 'smoker' goes last because it is the target that gets split off later.
selected_columns = ['total_bill', 'day', 'time', 'size', 'smoker']
df = df.loc[:, selected_columns]
df.head()

Unnamed: 0,total_bill,day,time,size,smoker
0,20.65,Sat,Dinner,3,No
1,17.92,Sat,Dinner,2,No
2,20.29,Sat,Dinner,2,No
3,15.77,Sat,Dinner,2,No
4,39.42,Sat,Dinner,4,No


In [38]:
import numpy as np
# Blank out every 10th row of 'size' to simulate missing values for the imputation demo.
# 'size' is an integer column and integers cannot hold NaN, so cast it to float
# explicitly instead of relying on pandas' implicit upcast on assignment
# (that silent upcast is deprecated in recent pandas releases).
df['size'] = df['size'].astype('float64')
# Look the column position up by name rather than hard-coding it.
df.iloc[::10, df.columns.get_loc('size')] = np.nan

In [39]:
# Display the frame to confirm that 'size' now contains NaN in every 10th row
df

Unnamed: 0,total_bill,day,time,size,smoker
0,20.65,Sat,Dinner,,No
1,17.92,Sat,Dinner,2.0,No
2,20.29,Sat,Dinner,2.0,No
3,15.77,Sat,Dinner,2.0,No
4,39.42,Sat,Dinner,4.0,No
...,...,...,...,...,...
101,35.83,Sat,Dinner,3.0,No
102,29.03,Sat,Dinner,3.0,No
103,27.18,Sat,Dinner,2.0,Yes
104,22.67,Sat,Dinner,2.0,Yes


In [56]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing: 'smoker' is the target, every other column is a feature.
# A fixed random_state makes the split (and every output derived from it) reproducible
# across kernel restarts; without it each run shuffles the rows differently.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='smoker'),
    df['smoker'],
    test_size=0.2,
    random_state=42,
)

x_train

Unnamed: 0,total_bill,day,time,size
3,15.77,Sat,Dinner,2.0
34,20.23,Sat,Dinner,2.0
24,11.24,Sat,Dinner,2.0
74,25.89,Sat,Dinner,4.0
35,15.01,Sat,Dinner,2.0
...,...,...,...,...
78,12.90,Sat,Dinner,2.0
20,31.27,Sat,Dinner,
27,13.81,Sat,Dinner,2.0
92,22.12,Sat,Dinner,2.0


In [71]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

# Fill the gaps in 'size' with SimpleImputer's default settings.
# The fill value is learned from the training split only and then reused for the
# test split, so the test rows never influence the fit.
impute = SimpleImputer()
impute.fit(x_train[['size']])
x_train_size = impute.transform(x_train[['size']])
x_test_size = impute.transform(x_test[['size']])

# Double brackets keep 'size' two-dimensional, so the result is a single-column 2-D array
x_train_size.shape

(84, 1)

In [81]:
# One-hot encode the categorical columns. drop='first' removes one indicator per
# feature and sparse_output=False returns a dense array.
# Note: despite the *_time names, the results hold the encoded 'time' AND 'day' columns.
categorical_cols = ['time', 'day']
ohe = OneHotEncoder(dtype='int32', sparse_output=False, drop='first')
ohe.fit(x_train[categorical_cols])  # categories are learned from the training split only
x_train_time = ohe.transform(x_train[categorical_cols])
x_test_time = ohe.transform(x_test[categorical_cols])

x_train_time.shape

(84, 2)

In [82]:
# Whatever remains after removing the separately processed columns is used as-is,
# converted to a plain NumPy array so it can be concatenated with the other pieces.
separately_processed = ['day', 'time', 'size']
x_train_bill = x_train.drop(columns=separately_processed).values
x_test_bill = x_test.drop(columns=separately_processed).values

x_train_bill.shape

(84, 1)

In [83]:
# Stitch the pieces back together column-wise, in the order:
# bill | encoded time/day | imputed size
train_parts = (x_train_bill, x_train_time, x_train_size)
test_parts = (x_test_bill, x_test_time, x_test_size)
train_transformed = np.concatenate(train_parts, axis=1)
test_transformed = np.concatenate(test_parts, axis=1)

train_transformed.shape

(84, 4)

In [86]:
# Inspect the manually assembled training matrix. Column order follows the
# concatenation: bill column, encoded time/day columns, imputed size.
train_transformed

array([[15.77      ,  0.        ,  1.        ,  2.        ],
       [20.23      ,  0.        ,  1.        ,  2.        ],
       [11.24      ,  0.        ,  1.        ,  2.        ],
       [25.89      ,  0.        ,  1.        ,  4.        ],
       [15.01      ,  0.        ,  1.        ,  2.        ],
       [12.03      ,  0.        ,  0.        ,  2.50632911],
       [35.83      ,  0.        ,  1.        ,  3.        ],
       [10.63      ,  0.        ,  1.        ,  2.        ],
       [14.        ,  0.        ,  1.        ,  2.        ],
       [10.07      ,  0.        ,  1.        ,  2.        ],
       [24.27      ,  0.        ,  1.        ,  2.        ],
       [16.04      ,  0.        ,  1.        ,  3.        ],
       [17.92      ,  0.        ,  1.        ,  2.        ],
       [27.18      ,  0.        ,  1.        ,  2.        ],
       [26.86      ,  0.        ,  1.        ,  2.        ],
       [14.31      ,  0.        ,  1.        ,  2.        ],
       [44.3       ,  0.

In [88]:
# Re-display the source frame for reference before building the ColumnTransformer
df

Unnamed: 0,total_bill,day,time,size,smoker
0,20.65,Sat,Dinner,,No
1,17.92,Sat,Dinner,2.0,No
2,20.29,Sat,Dinner,2.0,No
3,15.77,Sat,Dinner,2.0,No
4,39.42,Sat,Dinner,4.0,No
...,...,...,...,...,...
101,35.83,Sat,Dinner,3.0,No
102,29.03,Sat,Dinner,3.0,No
103,27.18,Sat,Dinner,2.0,Yes
104,22.67,Sat,Dinner,2.0,Yes


In [89]:
from sklearn.compose import ColumnTransformer

# Declare the per-column preprocessing in a single object: each step is a
# (name, transformer, columns) triple.
size_step = ('tfr1', SimpleImputer(), ['size'])
category_step = (
    'tfr2',
    OneHotEncoder(drop='first', sparse_output=False, dtype='int32'),
    ['time', 'day'],
)

# remainder='passthrough' keeps the columns not named in any step instead of dropping them
transformer = ColumnTransformer(
    transformers=[size_step, category_step],
    remainder='passthrough',
)

In [91]:
# Fit the ColumnTransformer on the training features and show the transformed array.
# Output columns follow the step order: imputed size, encoded time/day, then the
# passed-through remainder (the bill column).
transformer.fit_transform(x_train)


array([[ 2.        ,  0.        ,  1.        , 15.77      ],
       [ 2.        ,  0.        ,  1.        , 20.23      ],
       [ 2.        ,  0.        ,  1.        , 11.24      ],
       [ 4.        ,  0.        ,  1.        , 25.89      ],
       [ 2.        ,  0.        ,  1.        , 15.01      ],
       [ 2.50632911,  0.        ,  0.        , 12.03      ],
       [ 3.        ,  0.        ,  1.        , 35.83      ],
       [ 2.        ,  0.        ,  1.        , 10.63      ],
       [ 2.        ,  0.        ,  1.        , 14.        ],
       [ 2.        ,  0.        ,  1.        , 10.07      ],
       [ 2.        ,  0.        ,  1.        , 24.27      ],
       [ 3.        ,  0.        ,  1.        , 16.04      ],
       [ 2.        ,  0.        ,  1.        , 17.92      ],
       [ 2.        ,  0.        ,  1.        , 27.18      ],
       [ 2.        ,  0.        ,  1.        , 26.86      ],
       [ 2.        ,  0.        ,  1.        , 14.31      ],
       [ 3.        ,  0.