**Column Transformer**

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [4]:
df=pd.read_csv("covid_toy.csv")

In [5]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [6]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
df.shape

(100, 6)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split (and the same outputs below).
x_train,x_test,y_train,y_test=train_test_split(
    df.drop(columns=['has_covid']),df['has_covid'],
    test_size=0.2,random_state=42)

In [9]:
x_train

Unnamed: 0,age,gender,fever,cough,city
44,20,Male,102.0,Strong,Delhi
53,83,Male,98.0,Mild,Delhi
14,51,Male,104.0,Mild,Bangalore
34,74,Male,102.0,Mild,Mumbai
21,73,Male,98.0,Mild,Bangalore
...,...,...,...,...,...
61,81,Female,98.0,Strong,Mumbai
83,17,Female,104.0,Mild,Kolkata
73,34,Male,98.0,Strong,Kolkata
23,80,Female,98.0,Mild,Delhi


**Manual approach: transforming each column separately**

In [11]:
#mean-impute the missing values in the fever column
si=SimpleImputer(strategy="mean")
x_train_fever=si.fit_transform(x_train[['fever']])

#the test data is filled with the mean learned from the training data;
#re-fitting on x_test would leak test-set statistics into the preprocessing
x_test_fever=si.transform(x_test[['fever']])
x_train_fever.shape

(80, 1)

In [12]:
#ordinal encoding--->cough (categories are ranked in the order listed: Mild, then Strong)
oe=OrdinalEncoder(categories=[['Mild','Strong']])
x_train_cough=oe.fit_transform(x_train[['cough']])

#reuse the encoder fitted on the training data for the test data
#(transform only; the test set must never be used for fitting)
x_test_cough=oe.transform(x_test[['cough']])
x_train_cough.shape

(80, 1)

In [13]:
#OneHotEncoding--->gender,city (the first category of each column is dropped)
ohe=OneHotEncoder(drop='first',sparse_output=False)
x_train_gender_city=ohe.fit_transform(x_train[['gender','city']])

#transform (not fit_transform) the test data so it gets the same columns, in
#the same order, as the training data even if a category is absent from x_test
x_test_gender_city=ohe.transform(x_test[['gender','city']])
x_train_gender_city.shape

(80, 4)

In [14]:
#age needs no preprocessing: drop every other feature and keep the raw values
x_train_age=x_train.drop(['gender','fever','cough','city'],axis=1).values

#same extraction for the test data
x_test_age=x_test.drop(['gender','fever','cough','city'],axis=1).values
                       

In [15]:
x_train_age.shape

(80, 1)

In [16]:
#stack the per-column results side by side: age, fever, gender/city, cough
x_train_transformed=np.hstack((x_train_age,x_train_fever,
                               x_train_gender_city,x_train_cough))

In [17]:
#stack the per-column results side by side: age, fever, gender/city, cough
x_test_transformed=np.hstack((x_test_age,x_test_fever,
                              x_test_gender_city,x_test_cough))

In [18]:
x_train_transformed.shape

(80, 7)

**With the help of ColumnTransformer**

In [20]:
from sklearn.compose import ColumnTransformer

# Each entry is (step name, transformer, list of columns it is applied to).
transformer = ColumnTransformer(
    transformers=[
        # fill the missing values of 'fever'; SimpleImputer() uses its default
        # strategy, the column mean
        ('tnf1', SimpleImputer(), ['fever']),
        # encode 'cough' as an ordered category, ranked in the order listed
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # one-hot encode 'gender' and 'city', dropping the first category of each
        ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ],
    # columns not named in any step above are passed through unchanged
    remainder='passthrough',
)

In [21]:
transformer.fit_transform(x_train).shape

(80, 7)

In [22]:
# reuse the transformer already fitted on x_train; re-fitting on x_test would
# learn the imputation mean and the categories from the test data instead
transformer.transform(x_test).shape

(20, 7)

In [23]:
#we use ColumnTransformer because it is more efficient to work with: every column is handled by one fitted object, with no intermediate arrays to build and concatenate by hand

**Function transformer**

In [25]:
#The FunctionTransformer is a tool in scikit-learn, a popular Python library
#for machine learning, that allows you to apply a specified function to the input
#data. The FunctionTransformer can be useful for performing custom 
#transformations of input data in a machine learning pipeline.

#The FunctionTransformer takes as input a single function (the `func` argument). When transform() is called,
#that function is called once with the whole input array X, not once per sample.
#It can be any Python callable that takes a single argument, such as a NumPy ufunc, a lambda function or a user-defined function.
#The function should return the transformed array.

In [26]:
#predefine function example
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# create a dataset
X = np.array([[1, 2], [3, 4]])

# define the transformation function
log_transform = FunctionTransformer(np.log1p)

# apply the transformation to the dataset
X_transformed = log_transform.transform(X)

# view the transformed data
print(X_transformed)

[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


**Custom Feature Engineering**

In [28]:
#EX.1. Custom Feature Engineering

from sklearn.preprocessing import FunctionTransformer
import numpy as np

# create a dataset
X = np.array([[1, 2], [3, 4]])

# define a custom feature engineering function
def sakrat_14(X):
    """Return X with the element-wise squares of its entries stacked on via np.hstack.

    For a 2-D array of shape (n, m) the result has shape (n, 2*m): the
    original columns followed by their squares.
    """
    squared = X ** 2
    return np.hstack([X, squared])

# create a FunctionTransformer to apply the custom function
custom_transformer = FunctionTransformer(sakrat_14)

# apply the transformer to the input data
X_transformed = custom_transformer.transform(X)

# view the transformed data
print(X_transformed)

[[ 1  2  1  4]
 [ 3  4  9 16]]


**Scaling And Normalization**

In [30]:
#Ex.2.Scaling And Normalization
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# create a dataset
X = np.array([[1, 2], [3, 4]])

# define a custom scaling function
def my_scaling(X):
    """Divide every entry of X by the single largest value found anywhere in X."""
    peak = np.max(X)  # global maximum over the whole array, not per column
    return X / peak

# create a FunctionTransformer to apply the custom function
custom_transformer = FunctionTransformer(my_scaling)

# apply the transformer to the input data
X_transformed = custom_transformer.transform(X)

# view the transformed data
print(X_transformed)

[[0.25 0.5 ]
 [0.75 1.  ]]


**Data Cleaning**

In [32]:
# 3.Data Cleaning

from sklearn.preprocessing import FunctionTransformer
import numpy as np

#create a dataset
X=np.array([[1, 2], [3, np.nan]])

#define a custom data-cleaning function

def my_cleaning(X):
    """Return a copy of X in which every NaN entry is replaced by 0.

    The input is left untouched: the replacement is done on a copy, so the
    raw data (including its NaNs) is still available after the transform.
    """
    cleaned = X.copy()  # never write into the caller's array
    cleaned[np.isnan(cleaned)] = 0
    return cleaned

# create a FunctionTransformer to apply the custom function
custom_transformer = FunctionTransformer(my_cleaning)


# apply the transformer to the input data
X_transformed = custom_transformer.transform(X)

# view the transformed data
print(X_transformed)

[[1. 2.]
 [3. 0.]]


**Real life use-case of Function transformer**

In [34]:
import numpy as np
import pandas as pd

In [35]:
df=pd.read_csv("placement.csv")

In [36]:
df

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.30,1
...,...,...,...
95,6.33,6.38,0
96,8.23,7.76,1
97,6.65,7.78,0
98,8.14,5.63,1


In [37]:
df.head(3)

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1


In [38]:
# features: every column except the target column 'placed'
x=df.drop(columns=['placed'])
# target: the 'placed' column
y=df['placed']

In [39]:
from sklearn.preprocessing import FunctionTransformer

In [40]:
# wrap np.log1p, i.e. log(1 + x) applied element-wise, as a transformer
log_transform = FunctionTransformer(np.log1p)

# apply the transformation to the feature columns in x
x_transformed=log_transform.transform(x)

In [41]:
x_transformed

Unnamed: 0,cgpa,resume_score
0,2.212660,2.017566
1,1.969906,1.819699
2,2.226783,2.288486
3,2.064328,2.112635
4,2.142416,2.116256
...,...,...
95,1.991976,1.998774
96,2.222459,2.170196
97,2.034706,2.172476
98,2.212660,1.891605
