In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import binomtest,chisquare,ttest_1samp,chi2_contingency,ttest_ind,f_oneway,pearsonr
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder,MinMaxScaler,StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest,f_classif,f_regression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.pipeline  import Pipeline , make_pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Load seaborn's "diamonds" example dataset and preview its first five rows.
df = sns.load_dataset(name="diamonds")
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
Pipeline=make_pipeline(OrdinalEncoder(),MinMaxScaler())


In [5]:
Pipeline

0,1,2
,steps,"[('ordinalencoder', ...), ('minmaxscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


pipeline = Pipeline(
    steps=[
        ("encoder",OrdinalEncoder()),
        ("scaler",MinMaxscaler())
        ]
)

In [6]:
Pipeline.fit_transform(train_set)

array([[0.53358209, 0.5       , 0.5       , ..., 0.69652651, 0.70879121,
        0.67486339],
       [0.05223881, 0.5       , 0.5       , ..., 0.13162706, 0.13003663,
        0.15300546],
       [0.07462687, 0.5       , 0.16666667, ..., 0.17915905, 0.19047619,
        0.19672131],
       ...,
       [0.04477612, 0.75      , 0.83333333, ..., 0.10968921, 0.12087912,
        0.1284153 ],
       [0.07835821, 0.5       , 0.5       , ..., 0.19378428, 0.19413919,
        0.19672131],
       [0.26492537, 0.5       , 0.33333333, ..., 0.45521024, 0.45054945,
        0.43442623]], shape=(43152, 10))

In [7]:
# Columns that the ordinal encoder should handle.
categorical_cols = ["cut", "color", "clarity"]

# Ordinal-encode only the columns listed above; every other column is passed
# through unchanged (remainder="passthrough").
transform = ColumnTransformer(
    transformers=[
        ("Encoder", OrdinalEncoder(), categorical_cols),
    ],
    remainder="passthrough",
)
transform

0,1,2
,transformers,"[('Encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [8]:
# Encode the categorical columns through `transform`, then min-max scale every
# column of the result. The pipeline gets its own lowercase name so the
# imported `Pipeline` class is not rebound to an instance.
encode_scale_pipeline = make_pipeline(transform, MinMaxScaler())
encode_scale_pipeline.fit_transform(train_set)

array([[0.5       , 0.5       , 0.57142857, ..., 0.70391061, 0.12903226,
        0.14716981],
       [0.5       , 0.5       , 0.71428571, ..., 0.41620112, 0.075382  ,
        0.08710692],
       [0.5       , 0.16666667, 1.        , ..., 0.44040968, 0.08098472,
        0.09213836],
       ...,
       [0.75      , 0.83333333, 0.57142857, ..., 0.40502793, 0.07453311,
        0.08427673],
       [0.5       , 0.5       , 0.14285714, ..., 0.44785847, 0.08132428,
        0.09213836],
       [0.5       , 0.33333333, 0.42857143, ..., 0.58100559, 0.10509338,
        0.11949686]], shape=(43152, 10))

# Exercice 

In [9]:
# Load the diamonds dataset again under a separate name and preview five rows.
data = sns.load_dataset(name="diamonds")
data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [10]:
# Level 1: polynomial feature expansion, then univariate feature selection,
# then min-max scaling. All three steps use their default parameters.
# NOTE(review): SelectKBest() is left on its default scoring function — confirm
# that scorer suits the target this pipeline will eventually be fitted against.
niveau_1 = make_pipeline(
    PolynomialFeatures(),
    SelectKBest(),
    MinMaxScaler(),
)
niveau_1

0,1,2
,steps,"[('polynomialfeatures', ...), ('selectkbest', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,score_func,<function f_c...x7839161ce700>
,k,10

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [11]:
# Columns to be one-hot encoded.
categorical_cols = ["cut", "color", "clarity"]

# One-hot encode the listed columns and pass every other column through
# unchanged.
transformations = ColumnTransformer(
    transformers=[
        ("categories", OneHotEncoder(), categorical_cols),
    ],
    remainder="passthrough",
)

transformations

0,1,2
,transformers,"[('categories', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [None]:
# Level 2: column-wise one-hot encoding (`transformations`), followed by
# polynomial feature expansion and min-max scaling.
niveau_2 = make_pipeline(
    transformations,
    PolynomialFeatures(),
    MinMaxScaler(),
)
niveau_2

0,1,2
,steps,"[('columntransformer', ...), ('polynomialfeatures', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categories', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [20]:
# Make sure `Pipeline` names the scikit-learn class before it is called below.
from sklearn.pipeline import Pipeline, make_pipeline

# Categorical branch: one-hot encode, then keep the 2 encoded columns that
# score best under f_regression. All other columns are passed through.
columntransformer = ColumnTransformer(
    transformers=[
        (
            "Categorical Col Transformer",
            Pipeline(
                steps=[
                    ("onehot", OneHotEncoder()),
                    ("select", SelectKBest(score_func=f_regression, k=2)),
                ]
            ),
            categorical_cols,
        ),
    ],
    remainder="passthrough",
)

# Full pipeline: column-wise preprocessing, degree-2 polynomial features
# without the bias column, then min-max scaling.
pipeline3 = Pipeline(
    steps=[
        ("Mon ColumnTransformer", columntransformer),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", MinMaxScaler()),
    ]
)
pipeline3

0,1,2
,steps,"[('Mon ColumnTransformer', ...), ('poly', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Categorical Col Transformer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_r...x7839161ceb60>
,k,2

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures

# Column groups, one per preprocessing branch.
# NOTE(review): none of these names appear in the diamonds preview shown
# earlier in this notebook, and the pipeline below is only built and
# displayed, never fitted — confirm which dataset it is meant for.
categorical_col = ["Category"]
continuous_col = ["height", "weight"]
discret_col = ["age", "floor"]

# Categorical branch: one-hot encode, then keep the 2 encoded columns with the
# highest mutual-information score.
cat_tranformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder()),
        ("selector", SelectKBest(mutual_info_regression, k=2)),
    ]
)

# Route each column group to its own transformer. No `remainder` is given, so
# the ColumnTransformer default applies to any column not listed here.
column_transformer = ColumnTransformer(
    transformers=[
        ("Encode_Select", cat_tranformer, categorical_col),
        ("Standardizer", StandardScaler(), continuous_col),
        ("MinMax", MinMaxScaler(), discret_col),
    ]
)

# Final pipeline: column-wise preprocessing, polynomial feature expansion,
# then a last min-max scaling over all generated features.
pipeline4 = Pipeline(
    steps=[
        ("Encode_Select_Scale", column_transformer),
        ("Feature Engineering", PolynomialFeatures()),
        ("MinMax", MinMaxScaler()),
    ]
)

pipeline4

0,1,2
,steps,"[('Encode_Select_Scale', ...), ('Feature Engineering', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Encode_Select', ...), ('Standardizer', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function mut...x7839161cc4a0>
,k,2

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False
