In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import binomtest,chisquare,ttest_1samp,chi2_contingency,ttest_ind,f_oneway,pearsonr
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder,MinMaxScaler,StandardScaler,PolynomialFeatures,RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest,f_classif,f_regression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.pipeline  import Pipeline , make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold


In [2]:
Pipeline1 = make_pipeline(SimpleImputer(strategy='mean'),StandardScaler())
Pipeline1

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [3]:
Pipeline2 = make_pipeline(KNNImputer(n_neighbors=5),MinMaxScaler())
Pipeline2

0,1,2
,steps,"[('knnimputer', ...), ('minmaxscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [4]:
Pipeline3 = make_pipeline(SimpleImputer(strategy='variance'),RobustScaler(),SelectKBest(score_func=f_regression,k=3))
Pipeline3

0,1,2
,steps,"[('simpleimputer', ...), ('robustscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'variance'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,score_func,<function f_r...x7d96c0963420>
,k,3


In [5]:

Pipeline5 = make_pipeline(SimpleImputer(strategy="mean"),VarianceThreshold(),StandardScaler())
Pipeline5

0,1,2
,steps,"[('simpleimputer', ...), ('variancethreshold', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,threshold,0.0

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [6]:
categories_cols = ['romaric','albert','thomas']
numerical_cols = [1,25,36]


transforsN = Pipeline(steps=[
    ("Data",OneHotEncoder())
])

transformation = ColumnTransformer(transformers=[
    ("CAT",transforsN,categories_cols),
    ("NUM",StandardScaler(),numerical_cols)
])

transformation

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [7]:
transforsN2 = Pipeline(steps=[
    ("encoder",OrdinalEncoder())
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  
    ('scaler', MinMaxScaler())  
])
transformations = ColumnTransformer(transformers=[
    ("CAT",transforsN2,categories_cols),
    ("Num",numerical_transformer,numerical_cols),
])

transformations

0,1,2
,transformers,"[('CAT', ...), ('Num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [8]:
categoriepip3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OneHotEncoder())
])

numeriqueP3 = Pipeline(steps=[
    ("imputer",KNNImputer()),
    ("Outlier",RobustScaler())
])

transformation3 = ColumnTransformer(transformers=[
    ("CAT",categoriepip3,categories_cols),
    ("NUM",numeriqueP3,numerical_cols),
])

pipeline5 = make_pipeline(transformation3,SelectKBest())
pipeline5

0,1,2
,steps,"[('columntransformer', ...), ('selectkbest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10


In [9]:
from sklearn.preprocessing import PowerTransformer
categoriepip3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder())
])

numeriqueP3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("Outlier",PowerTransformer())
])

transformation3 = ColumnTransformer(transformers=[
    ("CAT",categoriepip3,categories_cols),
    ("NUM",numeriqueP3,numerical_cols),
])

pipeline6 = make_pipeline(transformation3,VarianceThreshold())
pipeline6

0,1,2
,steps,"[('columntransformer', ...), ('variancethreshold', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,threshold,0.0


In [10]:
from sklearn.preprocessing import PowerTransformer
categoriepip3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder()),
    ("selection",SelectKBest(score_func=f_classif,k=5))
])

numeriqueP3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("Outlier",StandardScaler())
])

transformation3 = ColumnTransformer(transformers=[
    ("CAT",categoriepip3,categories_cols),
    ("NUM",numeriqueP3,numerical_cols),
])

transformation3

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,5

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [11]:
categoriepip3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder()),
    ("selection",SelectKBest(score_func=f_classif,k=5))
])

numeriqueP3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("iqr",SelectKBest()),
    ("Outlier",StandardScaler())
])

transformation3 = ColumnTransformer(transformers=[
    ("CAT",categoriepip3,categories_cols),
    ("NUM",numeriqueP3,numerical_cols),
])

pipline7 = make_pipeline(transformation3,PolynomialFeatures(),RobustScaler())
pipline7

0,1,2
,steps,"[('columntransformer', ...), ('polynomialfeatures', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,5

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False


In [12]:
categoriepip3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OneHotEncoder())
])

numeriqueP3 = Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("iqr",SelectKBest()),
    ("Outlier",StandardScaler()),
    ("poly",PolynomialFeatures()),
    ("select",SelectKBest())
])

transformation3 = ColumnTransformer(transformers=[
    ("CAT",categoriepip3,categories_cols),
    ("NUM",numeriqueP3,numerical_cols),
])

pipline7 = make_pipeline(transformation3,RobustScaler())
pipline7

0,1,2
,steps,"[('columntransformer', ...), ('robustscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False


In [13]:
categories_cols = ['romaric','albert','thomas']
numerical_cols = [1,25,36]
continue_cols = [2.3,25.6,36.2]

niveau5=Pipeline(steps=[
    ("encoder",OneHotEncoder())
])

niveau5_2=Pipeline(steps=[
    ("encoder",OrdinalEncoder())
])

niveau5_3 =Pipeline(steps=[
    ("encoder",StandardScaler())
])

column_transformer2 =ColumnTransformer(transformers=[
    ('CAT_NOM',niveau5,categories_cols),
    ('CAT_ORD',niveau5_2,numerical_cols),
    ('NUM',niveau5_3,continue_cols)
])

pipeline6 = make_pipeline(column_transformer2,SelectKBest())
pipeline6

0,1,2
,steps,"[('columntransformer', ...), ('selectkbest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT_NOM', ...), ('CAT_ORD', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10


In [15]:
categories_cols = ['romaric','albert','thomas']
numerical_cols = [1,25,36]
continue_cols = [2.3,25.6,36.2]

niveau5=Pipeline(steps=[
    ("encoder",OneHotEncoder()),
    ("imputer",SimpleImputer())
])

niveau5_2=Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder())
])

niveau5_3 =Pipeline(steps=[
    ("imputer",KNNImputer()),
    ("encoder",StandardScaler())
])

column_transformer2 =ColumnTransformer(transformers=[
    ('CAT_NOM',niveau5,categories_cols),
    ('CAT_ORD',niveau5_2,numerical_cols),
    ('NUM',niveau5_3,continue_cols)
])

pipeline6 = make_pipeline(column_transformer2,SelectKBest())
pipeline6

0,1,2
,steps,"[('columntransformer', ...), ('selectkbest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT_NOM', ...), ('CAT_ORD', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10


In [16]:
categories_cols = ['romaric','albert','thomas']
numerical_cols = [1,25,36]
continue_cols = [2.3,25.6,36.2]

niveau5=Pipeline(steps=[
    ("encoder",OneHotEncoder()),
    ("imputer",SimpleImputer())
])

niveau5_2=Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder()),
    ('selection',SelectKBest())
])

niveau5_3 =Pipeline(steps=[
    ("imputer",KNNImputer()),
    ("encoder",StandardScaler()),
    ("selection",PolynomialFeatures())
])

column_transformer2 =ColumnTransformer(transformers=[
    ('CAT_NOM',niveau5,categories_cols),
    ('CAT_ORD',niveau5_2,numerical_cols),
    ('NUM',niveau5_3,continue_cols)
])

pipeline6 = make_pipeline(column_transformer2,SelectKBest())
pipeline6

0,1,2
,steps,"[('columntransformer', ...), ('selectkbest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT_NOM', ...), ('CAT_ORD', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10


In [19]:
categories_cols = ['romaric','albert','thomas']
numerical_cols = [1,25,36]
continue_cols = [2.3,25.6,36.2]

niveau5=Pipeline(steps=[
    ("encoder",OneHotEncoder()),
    ("imputer",SimpleImputer())
])

niveau5_2=Pipeline(steps=[
    ("imputer",SimpleImputer()),
    ("encoder",OrdinalEncoder())
])

niveau5_3 =Pipeline(steps=[
    ("imputer",KNNImputer()),
    ("encoder",StandardScaler())
])

niveau5_4 =Pipeline(steps=[
    ("imputer",KNNImputer()),
    ("encoder",StandardScaler())
])

column_transformer2 =ColumnTransformer(transformers=[
    ('CAT_NOM',niveau5,categories_cols),
    ('CAT_ORD',niveau5_2,numerical_cols),
    ('CAT_BIN',niveau5_3,continue_cols),
    ('NUM',niveau5_4,continue_cols)
    
])

pipeline6 = make_pipeline(column_transformer2,SelectKBest(),PolynomialFeatures(),transformation)
pipeline6

0,1,2
,steps,"[('columntransformer-1', ...), ('selectkbest', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('CAT_NOM', ...), ('CAT_ORD', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...x7d96c0962fc0>
,k,10

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,transformers,"[('CAT', ...), ('NUM', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True
