In [1]:
import pandas as pd

In [2]:
# Read the raw churn dataset from disk and preview its first ten rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on a machine where this exact file location exists.
churn_csv_path = r'/home/ric/Téléchargements/Churn_Modelling.csv'
df_churn = pd.read_csv(churn_csv_path)
df_churn.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [3]:
# Display the dtype of every column of the loaded frame, to see which
# columns are numeric and which are object (string) columns.
df_churn.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [30]:
def encode_ohe(df, numeric_cols, categorical_cols):
    """
    Standardise numeric columns and one-hot encode categorical columns.

    Both groups of columns go through a scikit-learn ``ColumnTransformer``:
    numeric columns have missing values replaced by 0 and are then scaled with
    ``StandardScaler``; categorical columns have missing values replaced by the
    string 'inconnue' and are then one-hot encoded. The transformer is fitted
    on ``df`` itself and applied to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain every column named in ``numeric_cols`` and
        ``categorical_cols``. Columns not listed are dropped from the result.
    numeric_cols : list of str
        Names of the numeric columns.
    categorical_cols : list of str
        Names of the categorical columns.

    Returns
    -------
    ohe_encode_df : pandas.DataFrame
        Dense dataframe holding the standardised numeric columns (original
        names) followed by the one-hot columns (named ``<column>_<category>``).
        It carries the same index as ``df`` so that it can be aligned with
        other objects derived from ``df`` (e.g. a target Series).
    """
    # Imports are kept local so the function stays self-contained in the
    # notebook; `pd` is expected to be imported at notebook level.
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer

    # Accept any sequence of names (tuple, Index, ...), not only lists.
    numeric_cols = list(numeric_cols)
    categorical_cols = list(categorical_cols)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())
        ])

    categ_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='inconnue')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
        ])

    # sparse_threshold=0 forces a dense array: with many categories the stacked
    # result could otherwise be a SciPy sparse matrix, which cannot be wrapped
    # in a DataFrame with the column names computed below.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categ_transformer, categorical_cols)
        ],
        sparse_threshold=0)

    encoded_values = preprocessor.fit_transform(df)

    if categorical_cols:
        encoder = preprocessor.named_transformers_['cat']['ohe']
        # `get_feature_names` was removed in scikit-learn 1.2; fall back to it
        # only on old releases that do not provide `get_feature_names_out`.
        if hasattr(encoder, 'get_feature_names_out'):
            ohe_columns = encoder.get_feature_names_out(categorical_cols)
        else:
            ohe_columns = encoder.get_feature_names(categorical_cols)
    else:
        # With no categorical column the 'cat' pipeline is never fitted, so
        # there is no encoder to query.
        ohe_columns = []
    new_columns = numeric_cols + list(ohe_columns)

    # Keep the caller's index so later index-aligned assignments match rows.
    ohe_encode_df = pd.DataFrame(encoded_values, columns=new_columns, index=df.index)
    return ohe_encode_df


In [31]:
# Feature names, grouped by the preprocessing each group receives.
l_num_col = ["CreditScore","Age","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary"]
l_cat_col = ["Geography","Gender"]

# Features are the contiguous columns from CreditScore to EstimatedSalary
# (label slices include both ends); the target is the Exited column.
X = df_churn.loc[:, "CreditScore":"EstimatedSalary"]
y = df_churn["Exited"]

In [32]:
# Summary statistics of the numeric features, rounded to two decimals.
numeric_summary = X[l_num_col].describe()
print(numeric_summary.round(2))

       CreditScore       Age    Tenure    Balance  NumOfProducts  HasCrCard  \
count     10000.00  10000.00  10000.00   10000.00       10000.00   10000.00   
mean        650.53     38.92      5.01   76485.89           1.53       0.71   
std          96.65     10.49      2.89   62397.41           0.58       0.46   
min         350.00     18.00      0.00       0.00           1.00       0.00   
25%         584.00     32.00      3.00       0.00           1.00       0.00   
50%         652.00     37.00      5.00   97198.54           1.00       1.00   
75%         718.00     44.00      7.00  127644.24           2.00       1.00   
max         850.00     92.00     10.00  250898.09           4.00       1.00   

       IsActiveMember  EstimatedSalary  
count        10000.00         10000.00  
mean             0.52        100090.24  
std              0.50         57510.49  
min              0.00            11.58  
25%              0.00         51002.11  
50%              1.00        100193.92  
7

In [33]:
# Scale/encode the features, then attach the target as an "Exited" column
# (pandas matches the target to the rows by index label).
df_new = encode_ohe(X, l_num_col, l_cat_col).assign(Exited=y)
print(df_new)

      CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
0       -0.326221  0.293517 -1.041760 -1.225848      -0.911583   0.646092   
1       -0.440036  0.198164 -1.387538  0.117350      -0.911583  -1.547768   
2       -1.536794  0.293517  1.032908  1.333053       2.527057   0.646092   
3        0.501521  0.007457 -1.387538 -1.225848       0.807737  -1.547768   
4        2.063884  0.388871 -1.041760  0.785728      -0.911583   0.646092   
...           ...       ...       ...       ...            ...        ...   
9995     1.246488  0.007457 -0.004426 -1.225848       0.807737   0.646092   
9996    -1.391939 -0.373958  1.724464 -0.306379      -0.911583   0.646092   
9997     0.604988 -0.278604  0.687130 -1.225848      -0.911583  -1.547768   
9998     1.256835  0.293517 -0.695982 -0.022608       0.807737   0.646092   
9999     1.463771 -1.041433 -0.350204  0.859965      -0.911583   0.646092   

      IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany 

In [34]:
# Save the transformed dataset to the current working directory.
# NOTE(review): `index=False` is not passed, so the row index is written as an
# extra leading column; confirm that is wanted when the file is read back.
df_new.to_csv("df_churn_transformed.csv")

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 8.5 MB/s eta 0:00:01     |█████████████████▋              | 5.4 MB 8.5 MB/s eta 0:00:01
[?25hCollecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[K     |████████████████████████████████| 297 kB 13.5 MB/s eta 0:00:01
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.0.1
    Uninstalling joblib-1.0.1:
      Successfully uninstalled joblib-1.0.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed joblib-1.2.0 scikit-learn-1.2.2
