In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's 'penguins' example dataset and preview its first rows.
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


ML Supervisado, clasificación

In [3]:
# (rows, columns) of the raw frame.
df.shape

(344, 7)

In [4]:
# Count the missing values in each column.
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Funciones SKLearn

- make_pipeline
- StandardScaler
- MinMaxScaler
- LabelEncoder
- OneHotEncoder
- SimpleImputer

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [6]:
# List the column names available for choosing the target and the features.
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [7]:
# Target: the penguin species. Features: every remaining column.
y = df['species']
X = df.drop(columns=['species'])

In [8]:
# Columns holding continuous measurements.
numeric_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
# Columns holding string labels.
categorical_features = [
    'island',
    'sex',
]

In [9]:
# Numeric columns: fill missing values with the column mean, then apply StandardScaler.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'), StandardScaler()
)

# Categorical columns: fill missing values with the most frequent label, then
# one-hot encode; handle_unknown='ignore' lets unseen labels through at transform time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [10]:
# Route each group of columns through its own transformer pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [11]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Chain the column preprocessing and a default LogisticRegression into one estimator.
# NOTE(review): no cell visible here calls model.fit(...) — confirm it is fitted further on.
model = make_pipeline(preprocessor, LogisticRegression())

In [17]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
31,Dream,37.2,18.1,178.0,3900.0,Male
245,Biscoe,46.1,15.1,215.0,5100.0,Male
277,Biscoe,45.5,15.0,220.0,5000.0,Male
165,Dream,52.0,18.1,201.0,4050.0,Male
250,Biscoe,47.3,15.3,222.0,5250.0,Male
...,...,...,...,...,...,...
188,Dream,47.6,18.3,195.0,3850.0,Female
71,Torgersen,39.7,18.4,190.0,3900.0,Male
106,Biscoe,38.6,17.2,199.0,3750.0,Female
270,Biscoe,46.6,14.2,210.0,4850.0,Female


In [24]:
# Fit the preprocessor on the training features and display the transformed matrix.
# NOTE(review): the name is spelled `x_trasformed`; left unchanged in case other cells use it.
x_trasformed = preprocessor.fit_transform(X_train)
x_trasformed

array([[-1.22489347,  0.51460888, -1.63466016, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.38570856, -1.0044301 ,  0.96802255, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.27712864, -1.05506473,  1.31973643, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.97154034,  0.05889719, -0.15746186, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.47619182, -1.46014179,  0.61630867, ...,  0.        ,
         1.        ,  0.        ],
       [-1.13441021, -0.5487184 , -1.28294628, ...,  0.        ,
         1.        ,  0.        ]])

In [28]:
# Draw a small sample of rows for the one-hot-encoding demo.
# The seed is fixed so a fresh kernel draws the same 15 rows every time;
# without it the encoder outputs change on each run.
a = X.sample(n=15, random_state=42)

In [32]:
# Keep only the rows whose island is not 'Dream', restricted to the two
# categorical columns, in a single selection.
b = a.loc[a['island'] != 'Dream', ['island', 'sex']]

In [33]:
# Narrow the sample to the same two categorical columns (all rows kept).
a = a.loc[:, ['island', 'sex']]

In [43]:
# Check which island labels are present in `b`.
b.island.unique()

array(['Torgersen', 'Biscoe'], dtype=object)

In [41]:
# Fit the encoder on `b` only, then show `b` one-hot encoded as a dense array.
ohe = OneHotEncoder(handle_unknown='ignore').fit(b)
ohe.transform(b).toarray()

array([[0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 0., 1.],
       [0., 1., 1., 0.]])

In [42]:
# Encode `a` with the encoder that was fitted on `b`. Rows of `a` whose island was
# not present when fitting get zeros in every island column, because the encoder
# was created with handle_unknown='ignore'.
ohe.transform(a).toarray()

array([[0., 1., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 1.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 1., 0.],
       [0., 0., 0., 1.]])