In [13]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
penguins = pd.read_csv('penguins.csv')

cat_var = 'species'

X = penguins.drop(columns=[cat_var])
y = penguins[cat_var]

In [4]:
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

In [5]:
num_pipe = Pipeline([
    ('impute', SimpleImputer()),
    ('scale', StandardScaler())
])

In [6]:
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])


In [7]:
preprocess = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])


In [8]:
model = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=100, C=1000))
])
model

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1000
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [9]:
acc_scores = cross_val_score(model, X, y)
print('Penguins accuracy:', acc_scores.mean())

fit = model.fit(X, y)

Penguins accuracy: 0.994160272804774


In [10]:
feature_names = model.named_steps["prep"].get_feature_names_out()
print(feature_names)
print(model['clf'].classes_)
print(model['clf'].coef_)

['num__bill_length_mm' 'num__bill_depth_mm' 'num__flipper_length_mm'
 'num__body_mass_g' 'cat__island_biscoe' 'cat__island_dream'
 'cat__island_torgersen' 'cat__sex_Female' 'cat__sex_Male'
 'cat__sex_unknown']
['Adelie' 'Chinstrap' 'Gentoo']
[[-11.69855574   6.26040551  -1.3043145    1.25038192  -1.12185566
   -2.26041983   4.17720584  -0.86066702   1.99325273  -0.33765537]
 [  9.91846562  -2.9026246   -1.28056746  -4.08891187  -2.82141179
    3.94921743  -1.95174382   0.2795057   -1.41118668   0.3077428 ]
 [  1.78009012  -3.35778091   2.58488196   2.83852995   3.94326745
   -1.6887976   -2.22546202   0.58116132  -0.58206605   0.02991256]]


In [11]:
coefs = pd.Series(model.named_steps['clf'].coef_[0], index=feature_names).sort_values(key=abs, ascending=False)
coefs.head()

num__bill_length_mm     -11.698556
num__bill_depth_mm        6.260406
cat__island_torgersen     4.177206
cat__island_dream        -2.260420
cat__sex_Male             1.993253
dtype: float64

In [16]:
param_grid = {'C':np.logspace(-2, 3, 100)}
grid = Pipeline([
    ('prep', preprocess),
    ('clf', GridSearchCV(LogisticRegression(max_iter=100), param_grid=param_grid, cv=5, scoring='accuracy', refit=True))
])

grid.fit(X, y)
print(grid.named_steps['clf'].best_params_, grid.named_steps['clf'].best_score_)
best = grid.named_steps['clf'].best_estimator_


{'C': np.float64(12.045035402587823)} 0.9971014492753623
