In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

# import iris
from sklearn.datasets import load_iris

In [3]:
# read iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["species"] = iris.target

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
df.species.value_counts()

0    50
1    50
2    50
Name: species, dtype: int64

0 = Iris-versicolor
1 = Iris-setosa
2 = Iris-virginica

In [6]:
# dataset splitting
x = df.drop(columns="species")
y = df["species"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

In [8]:
gsp.logreg_params

{'algo__fit_intercept': [True, False],
 'algo__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}

In [9]:
# training

preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling = 'standard', transform='yeo-johnson'), x_train.columns),
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression(multi_class = 'ovr', solver = 'lbfgs', n_jobs = -1, random_state = 42))
])

model = GridSearchCV(pipeline, gsp.logreg_params, cv=3, n_jobs=-1, verbose=1)
model.fit(x_train, y_train)

print(model.best_params_)
print(model.score(x_train, y_train), model.best_score_, model.score(x_test, y_test))

  warn("Transformer has default standardization, so the scaling argument is neglected")


Fitting 3 folds for each of 14 candidates, totalling 42 fits
{'algo__C': 10.0, 'algo__fit_intercept': True}
0.9666666666666667 0.9333333333333332 0.9333333333333333


MULTINOMIAL / softmax

In [11]:
# training

preprocessor = ColumnTransformer([
    ('numeric', num_pipe(), x_train.columns),
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', n_jobs = -1, random_state = 42))
])

model = GridSearchCV(pipeline, gsp.logreg_params, cv=3, n_jobs=-1, verbose=1)
model.fit(x_train, y_train)

print(model.best_params_)
print(model.score(x_train, y_train), model.best_score_, model.score(x_test, y_test))

Fitting 3 folds for each of 14 candidates, totalling 42 fits
{'algo__C': 10.0, 'algo__fit_intercept': True}
0.975 0.9666666666666667 1.0


class_weigth untuk mengatasi data yang imbalance pada logistik regresi parameter

In [12]:
# contoh class-weith
{
    "iris-versicolor": 0.33,
    "iris-setosa" : 0.33,
    "iris-virginica" : 0.34
}

{'iris-versicolor': 0.33, 'iris-setosa': 0.33, 'iris-virginica': 0.34}

contoh pakai poly

In [13]:
# training

preprocessor = ColumnTransformer([
    ('numeric', num_pipe(poly=2), x_train.columns),
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', n_jobs = -1, random_state = 42))
])

model = GridSearchCV(pipeline, gsp.logreg_poly_params, cv=3, n_jobs=-1, verbose=1)
model.fit(x_train, y_train)

print(model.best_params_)
print(model.score(x_train, y_train), model.best_score_, model.score(x_test, y_test))

Fitting 3 folds for each of 84 candidates, totalling 252 fits
{'algo__C': 0.01, 'algo__fit_intercept': False, 'prep__numeric__poly__degree': 2, 'prep__numeric__poly__interaction_only': False}
0.9833333333333333 0.975 0.9666666666666667
