In [1]:
from pycaret.classification import *
from pycaret.datasets import get_data
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA

In [2]:
# Load the sample dataset registered under the name 'juice'. `get_data` is
# imported in the notebook's top import cell so that every dependency is
# declared in one place and a fresh "Restart & Run All" sees them up front.
# The cell output shows that the call also renders a preview of the first rows.
dataset = get_data('juice')

# Print the (rows, columns) shape of the loaded frame as a quick sanity check.
print(dataset.shape)

Unnamed: 0,Id,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,1,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,2,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,3,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,4,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,5,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


(1070, 19)


In [3]:
# Configure the classification experiment on `dataset`. `setup` is presumably
# the function brought in by the star import of pycaret.classification; its
# return value is kept as `py` and is queried for `.pipeline`, `.X` and `.y`
# in the following cell. The arguments are grouped by purpose below; keyword
# order does not affect the call.
#
# NOTE(review): the target is `STORE`, while `StoreID` and `Store7` remain in
#   the feature set. The preview rows suggest these columns are closely
#   related, which may leak the target -- confirm this is intended.
# NOTE(review): `Purchase` holds short codes ('CH' / 'MM') in the preview but
#   is declared as a text feature -- confirm this is deliberate.
# NOTE(review): per the PyCaret docs the two *_iterative_imputer options are
#   only consulted when imputation_type='iterative'; with 'simple' they look
#   like dead configuration -- confirm.
py = setup(
    # --- data, target and column roles ---
    data=dataset,
    target="STORE",
    ignore_features=["Id"],
    keep_features=["SalePriceCH"],
    text_features=["Purchase"],
    # --- experiment bookkeeping ---
    session_id=123,  # fixed seed so reruns are comparable
    experiment_name=None,
    system_log=False,
    memory=False,
    # --- imputation, encoding, scaling ---
    preprocess=True,
    imputation_type="simple",
    numeric_iterative_imputer="lightgbm",
    categorical_iterative_imputer="lightgbm",
    max_encoding_ohe=5,
    bin_numeric_features=["PriceMM"],
    normalize=True,
    # --- pruning of correlated columns and outlying rows ---
    remove_multicollinearity=True,
    multicollinearity_threshold=0.8,
    remove_outliers=True,
    outliers_method="lof",
    outliers_threshold=0.5,
    # --- feature selection ---
    feature_selection=True,
    feature_selection_method="classic",
    feature_selection_estimator="rf",
    n_features_to_select=7,
    # --- user-supplied extra step (appears last in the printed pipeline) ---
    custom_pipeline=[("pca", PCA(n_components=5))],
)

<pandas.io.formats.style.Styler object at 0x000001CC67519910>


In [4]:
# Take the preprocessing pipeline exposed by the experiment and print its
# step-by-step repr.
a = py.pipeline
print(a)

# Fit the pipeline on the experiment's features/target, then run the same
# data through it. `transform` hands back a pair, unpacked into X and y.
# NOTE(review): if `py.pipeline` returns the experiment's own pipeline object
#   rather than a copy, this fit overwrites the state left by setup() --
#   confirm that later cells are meant to see the refit pipeline.
a.fit(py.X, py.y)
X, y = a.transform(py.X, py.y)

# Show the transformed features followed by their shape, one per line.
print(X, X.shape, sep="\n")

Pipeline(steps=[('text_embedding',
                 TransfomerWrapper(include=['Purchase'], transformer=EmbedTextFeatures())),
                ('ordinal_encoding',
                 TransfomerWrapper(include=['Store7'], transformer=OrdinalEncoder(cols=['Store7'], handle_missing='return_nan', mapping=[{'col': 'Store7', 'mapping': {nan: -1, 'No': 1, 'Yes': 0}}]))),
                ('remove_multicollinearity',
                 TransfomerWrapper(exclu...
                 TransfomerWrapper(transformer=RemoveOutliers(method='lof', threshold=0.5))),
                ('normalize', TransfomerWrapper(transformer=StandardScaler())),
                ('feature_selection',
                 TransfomerWrapper(exclude=['SalePriceCH'], transformer=SelectFromModel(estimator=RandomForestClassifier(), max_features=7, threshold=-inf))),
                ('pca', TransfomerWrapper(transformer=PCA(n_components=5)))])
      Feature 1  Feature 2  Feature 3  Feature 4  Feature 5
0      1.249744  -0.546059  -0.845238

In [5]:
# Train one model selected by the ID 'lr' (logistic regression, per the
# PyCaret model IDs); the cell output lists its metrics for folds 0-9.
# NOTE(review): despite the name, this is not the top performer -- the
#   comparison table in the next cell ranks other estimators well above these
#   fold scores. The name is kept in case later cells reference it.
best_model = create_model(estimator="lr")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7733,0.976,0.6867,0.7145,0.7176,0.7036,0.7304
1,0.68,0.9415,0.6,0.6287,0.6412,0.5821,0.6055
2,0.8267,0.9883,0.7545,0.8813,0.8014,0.7736,0.7903
3,0.7867,0.9733,0.7057,0.7057,0.7304,0.7211,0.7443
4,0.64,0.9215,0.5388,0.7822,0.6057,0.5285,0.5525
5,0.7333,0.9598,0.642,0.795,0.6928,0.6513,0.6699
6,0.8133,0.9902,0.7506,0.8669,0.7754,0.7567,0.7737
7,0.72,0.9557,0.6344,0.789,0.6866,0.6342,0.6512
8,0.6892,0.9704,0.5869,0.7401,0.6487,0.5937,0.6115
9,0.8108,0.9726,0.7495,0.8553,0.8088,0.7526,0.7613


In [6]:
# Run the model comparison and keep the result in `models`. Per the PyCaret
# docs, n_select=5 requests the five top-ranked models and turbo=False also
# includes the slower-to-train estimators in the comparison.
models = compare_models(n_select=5, turbo=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9946,0.9999,0.9925,0.9951,0.9946,0.9931,0.9932,0.076
qda,Quadratic Discriminant Analysis,0.9558,0.9878,0.9412,0.9683,0.952,0.9427,0.9465,0.066
mlp,MLP Classifier,0.8916,0.9872,0.8569,0.9149,0.8866,0.8592,0.8658,0.126
et,Extra Trees Classifier,0.8408,0.9842,0.7789,0.8857,0.8186,0.7923,0.8064,0.077
catboost,CatBoost Classifier,0.8315,0.9793,0.7685,0.8669,0.8118,0.78,0.7921,1.501
xgboost,Extreme Gradient Boosting,0.8235,0.9643,0.7669,0.8408,0.8034,0.7703,0.7812,0.187
rf,Random Forest Classifier,0.8143,0.977,0.7482,0.8339,0.7856,0.7575,0.7733,0.079
gbc,Gradient Boosting Classifier,0.813,0.9554,0.7521,0.8445,0.7938,0.7563,0.7686,0.126
lightgbm,Light Gradient Boosting Machine,0.8116,0.9703,0.7518,0.8338,0.7879,0.7549,0.7675,0.135
dt,Decision Tree Classifier,0.7741,0.8606,0.7197,0.7868,0.7584,0.7071,0.7163,0.049
