In [1]:
from pycaret.classification import *
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load pycaret's bundled "juice" sample dataset. Judging by the cell output,
# get_data also renders a preview of the first rows on its own.
from pycaret.datasets import get_data

dataset = get_data(dataset="juice")

# Quick sanity check on the size of the loaded frame (rows, columns).
print(dataset.shape)

Unnamed: 0,Id,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,1,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,2,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,3,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,4,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,5,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


(1070, 19)


In [3]:
# Build the experiment with (nearly) every preprocessing stage switched on,
# so the resulting pipeline can be inspected step by step in the next cell.
py = setup(
    # --- data, target and bookkeeping ---
    data=dataset,
    target="STORE",
    session_id=123,  # fixed seed so repeated runs are comparable
    experiment_name=None,
    system_log=False,
    memory=False,
    # --- column roles ---
    text_features=["Purchase"],
    ignore_features=["Id"],
    # Shows up as `exclude=['SalePriceCH']` on the feature-selection step
    # in the printed pipeline, i.e. the column is shielded from removal.
    keep_features=["SalePriceCH"],
    # --- imputation / encoding ---
    preprocess=True,
    imputation_type="simple",
    # NOTE(review): the two iterative-imputer estimators below are presumably
    # only consulted when imputation_type="iterative" - confirm against the
    # pycaret docs.
    numeric_iterative_imputer="lightgbm",
    categorical_iterative_imputer="lightgbm",
    max_encoding_ohe=5,
    # --- feature engineering / cleaning ---
    remove_multicollinearity=True,
    multicollinearity_threshold=0.8,
    bin_numeric_features=["PriceMM"],
    remove_outliers=True,
    outliers_method="lof",
    outliers_threshold=0.5,
    normalize=True,
    # --- feature selection (random-forest driven, capped at 7 features) ---
    feature_selection=True,
    feature_selection_method="classic",
    feature_selection_estimator="rf",
    n_features_to_select=7,
    # --- user-supplied step; appears as the last pipeline step ('pca') ---
    custom_pipeline=[("pca", PCA(n_components=5))],
)

<pandas.io.formats.style.Styler object at 0x000002192A67D2B0>


In [4]:
# Inspect the preprocessing pipeline that setup() assembled, then run it
# end-to-end on the experiment's own data to see what the models receive.
a = py.pipeline
print(a)
# Fit and transform are issued as two separate calls on the same X/y.
# NOTE(review): kept separate on purpose - not verified here that a single
# fit_transform would behave identically for the row-dropping steps.
a.fit(py.X, py.y)
# transform is given both X and y and returns both; presumably because steps
# such as outlier removal can change the number of rows - TODO confirm.
X, y = a.transform(py.X, py.y)
print(X)
print(X.shape)

Pipeline(steps=[('text_embedding',
                 TransfomerWrapper(include=['Purchase'], transformer=EmbedTextFeatures())),
                ('ordinal_encoding',
                 TransfomerWrapper(include=['Store7'], transformer=OrdinalEncoder(cols=['Store7'], handle_missing='return_nan', mapping=[{'col': 'Store7', 'mapping': {nan: -1, 'No': 1, 'Yes': 0}}]))),
                ('remove_multicollinearity',
                 TransfomerWrapper(exclu...
                 TransfomerWrapper(transformer=RemoveOutliers(method='lof', threshold=0.5))),
                ('normalize', TransfomerWrapper(transformer=StandardScaler())),
                ('feature_selection',
                 TransfomerWrapper(exclude=['SalePriceCH'], transformer=SelectFromModel(estimator=RandomForestClassifier(), max_features=7, threshold=-inf))),
                ('pca', TransfomerWrapper(transformer=PCA(n_components=5)))])
      Feature 2  Feature 3  Feature 4  Feature 5  Feature 6
0      1.249744  -0.546059  -0.845238

In [5]:
# Train and cross-validate a single "lr" model on the prepared data; the
# per-fold metrics are shown as the cell output. Despite the variable name,
# no model selection happens here - that is done by compare_models below.
best_model = create_model(estimator="lr")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6533,0.9292,0.5368,0.6017,0.6011,0.5429,0.579
1,0.7733,0.969,0.7062,0.8627,0.7546,0.7017,0.7305
2,0.7733,0.9614,0.6754,0.8513,0.7381,0.7007,0.7212
3,0.76,0.9671,0.6878,0.8188,0.7398,0.6848,0.6969
4,0.72,0.9405,0.6273,0.6531,0.6742,0.6312,0.6525
5,0.7333,0.9703,0.6371,0.7432,0.7139,0.6493,0.6592
6,0.7067,0.9373,0.6142,0.7471,0.6953,0.6161,0.6239
7,0.7067,0.9348,0.6182,0.7994,0.6796,0.6154,0.6373
8,0.7838,0.9536,0.7248,0.8474,0.7497,0.7178,0.7349
9,0.7568,0.9721,0.6741,0.8141,0.7192,0.6813,0.6971


In [6]:
# Benchmark the available estimators and keep the five best.
# turbo=False: per the pycaret docs this also includes the slower estimators
# that the default turbo mode skips.
models = compare_models(n_select=5, turbo=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.9439,0.9844,0.9276,0.9571,0.9435,0.927,0.9304,0.043
lda,Linear Discriminant Analysis,0.92,0.9865,0.9009,0.9276,0.9156,0.8954,0.8996,0.043
mlp,MLP Classifier,0.8851,0.9781,0.8441,0.9096,0.8769,0.8499,0.857,0.114
lightgbm,Light Gradient Boosting Machine,0.8395,0.9669,0.7808,0.8575,0.8276,0.7898,0.7971,0.086
et,Extra Trees Classifier,0.8262,0.978,0.7571,0.8647,0.8048,0.772,0.7856,0.057
xgboost,Extreme Gradient Boosting,0.8248,0.9589,0.7608,0.8482,0.8075,0.7704,0.7802,0.144
catboost,CatBoost Classifier,0.8128,0.9706,0.7486,0.8331,0.7904,0.7552,0.7677,1.302
rf,Random Forest Classifier,0.8102,0.9706,0.7374,0.8429,0.7854,0.7509,0.7642,0.059
dt,Decision Tree Classifier,0.8088,0.8812,0.7435,0.825,0.7943,0.7494,0.7572,0.043
gbc,Gradient Boosting Classifier,0.7981,0.9494,0.7299,0.8149,0.7762,0.7357,0.7479,0.081
