In [1]:
from pycaret.classification import *
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups

In [2]:
from pycaret.datasets import get_data
dataset = get_data('juice')

print(dataset.shape)

Unnamed: 0,Id,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,1,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,2,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,3,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,4,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,5,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


(1070, 19)


In [3]:
py = setup(
    data=dataset,
    target='STORE',
    session_id=123,
    experiment_name=None,
    text_features=["Purchase"],
    ignore_features=["Id"],
    keep_features=["SalePriceCH"],
    preprocess=True,
    imputation_type='simple',
    numeric_iterative_imputer='lightgbm',
    categorical_iterative_imputer='lightgbm',
    max_encoding_ohe=5,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.8,
    bin_numeric_features=["PriceMM"],
    remove_outliers=True,
    outliers_method="lof",
    outliers_threshold=0.5,
    normalize=True,
    feature_selection=True,
    feature_selection_method="classic",
    feature_selection_estimator="rf",
    n_features_to_select=7,
    custom_pipeline=[("pca", PCA(n_components=5))],
    system_log=False,
    memory=False,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,STORE
2,Target type,Regression
3,Data shape,"(1070, 19)"
4,Train data shape,"(748, 18)"
5,Test data shape,"(322, 18)"
6,Ordinal features,1
7,Numerical features,15
8,Categorical features,1
9,Date features,0


In [4]:
a = py.pipeline
print(a)
a.fit(py.X, py.y)
X, y = a.transform(py.X, py.y)
print(X)
print(X.shape)

Pipeline(steps=[('text_embedding',
                 TransfomerWrapper(include=['Purchase'], transformer=EmbedTextFeatures())),
                ('ordinal_encoding',
                 TransfomerWrapper(include=['Store7'], transformer=OrdinalEncoder(cols=['Store7'], handle_missing='return_nan', mapping=[{'col': 'Store7', 'mapping': {nan: -1, 'No': 1, 'Yes': 0}}]))),
                ('remove_multicollinearity',
                 TransfomerWrapper(exclu...
                 TransfomerWrapper(transformer=RemoveOutliers(method='lof', threshold=0.5))),
                ('normalize', TransfomerWrapper(transformer=StandardScaler())),
                ('feature_selection',
                 TransfomerWrapper(exclude=['SalePriceCH'], transformer=SelectFromModel(estimator=RandomForestClassifier(), max_features=7, threshold=-inf))),
                ('pca', TransfomerWrapper(transformer=PCA(n_components=5)))])
      Feature 2  Feature 3  Feature 4  Feature 5  Feature 6
0      1.249744  -0.546059  -0.845238

In [5]:
best_model = create_model("lr")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6933,0.9589,0.5904,0.6177,0.6422,0.5972,0.6152
1,0.7733,0.969,0.7062,0.8627,0.7546,0.7017,0.7305
2,0.7733,0.9614,0.6754,0.8513,0.7381,0.7007,0.7212
3,0.76,0.9671,0.6878,0.8188,0.7398,0.6848,0.6969
4,0.7467,0.9425,0.647,0.6984,0.6953,0.6651,0.6925
5,0.7333,0.9703,0.6371,0.7432,0.7139,0.6493,0.6592
6,0.7067,0.9373,0.6142,0.7471,0.6953,0.6161,0.6239
7,0.7067,0.9348,0.6182,0.7994,0.6796,0.6154,0.6373
8,0.7838,0.9536,0.7248,0.8474,0.7497,0.7178,0.7349
9,0.7568,0.9721,0.6741,0.8141,0.7192,0.6813,0.6971


In [6]:
models = compare_models(turbo=False, n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9573,0.991,0.9454,0.969,0.957,0.9442,0.9475,0.038
qda,Quadratic Discriminant Analysis,0.9439,0.9844,0.9276,0.9571,0.9435,0.927,0.9304,0.04
mlp,MLP Classifier,0.8851,0.9794,0.8462,0.9093,0.8765,0.8501,0.8572,0.104
et,Extra Trees Classifier,0.8342,0.9775,0.7687,0.8711,0.8159,0.7826,0.7949,0.053
lightgbm,Light Gradient Boosting Machine,0.8302,0.9669,0.7712,0.8513,0.8176,0.7778,0.7855,0.107
xgboost,Extreme Gradient Boosting,0.8248,0.9577,0.7597,0.8537,0.8077,0.7702,0.7807,0.157
catboost,CatBoost Classifier,0.8115,0.9727,0.7432,0.8177,0.7866,0.753,0.7662,1.469
rf,Random Forest Classifier,0.8115,0.9703,0.7396,0.8427,0.7877,0.7526,0.7651,0.06
dt,Decision Tree Classifier,0.8008,0.876,0.7346,0.8174,0.7871,0.7387,0.7468,0.04
gbc,Gradient Boosting Classifier,0.7994,0.949,0.7308,0.8229,0.7773,0.7372,0.7497,0.081
