## Prepare train_data and test_data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from hypergbm import make_experiment
from hypernets.tabular.metrics import metric_to_scoring

In [2]:
# Read the pre-split Magic dataset (gzip-compressed CSV files).
train_data = pd.read_csv('datasets/Magic/train.csv.gz')
test_data = pd.read_csv('datasets/Magic/test.csv.gz')

# Work on copies so the raw frames keep their 'Class' column, then pop
# 'Class' out of each copy: X_* keep the remaining columns, y_* hold 'Class'.
X_train, X_test = train_data.copy(), test_data.copy()
y_train, y_test = X_train.pop('Class'), X_test.pop('Class')

## Use Pseudo Labeling
Pseudo labeling is a semi-supervised learning technique: instead of manually labeling the unlabelled data, we assign approximate labels to it on the basis of the labelled data. Pseudo labeling can sometimes improve the generalization capabilities of the model.

**Options:**

* pseudo_labeling : bool, (default=False)
    Whether to enable pseudo labeling. Pseudo labeling is a semi-supervised learning technique: instead of manually
    labeling the unlabelled data, we assign approximate labels to it on the basis of the labelled data. Pseudo labeling
    can sometimes improve the generalization capabilities of the model.
* pseudo_labeling_strategy : str, (default='threshold')
    Strategy used to select pseudo-labeled samples (*threshold*, *number* or *quantile*).
* pseudo_labeling_proba_threshold : float, (default=0.8)
    Confidence threshold of pseudo-label samples. Only valid when *pseudo_labeling_strategy* is 'threshold'.
* pseudo_labeling_proba_quantile:
    Confidence quantile of pseudo-label samples. Only valid when *pseudo_labeling_strategy* is 'quantile'.
* pseudo_labeling_sample_number:
    Expected number to sample per class. Only valid when *pseudo_labeling_strategy* is 'number'.
* pseudo_labeling_resplit : bool, (default=False)
    Whether to re-split the training set and evaluation set after adding pseudo-labeled data. If False, the
    pseudo-labeled data is only appended to the training set. Only valid when *pseudo_labeling* is True.

In [3]:
# Build an experiment on the labelled training frame with pseudo labeling on.
# X_test has had 'Class' popped, so test_data carries features only --
# presumably this is the unlabelled data that receives pseudo labels
# (confirm against the make_experiment documentation).
experiment = make_experiment(
    train_data.copy(),
    target='Class',
    test_data=X_test.copy(),
    pseudo_labeling=True,
    max_trials=10,
    # NOTE(review): 0 presumably disables early stopping so that all
    # max_trials are run -- confirm against the make_experiment documentation.
    early_stopping_rounds=0,
    random_state=8888,
)

# Run the experiment; the returned object is used as the estimator for scoring.
estimator = experiment.run()


ExperimentProcessWidget(initData='{"steps": [{"index": 0, "name": "data_clean", "type": "DataCleanStep", "stat…

In [4]:
# Turn the 'accuracy' metric name into a scorer callable that is invoked
# as scorer(estimator, X, y).
scorer = metric_to_scoring('accuracy')
# Score the trained estimator on the test features and their 'Class' labels.
# NOTE: X_test (features only, no labels) was also passed to make_experiment
# as test_data, so these rows were not entirely unseen by the pipeline.
score = scorer(estimator, X_test, y_test)
# Bare last expression so the notebook displays the score as the cell output.
score

0.7394847528916929