In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import numpy as np

In [9]:
X,y = make_classification(random_state = 42)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

In [12]:
pipeline.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [13]:
pipeline.score(x_test, y_test)

1.0

In [14]:
pipeline2 = make_pipeline(MinMaxScaler(), LogisticRegression())
pipeline2.fit(x_train, y_train)
pipeline2.score(x_test, y_test)

0.9

## Label propagation

In [9]:
from sklearn.semi_supervised import LabelPropagation
from sklearn import datasets
import numpy as np
from sklearn.metrics import confusion_matrix,classification_report

In [10]:
label_prop_model = LabelPropagation()

In [11]:
iris = datasets.load_iris()

In [12]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [13]:
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
labels = np.copy(iris.target)
labels[random_unlabeled_points] = -1
label_prop_model.fit(iris.data, labels)

LabelPropagation()

In [14]:
np.unique(labels, return_counts=True)

(array([-1,  0,  1,  2]), array([51, 31, 35, 33]))

In [15]:
label_prop_model.transduction_ == iris.target

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [16]:
pred = label_prop_model.predict(iris.data[random_unlabeled_points])

In [17]:
iris.target[random_unlabeled_points]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2])

In [18]:
print(classification_report(pred, iris.target[random_unlabeled_points]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.87      1.00      0.93        13
           2       1.00      0.89      0.94        19

    accuracy                           0.96        51
   macro avg       0.96      0.96      0.96        51
weighted avg       0.97      0.96      0.96        51



In [22]:
from sklearn.semi_supervised import LabelSpreading
label_prop_model = LabelSpreading()
iris = datasets.load_iris()
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
labels = np.copy(iris.target)
labels[random_unlabeled_points] = -1
label_prop_model.fit(iris.data, labels)
preds = label_prop_model.predict(iris.data[random_unlabeled_points])
orig_labels = iris.target[random_unlabeled_points]
print(classification_report(preds,orig_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.87      1.00      0.93        13
           2       1.00      0.89      0.94        19

    accuracy                           0.96        51
   macro avg       0.96      0.96      0.96        51
weighted avg       0.97      0.96      0.96        51



### Self-supervised

In [53]:
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
rng = np.random.RandomState(42)
iris = datasets.load_iris()
random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
iris.target[random_unlabeled_points] = -1
svc = SVC(probability=True, gamma="auto")
self_training_model = SelfTrainingClassifier(svc)
self_training_model.fit(iris.data, iris.target)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True))

In [54]:
svc

SVC(gamma='auto', probability=True)

In [56]:
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

rnd = np.random.RandomState(42)
# load the cancer dataset
cancer = datasets.load_breast_cancer()

# Randomly unlabel some records in the dataset
#unlabeled points are marked as -1
random_unlabeled_points = rnd.rand(len(cancer.target)) < 0.6
print(random_unlabeled_points.shape)
labels = np.copy(cancer.target)
labels_orig = np.copy(cancer.target)
labels[random_unlabeled_points] = -1
X=cancer.data
tot_unlabled=labels[labels==-1]
print(f"Total Records in dataset is {len(X)} and unlabeled records is  {len(tot_unlabled)}")

# define model
model = LabelPropagation(kernel='knn',n_neighbors=5, gamma=30, max_iter=2000)
# fit model on training dataset
model.fit(X, labels)
# make predictions
predicted_labels = model.predict(X[random_unlabeled_points])
true_labels = labels_orig[random_unlabeled_points]

#print the classification report and confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=model.classes_)
print("Label propagation model: %d labeled & %d unlabeled points (%d total)" %
      (len(labels[labels!=-1]), len(tot_unlabled) , len(X)))

print(classification_report(true_labels, predicted_labels))
print("Confusion matrix")
print(cm)

(569,)
Total Records in dataset is 569 and unlabeled records is  328
Label propagation model: 241 labeled & 328 unlabeled points (569 total)
              precision    recall  f1-score   support

           0       0.56      0.96      0.70       128
           1       0.95      0.51      0.66       200

    accuracy                           0.69       328
   macro avg       0.75      0.74      0.68       328
weighted avg       0.80      0.69      0.68       328

Confusion matrix
[[123   5]
 [ 98 102]]


  self.label_distributions_ /= normalizer
