In [1]:
from sklearn.naive_bayes import GaussianNB

import self_train
import co_training
import util
import co_training_extension
import importlib
# Reload the project-local modules so that edits made to their source files
# after the kernel first imported them take effect when this cell is re-run.
for _module in (self_train, co_training, util, co_training_extension):
    importlib.reload(_module)

from sklearn import naive_bayes
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn import datasets as ds
import numpy as np
from sklearn.semi_supervised import SelfTrainingClassifier

In [2]:
# load datasets
# `datasets` maps a dataset name to a two-element list: [features, labels].
temp = ds.load_breast_cancer()
datasets = {"cancer": [np.asarray(temp.data), np.asarray(temp.target)]}

# turning a multi-class classification problem into a binary classification problem:
# every wine sample whose label is 2 is dropped, features and labels alike.
temp = ds.load_wine()
target = np.asarray(temp.target)
mask = target != 2  # boolean row selector, True for the samples that are kept
data, target = np.asarray(temp.data)[mask], target[mask]
datasets["wine"] = [data, target]

# load models
# Base scikit-learn estimators that the semi-supervised wrappers below are built on.
# NOTE(review): dtr is created without a random_state while rfc is seeded with 42 —
# confirm whether the decision tree should be seeded too for repeatable runs.
gnb = naive_bayes.GaussianNB()
knn = KNeighborsClassifier(n_neighbors=3)
dtr = tree.DecisionTreeClassifier(max_depth=5)
rfc = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features=1,
    random_state=42,
)

# load classifiers
# NOTE(review): the same rfc instance is handed to both SelfTrain and CoTrainView;
# whether those wrappers clone their estimator is not visible here — verify.
cotrain = co_training.CoTrain([gnb, knn, dtr])
selftrain = self_train.SelfTrain(rfc, criterion='threshold')
coview = co_training_extension.CoTrainView(rfc)

# Display name -> classifier, in the order the experiments are reported.
classifiers = {
    "Self-Training": selftrain,
    "Co-Training": cotrain,
    "Co-Training Multi-View": coview,
}

In [None]:
# run experiments
# For every (classifier, dataset, unlabeled fraction) combination, call
# util.performAnalysis until numTrials calls have succeeded and print the mean of
# the four statistics it returns (presumably in the order of the header printed
# below: accuracy before/after, precision before/after).
percent_values = [.20, .40, .60, .80, .90, .95]  # percent of the data to be unlabeled
numTrials = 5  # number of successful trials to average (more trials for smoother data but longer processing time)
maxAttempts = 10 * numTrials  # cap on calls per setting so persistent failures cannot loop forever
for clf_name, clf in classifiers.items():
    print("\nfor classifier: ", clf_name)
    for dataset_name, dataset in datasets.items():
        print("\ndataset: ", dataset_name)
        data = dataset[0]
        target = dataset[1]
        #  this dataset is too big for the storage complexity of co-training multi-view
        if clf_name == "Co-Training Multi-View" and dataset_name == "credit card fraud":
            continue

        print("%unlabeled\tacc before\tacc after\tprecision before\tprecision after")
        for percent in percent_values:
            total_stats = np.zeros(4)
            completed = 0  # trials that produced stats
            attempts = 0   # trials started, including failed ones
            # A while loop is required here: bumping a counter inside
            # `for ... in range(numTrials)` never adds iterations because the
            # range is fixed when the loop starts.
            while completed < numTrials and attempts < maxAttempts:
                attempts += 1
                try:
                    stats = util.performAnalysis(clf, data, target, percent, output=False)
                    total_stats = np.add(total_stats, stats)
                    completed += 1
                except ValueError:
                    # occasionally, not enough datapoints are given labels. In this case
                    # nothing is added to total_stats and the trial is simply retried.
                    continue
            if completed == 0:
                # Nothing to average; report it instead of printing misleading zeros.
                print(f'{percent:.2f}\tno successful trials in {attempts} attempts')
                continue
            # average over the trials that actually contributed to the sum
            total_stats = total_stats / completed
            to_print = f'{percent:.2f}\t{(total_stats[0]):.4f}\t{(total_stats[1]):.4f}\t{(total_stats[2]):.4f}\t{(total_stats[3]):.4f}'
            print(to_print)


for classifier:  Self-Training

dataset:  cancer
%unlabeled	acc before	acc after	precision before	precision after
0.20	0.9503	0.9525	0.9711	0.9725
0.40	0.9484	0.9469	0.9727	0.9707
0.60	0.9439	0.9401	0.9685	0.9618
0.80	0.9391	0.9387	0.9542	0.9622
0.90	0.9317	0.9343	0.9519	0.9613
0.95	0.9135	0.9141	0.9217	0.9275

dataset:  wine
%unlabeled	acc before	acc after	precision before	precision after
0.20	0.9620	0.9488	0.9585	0.9589
0.40	0.9643	0.9630	0.9697	0.9689
0.60	0.9485	0.9459	0.9385	0.9489
0.80	0.9090	0.9093	0.8844	0.8908
0.90	0.8530	0.8317	0.8374	0.8541
0.95	0.7431	0.6960	0.6889	0.6724

for classifier:  Co-Training

dataset:  cancer
%unlabeled	acc before	acc after	precision before	precision after
0.20	0.9541	0.9559	0.9801	0.9777
0.40	0.9514	0.9499	0.9733	0.9729
0.60	0.9475	0.9449	0.9694	0.9703
0.80	0.9407	0.9386	0.9710	0.9727
0.90	0.9354	0.9357	0.9662	0.9694
0.95	0.9175	0.9207	0.9552	0.9592

dataset:  wine
%unlabeled	acc before	acc after	precision before	precision after
0.20	0.9735	0.97