In [1]:
import numpy as np
import sklearn.svm as svm
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score




In [2]:
class TSVM(object):
    def __init__(self):
        pass
    
    def initial(self, kernel='linear'):
        self.Cl, self.Cu = 1.5, 0.001
        self.kernel = kernel
        self.clf = svm.LinearSVC(C=1.5)
    
    def load(self, model_path='./TVSM.model'):
        self.clf = joblib.load(model_path)
    
    def train(self, X1, Y1, X2):
        N = len(X1) + len(X2)
        sample_weight = np.ones(N)
        sample_weight[len(X1):] = self.Cu
        
        self.clf.fit(X1, Y1)
        
        Y2 = self.clf.predict(X2)
        Y2 = np.expand_dims(Y2, 1)
        X2_id = np.arange(len(X2))
        X3 = np.vstack([X1, X2])
        Y3 = np.vstack([Y1, Y2])
        
        while self.Cu < self.Cl:
            self.clf.fit(X3, Y3, sample_weight=sample_weight)
            while True:
                Y2_d = self.clf.decision_function(X2)
                Y2 = Y2.reshape(-1)
                epsilon = 1 - Y2 * np.argmax(Y2_d)   # calculate function margin
                edible_set, edible_id = epsilon[Y2 == 0], X2_id[Y2 == 0]
                poison_set, poison_id = epsilon[Y2 > 0], X2_id[Y2 > 0]
                edible_max_id = edible_id[np.argmax(edible_set)]
                poison_max_id = poison_id[np.argmax(poison_set)]
                a, b = epsilon[edible_max_id], epsilon[poison_max_id]
                if a > 0 and b > 0 and a + b > 2.0:
                    Y2[edible_max_id] = Y2[edible_max_id] * -1
                    Y2[poison_max_id] = Y2[poison_max_id] * -1
                    Y2 = np.expand_dims(Y2, 1)
                    Y3 = np.vstack([Y1, Y2])
                    self.clf.fit(X3, Y3, sample_weight=sample_weight)
                else:
                    break
            self.Cu = min(2*self.Cu, self.Cl)
            sample_weight[len(X1):] = self.Cu
            
    def score(self, X, Y):
        '''
        Calculate accuracy of TSVM by X, Y

        Parameters
        ----------
        X: Input data
                np.array, shape:[n, m], n: numbers of samples, m: numbers of features
        Y: labels of X
                np.array, shape:[n, ], n: numbers of samples

        Returns
        -------
        Accuracy of TSVM
                float
        '''
        return self.clf.score(X, Y)

    def predict(self, X):
        '''
        Feed X and predict Y by TSVM

        Parameters
        ----------
        X: Input data
                np.array, shape:[n, m], n: numbers of samples, m: numbers of features

        Returns
        -------
        labels of X
                np.array, shape:[n, ], n: numbers of samples
        '''
        return self.clf.predict(X)

    def save(self, path='./TSVM.model'):
        '''
        Save TSVM to model_path

        Parameters
        ----------
        model_path: model path of TSVM
                        model should be svm in sklearn
        '''
        joblib.dump(self.clf, path)

In [45]:
model = TSVM()
model.initial()

df = pd.read_csv(r"C:\NTU\CZ4041 ML\mushroom data\mushroom-ssl40-10-10tst.csv")
df = np.array(df)
X1 = df[0:58, :-1] #10% Labeled data
Y1 = df[0:58:, -1]
X2 = df[58:580, :-1] #90% Unlabeled data

dframe1 = pd.DataFrame(data=X1[0:,0:],
                      index=[i for i in range(X1.shape[0])],
                      columns=['f'+str(i) for i in range(X1.shape[1])])
X1_nu = dframe1.apply(LabelEncoder().fit_transform)

dframe2 = pd.DataFrame(data=X2[0:,0:],
                      index=[i for i in range(X2.shape[0])],
                      columns=['f'+str(i) for i in range(X2.shape[1])])
X2_nu = dframe2.apply(LabelEncoder().fit_transform)

Y1 = np.expand_dims(Y1, 1)
dframe3 = pd.DataFrame(data=Y1[0:,0:],
                      index=[i for i in range(Y1.shape[0])],
                      columns=['f'+str(i) for i in range(Y1.shape[1])])
Y1_nu = dframe3.apply(LabelEncoder().fit_transform)
#print(Y1_nu)

model.train(X1_nu, Y1_nu, X2_nu)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [46]:
Y_hat = model.predict(X2_nu)
accuracy = model.score(X2_nu, Y_hat)

Y2 = df[58:580:, -1]
Y2_nu = LabelEncoder().fit_transform(Y2)

pred_edi = 0
pred_poi = 0
false_edi = 0
false_poi = 0

Y1_nu = np.vstack([Y1_nu])
Y1_nu = Y1_nu.reshape(-1)

for i in range(len(Y1_nu)):
    if Y1_nu[i] == 0:
        pred_edi += 1
    else:
        pred_poi += 1
        
for i in range(len(Y2_nu)):
    if Y2_nu[i] == 0:
        if Y_hat[i] == 0:
            pred_edi += 1
        else:
            false_edi += 1
    else:
        if Y_hat[i] == 1:
            pred_poi += 1
        else:
            false_poi += 1
            
misclass = false_edi + false_poi
accuracy = (pred_edi + pred_poi)/580 * 100

print("misclassified: " + str(misclass))
print("accurary : " + str(accuracy) + " % ")

misclassified: 215
accurary : 62.93103448275862 % 


In [47]:
from IPython.display import display, HTML
from sklearn.metrics import confusion_matrix

dframe_pa = pd.DataFrame(data=[(pred_edi, false_edi), (false_poi, pred_poi)],
                      index=('actual_edi','actual_poi'),
                      columns=('pred_edi','pred_poi'))
display(dframe_pa)

cm = confusion_matrix(Y2_nu, Y_hat)



Unnamed: 0,pred_edi,pred_poi
actual_edi,339,0
actual_poi,215,26
