In [11]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

from tensorflow.keras.datasets import fashion_mnist
from sklearn.decomposition import PCA

In [12]:
(x_train_pics, y_train), (x_test_pics, y_test) = fashion_mnist.load_data()
x_train = x_train_pics.reshape(x_train_pics.shape[0], -1)
x_test = x_test_pics.reshape(x_test_pics.shape[0], -1)

# (x_train_pics, y_train), (x_test_pics, y_test) = fashion_mnist.load_data()
# y_train = y_train[:100]
# y_test = y_test[:100]
# x_train = x_train_pics.reshape(x_train_pics.shape[0], -1) [:100]
# x_test = x_test_pics.reshape(x_test_pics.shape[0], -1) [:100]
# x_train.shape

(100, 784)

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin


class RFFPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, n_features=1000, new_dim=50, use_PCA=True, classifier='logreg',
                kernel='linear'):
        """        
        Implements pipeline, which consists of PCA decomposition,
        Random Fourier Features approximation and linear classification model.
        
        n_features, int: amount of synthetic random features generated with RFF approximation.

        new_dim, int: PCA output size.
        
        use_PCA, bool: whether to include PCA preprocessing.
        
        classifier, string: either 'svm' or 'logreg', a linear classification model to use on top of pipeline.
        
        Feel free to edit this template for your preferences.    
        """
        self.n_features = n_features
        self.use_PCA = use_PCA
        self.new_dim = new_dim
        self.classifier = classifier
        self.kernel = kernel
        
        self.sigma = np.nan
        self.w = np.nan
        self.b = np.nan
        self.pca = PCA(n_components=self.new_dim)
        
        if classifier=='logreg':
            self.estimator = LogisticRegression()
        elif classifier=='svm':
            self.estimator = SVC(decision_function_shape='ovr', kernel=self.kernel) 
        
    def compute_sigma(self, x, n_pairs):
        x_i = x[np.random.randint(low=0, high=x.shape[0], size=n_pairs, dtype=int), :]
        x_j = x[np.random.randint(low=0, high=x.shape[0], size=n_pairs, dtype=int), :]
        return np.median(np.sum((x_i - x_j)**2, axis=1))
        
    def fit(self, x, y, n_pairs=100000):
        if self.use_PCA == True:
            x = self.pca.fit_transform(x)
        self.sigma = self.compute_sigma(x, n_pairs)
        self.w = np.random.normal(loc=0, scale=1/np.sqrt(self.sigma), \
                             size=(self.n_features, x.shape[1]))
        self.b = np.random.uniform(low=-np.pi, high=np.pi, size=self.n_features)
        new_obj = np.cos(np.transpose(self.w @ np.transpose(x)) + self.b)
        return self.estimator.fit(new_obj, y)
        """
        Fit all parts of algorithm (PCA, RFF, Classification) to training set.
        """
        # Your code here: (￣▽￣)/♫•*¨*•.¸¸♪
#         raise NotImplementedError

    def predict_proba(self, x):
        """
        Apply pipeline to obtain scores for input data.
        """
        # Your code here: (￣▽￣)/♫•*¨*•.¸¸♪
#         raise NotImplementedError
        if self.use_PCA == True:
            x = self.pca.transform(x)
        new_obj = np.cos(np.transpose(self.w @ np.transpose(x)) + self.b)
        return self.estimator.predict_proba(new_obj)
        
    def predict(self, x):
        """
        Apply pipeline to obtain discrete predictions for input data.
        """
#             self.estimator.fit(X_train_pca) 
        # Your code here: (￣▽￣)/♫•*¨*•.¸¸♪
#         raise NotImplementedError
        if self.use_PCA == True:
            x = self.pca.transform(x)
        new_obj = np.cos(np.transpose(self.w @ np.transpose(x)) + self.b)
        return self.estimator.predict(new_obj)

In [17]:
%%time 

for new_dim in [30, 50, 100, 200]:
    for n_features in [500, 1000, 5000, 10000]:
        pipeline_custom = RFFPipeline(use_PCA=True, new_dim=new_dim, n_features=n_features, \
                                  classifier='svm', kernel='rbf')
        pipeline_custom.fit(x_train, y_train)
        prediction = pipeline_custom.predict(x_test)
        print('accuracy_score for new_dim=%d and n_features=%d is %d' \
              % (new_dim, n_features, accuracy_score(y_test, prediction)))
        print('f1_score for new_dim=%d and n_features=%d is %d' \
              % (new_dim, n_features, f1_score(y_test, prediction, average='macro')))

accuracy_score for new_dim=30 and n_features=500 is 0
f1_score for new_dim=30 and n_features=500 is 0
accuracy_score for new_dim=30 and n_features=1000 is 0
f1_score for new_dim=30 and n_features=1000 is 0
accuracy_score for new_dim=30 and n_features=5000 is 0
f1_score for new_dim=30 and n_features=5000 is 0
accuracy_score for new_dim=30 and n_features=10000 is 0
f1_score for new_dim=30 and n_features=10000 is 0
accuracy_score for new_dim=50 and n_features=500 is 0
f1_score for new_dim=50 and n_features=500 is 0
accuracy_score for new_dim=50 and n_features=1000 is 0
f1_score for new_dim=50 and n_features=1000 is 0
accuracy_score for new_dim=50 and n_features=5000 is 0
f1_score for new_dim=50 and n_features=5000 is 0
accuracy_score for new_dim=50 and n_features=10000 is 0
f1_score for new_dim=50 and n_features=10000 is 0
accuracy_score for new_dim=100 and n_features=500 is 0
f1_score for new_dim=100 and n_features=500 is 0
accuracy_score for new_dim=100 and n_features=1000 is 0
f1_score

ValueError: n_components=200 must be between 0 and min(n_samples, n_features)=100 with svd_solver='full'

In [18]:
%%time 

for new_dim in [30, 50, 100, 200]:
    for n_features in [500, 1000, 5000, 10000]:

        pipeline_custom = RFFPipeline(use_PCA=True, new_dim=new_dim, n_features=n_features, \
                                      classifier='logreg', kernel='rbf')
        pipeline_custom.fit(x_train, y_train)
        prediction = pipeline_custom.predict(x_test)
        print('accuracy_score for new_dim=%d and n_features=%d is %d' \
              % (new_dim, n_features, accuracy_score(y_test, prediction)))
        print('f1_score for new_dim=%d and n_features=%d is %d' \
              % (new_dim, n_features, f1_score(y_test, prediction, average='macro')))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score for new_dim=30 and n_features=500 is 0
f1_score for new_dim=30 and n_features=500 is 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score for new_dim=30 and n_features=1000 is 0
f1_score for new_dim=30 and n_features=1000 is 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score for new_dim=30 and n_features=5000 is 0
f1_score for new_dim=30 and n_features=5000 is 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score for new_dim=30 and n_features=10000 is 0
f1_score for new_dim=30 and n_features=10000 is 0
accuracy_score for new_dim=50 and n_features=500 is 0
f1_score for new_dim=50 and n_features=500 is 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score for new_dim=50 and n_features=1000 is 0
f1_score for new_dim=50 and n_features=1000 is 0


KeyboardInterrupt: 

In [24]:
%%time 

pca = PCA(n_components=100)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)
for kernel in ('linear', 'poly', 'rbf' ,'sigmoid'):
    svc = SVC(C=1.0, kernel=kernel, degree=4, gamma='scale', \
        coef0=0.0, decision_function_shape='ovr', verbose=5)
    svc.fit(x_train_pca, y_train)
    prediction = svc.predict(x_test_pca)
    print('accuracy_score for %s is %d' % (kernel, accuracy_score(y_test, prediction)))
    print('f1_score for %s is %d' % (kernel, f1_score(y_test, prediction, average='macro')))

[LibSVM]accuracy_score for linear is 0
f1_score for linear is 0
[LibSVM]accuracy_score for poly is 0
f1_score for poly is 0
[LibSVM]accuracy_score for rbf is 0
f1_score for rbf is 0
[LibSVM]accuracy_score for sigmoid is 0
f1_score for sigmoid is 0
CPU times: user 92.1 ms, sys: 7.48 ms, total: 99.5 ms
Wall time: 51.7 ms
