In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

from tensorflow.keras.datasets import fashion_mnist
from sklearn.decomposition import PCA

In [6]:
# Load the Fashion-MNIST splits, then flatten each image so that every
# sample becomes a single row of features (first axis is kept as-is).
(x_train_pics, y_train), (x_test_pics, y_test) = fashion_mnist.load_data()
x_train, x_test = (
    pics.reshape(len(pics), -1) for pics in (x_train_pics, x_test_pics)
)

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin


class RFFPipeline(BaseEstimator, TransformerMixin):
    """Optional PCA, then a random cosine feature map, then a classifier.

    Parameters
    ----------
    n_features : int
        Number of synthetic random features generated by the cosine map.
    new_dim : int
        PCA output size (``n_components`` of the PCA step).
    use_PCA : bool
        Whether to run the PCA step before the random feature map.
    classifier : {'logreg', 'svm'}
        Model fitted on top of the random features.
    kernel : str
        Kernel passed to ``SVC`` when ``classifier='svm'``; not used for
        ``'logreg'``.
    random_state : None, int or numpy.random.RandomState
        Source of randomness for the pair sampling, the random projection
        and the PCA step. ``None`` (default) keeps using NumPy's global
        generator, so ``np.random.seed`` still controls the draws.

    Attributes
    ----------
    sigma : float
        Median *squared* distance between randomly paired samples; ``fit``
        takes its square root when scaling the projection.
    w : ndarray of shape (n_features, input_dim)
        Random projection matrix drawn in ``fit``.
    b : ndarray of shape (n_features,)
        Random phase shifts drawn in ``fit``.
    pca, estimator
        The underlying PCA object and classifier.
    """

    def __init__(self, n_features=1000, new_dim=50, use_PCA=True, classifier='logreg',
                 kernel='linear', random_state=None):
        self.n_features = n_features
        self.use_PCA = use_PCA
        self.new_dim = new_dim
        self.classifier = classifier
        self.kernel = kernel
        self.random_state = random_state

        # Placeholders; the real values are drawn in fit().
        self.sigma = np.nan
        self.w = np.nan
        self.b = np.nan
        self.pca = PCA(n_components=self.new_dim, random_state=self.random_state)

        if classifier == 'logreg':
            self.estimator = LogisticRegression()
        elif classifier == 'svm':
            self.estimator = SVC(decision_function_shape='ovr', kernel=self.kernel)
        else:
            # Fail early instead of leaving `self.estimator` undefined.
            raise ValueError(
                "classifier must be 'logreg' or 'svm', got {!r}".format(classifier)
            )

    def _rng(self):
        """Return the object providing randint/normal/uniform for this fit."""
        if self.random_state is None:
            # The module-level functions share NumPy's global generator.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def compute_sigma(self, x, n_pairs, rng=None):
        """Median squared Euclidean distance over ``n_pairs`` random sample pairs.

        Parameters
        ----------
        x : array-like of shape (n_samples, dim)
        n_pairs : int
            Number of (i, j) index pairs drawn with replacement.
        rng : optional
            Object exposing ``randint``; defaults to NumPy's global generator.

        Returns
        -------
        float
            The median of the squared distances (not its square root).
        """
        if rng is None:
            rng = np.random
        x = np.asarray(x)
        n_samples = x.shape[0]
        first = rng.randint(low=0, high=n_samples, size=n_pairs, dtype=int)
        second = rng.randint(low=0, high=n_samples, size=n_pairs, dtype=int)
        # Work in float64: subtracting/squaring integer inputs (e.g. uint8
        # pixels when PCA is disabled) would silently wrap around.
        diff = x[first, :].astype(np.float64, copy=False)
        # Fancy indexing above already produced a copy, so in-place is safe.
        diff -= x[second, :]
        np.square(diff, out=diff)
        return np.median(diff.sum(axis=1))

    def _random_features(self, x):
        """Apply cos(x @ w.T + b) to already PCA-reduced input."""
        return np.cos(np.asarray(x) @ self.w.T + self.b)

    def transform(self, x):
        """Map raw input to the random cosine features used by the classifier.

        Applies the fitted PCA (when ``use_PCA`` is set) and then the random
        projection drawn in ``fit``.

        Raises
        ------
        ValueError
            If the projection has not been drawn yet, i.e. ``fit`` was not called.
        """
        if self.use_PCA:
            x = self.pca.transform(x)
        if np.ndim(self.w) != 2:
            raise ValueError(
                "RFFPipeline is not fitted yet; call fit() before transform/predict."
            )
        return self._random_features(x)

    def fit(self, x, y, n_pairs=100000):
        """Fit all parts of the pipeline (PCA, random features, classifier).

        Parameters
        ----------
        x : array-like of shape (n_samples, dim)
        y : array-like of shape (n_samples,)
        n_pairs : int
            Number of random sample pairs used to estimate ``sigma``.

        Returns
        -------
        self
            The fitted pipeline, so calls can be chained.

        Raises
        ------
        ValueError
            If the estimated ``sigma`` is not strictly positive (for example
            when all sampled pairs are identical points).
        """
        rng = self._rng()
        if self.use_PCA:
            x = self.pca.fit_transform(x)
        else:
            x = np.asarray(x)

        self.sigma = self.compute_sigma(x, n_pairs, rng=rng)
        if not self.sigma > 0:
            raise ValueError(
                "Median squared pairwise distance must be positive, got {!r}".format(self.sigma)
            )

        # `sigma` is a squared distance, hence the square root in the scale.
        self.w = rng.normal(loc=0, scale=1 / np.sqrt(self.sigma),
                            size=(self.n_features, x.shape[1]))
        self.b = rng.uniform(low=-np.pi, high=np.pi, size=self.n_features)

        self.estimator.fit(self._random_features(x), y)
        return self

    def predict_proba(self, x):
        """Apply the pipeline to obtain scores for input data.

        Delegates to the classifier's ``predict_proba``.
        NOTE(review): the SVC here is built without ``probability=True``, so
        this is presumably only usable with ``classifier='logreg'`` - confirm
        against the scikit-learn SVC documentation.
        """
        return self.estimator.predict_proba(self.transform(x))

    def predict(self, x):
        """Apply the pipeline to obtain discrete predictions for input data."""
        return self.estimator.predict(self.transform(x))

In [45]:
# Scores noted by the author for this configuration (presumably from an
# earlier run; no seed was fixed at that time, so they are only indicative):
#   accuracy_score is 0.7697
#   f1_score is  0.7682287387877702

# Seed NumPy's global generator so the random draws made while fitting
# (pair sampling, projection matrix, phases) are repeatable across runs.
np.random.seed(0)

# Number of leading training samples used for fitting.
N_TRAIN = 1000

pipeline_custom = RFFPipeline(
    use_PCA=True,
    new_dim=30,
    n_features=1000,
    classifier='svm',
    kernel='rbf',
)
pipeline_custom.fit(x_train[:N_TRAIN], y_train[:N_TRAIN])
prediction = pipeline_custom.predict(x_test)
print('accuracy_score is', accuracy_score(y_test, prediction))
print('f1_score is ', f1_score(y_test, prediction, average='macro'))

w_gauss (1000, 30)
q (1000, 30)
s (1000, 1000)
w (1000, 30)
accuracy_score is 0.1
f1_score is  0.01818181818181818


In [24]:
# Scratch check: shape of the second factor returned by np.linalg.qr
# for a 2x3 input (default "reduced" mode spelled out explicitly).
a = np.array([
    [1, 2, 3],
    [5, 6, 7],
])
print(a)
print(np.linalg.qr(a, mode='reduced')[1].shape)

[[1 2 3]
 [5 6 7]]
(2, 3)


In [11]:
# Scratch check: np.diag places a 1-D vector on the diagonal of a square matrix.
b = np.array((1, 2, 3))
np.diag(b)

array([[1, 0, 0],
       [0, 2, 0],
       [0, 0, 3]])