<a href="https://colab.research.google.com/github/marekpiotradamczyk/ml_uwr_22/blob/main/kmeans_deep_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# How it differs from the default solution? 



# Load data & default imports

In [15]:
import os
import sys
import numpy as np
import pandas as pd
import scipy.stats as sstats
from sklearn import datasets
import sklearn.linear_model
from tqdm.auto import tqdm
from matplotlib import animation, pyplot, rc
import matplotlib.pyplot as plt
import httpimport
from PIL import Image

In [6]:
!pip install -q gdown httpimport
![ -e cifar.npz ] || gdown 'https://drive.google.com/uc?id=1oBzZdtg2zNTPGhbRy6DQ_wrf5L5OAhNR' -O cifar.npz

Downloading...
From: https://drive.google.com/uc?id=1oBzZdtg2zNTPGhbRy6DQ_wrf5L5OAhNR
To: /Users/denys/dev/cifar/cifar.npz
100%|████████████████████████████████████████| 185M/185M [00:07<00:00, 24.1MB/s]


In [7]:
with np.load('cifar.npz') as data:
    cifar_train_data = data['train_data']
    cifar_train_labels = data['train_labels']
    cifar_test_data = data['test_data']
    cifar_test_labels = data['test_labels']



In [8]:
X_trn = cifar_train_data
y_trn = cifar_train_labels
X_tst = cifar_test_data
y_tst = cifar_test_labels

# Deep Features

## find important patterns in patches

In [10]:
def contrast(image):
    return (image-image.min())/(image.max() - image.min())

In [11]:
def normalize_patch(patch, eps=10):
    return (patch - patch.mean())/np.sqrt(patch.var() + eps)


In [12]:
def whiten(X):
    X_norm = (X - X.mean(axis=0))/X.std(axis=0)
    cov = np.cov(X_norm, rowvar=False) 
    U,S,V = np.linalg.svd(cov)

    X_zca = U.dot(np.diag(1.0/np.sqrt(S + 0.1))).dot(U.T).dot(X_norm.T).T
    return X_zca

In [17]:
PATCH_SIZE = 4        # SMALLLER PATCHES
patch_num  = 1000000  # MORE PATCHES
STRIDE     = 2        # SMALLER STRIDE, THEREFORE MORE DENSE COVER

In [27]:
patches = []

X_trn_reshaped = X_trn.reshape(-1,32,32,3)

for i in range(patch_num):
    for r in range(32-PATCH_SIZE+1):
        for c in range(32-PATCH_SIZE+1):
            patch = X_trn_reshaped[i][c:(c+PATCH_SIZE),r:(r+PATCH_SIZE)].flatten()
            patch_norm = normalize_patch(patch, eps=10)
            patches.append(patch_norm)
            if len(patches) % 100000 == 0:
                print(len(patches))
    if len(patches) >= patch_num:
        break

print("Done!")

  0%|                                | 131/1000000 [00:01<2:30:32, 110.69it/s]

100000


  0%|                                | 260/1000000 [00:02<2:33:48, 108.34it/s]

200000


  0%|                                | 376/1000000 [00:03<2:32:22, 109.34it/s]

300000


  0%|                                | 492/1000000 [00:04<2:31:49, 109.72it/s]

400000


  0%|                                | 609/1000000 [00:05<2:31:13, 110.15it/s]

500000


  0%|                                | 729/1000000 [00:06<2:31:17, 110.09it/s]

600000


  0%|                                | 847/1000000 [00:07<2:29:27, 111.43it/s]

700000


  0%|                                | 967/1000000 [00:08<2:29:01, 111.73it/s]

800000


  0%|                               | 1087/1000000 [00:09<2:28:53, 111.82it/s]

900000


  0%|                               | 1189/1000000 [00:10<2:30:59, 110.25it/s]

1000000
Done!





In [22]:
P = np.vstack(patches)
P_zca = whiten(P)

(1000790, 48)

# Clusters

In [26]:
kroot = 16
k = kroot*kroot

In [27]:
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, verbose=False, n_init=1, max_iter=200, batch_size=10000)
kmeans.fit(P_zca)
kmeans.labels_
kmeans.cluster_centers_


Init 1/1 with method k-means++
Inertia for init 1/1: 450959.8593846854
[MiniBatchKMeans] Reassigning 9 cluster centers.
Minibatch step 1/20015: mean batch inertia: 14.98315510146144
[MiniBatchKMeans] Reassigning 5 cluster centers.
Minibatch step 2/20015: mean batch inertia: 13.478464989976846, ewa inertia: 13.478464989976846
[MiniBatchKMeans] Reassigning 1 cluster centers.
Minibatch step 3/20015: mean batch inertia: 13.199475055044674, ewa inertia: 13.472889601410559
Minibatch step 4/20015: mean batch inertia: 13.092389749776968, ewa inertia: 13.465285619127874
Minibatch step 5/20015: mean batch inertia: 13.1286186684535, ewa inertia: 13.458557601975954
Minibatch step 6/20015: mean batch inertia: 12.873415947754966, ewa inertia: 13.44686401851605
Minibatch step 7/20015: mean batch inertia: 12.892793789640809, ewa inertia: 13.435791372401622
Minibatch step 8/20015: mean batch inertia: 12.953167849882572, ewa inertia: 13.426146531020775
Minibatch step 9/20015: mean batch inertia: 12.9786

array([[ 0.14222536,  0.36211942, -0.27262611, ...,  0.43261444,
         0.71349216, -0.20196903],
       [ 0.272288  ,  0.28599981,  0.34276708, ...,  0.21279491,
         0.24887226,  0.38089255],
       [ 0.05226896, -0.20149974,  0.12568901, ..., -0.03864519,
         0.41419189, -0.50734431],
       ...,
       [ 0.65532694,  0.72854747,  0.60294978, ...,  0.03079274,
        -0.03002095, -0.0248214 ],
       [-0.00651118,  0.15903488,  0.37014659, ..., -0.07843439,
         0.12570746,  0.4495836 ],
       [-0.39078334, -0.50727402, -0.41144376, ...,  0.42162121,
         0.47255575,  0.52072806]])

In [28]:
filters_final = kmeans.cluster_centers_
filters_final.shape

(256, 48)

# Some intuition what the patches are

In [9]:
plt.figure(figsize=(kroot,kroot))
for xx in range(k):
    plt.subplot(kroot,kroot,xx+1)
    plt.imshow((contrast(filters_final[xx])).reshape(PATCH_SIZE, PATCH_SIZE, 3))
    plt.axis('off')
    

NameError: name 'kroot' is not defined

# Transform dataset according to the patterns found in patches




In [30]:
def dist(x,y):
  return np.sqrt((x - y).dot(x-y))
  #return (x-y).dot(x-y)
    
def create_patch_features(X):    
    X_mapped_list_per_image = []
    for i in range(X.shape[0]):
      if i % 1000 == 0:
        print(i,"/", X.shape[0])
        
      mapped_features = []
      for r in range(0, 32-PATCH_SIZE+1, STRIDE):
        for c in range(0, 32-PATCH_SIZE+1, STRIDE):
          patch = X[i].reshape(32,32,3)[c:(c+PATCH_SIZE),r:(r+PATCH_SIZE)].flatten()
          patch_norm = normalize_patch(patch, eps=0.01)
          mapped_features.append([dist(patch_norm, f) for f in filters_final])
      X_mapped_list_per_image.append(np.vstack(mapped_features))
    X_mapped = np.asarray(X_mapped_list_per_image).reshape(-1, ((32-PATCH_SIZE)//STRIDE+1)**2*filters_final.shape[0])
    return X_mapped

In [31]:
from sklearn.feature_extraction import image
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

def create_patch_features__vectorized(X):    
    X_mapped_list_per_image = []
    for i in range(X.shape[0]):
        if i % 5000 == 0:
            print(i,"/", X.shape[0])
        patches = image.extract_patches_2d(X[i], (PATCH_SIZE, PATCH_SIZE))
        strided_patches = patches.reshape( 32-PATCH_SIZE+1 , 32-PATCH_SIZE+1, PATCH_SIZE, PATCH_SIZE, 3)[::STRIDE,::STRIDE,:,:,:]
        strided_patches = strided_patches.reshape(((32-PATCH_SIZE)//STRIDE+1)**2, PATCH_SIZE * PATCH_SIZE * 3)
        mapped_features = euclidean_distances(np.asarray([normalize_patch(patch, eps=0.01) for patch in strided_patches]), filters_final)
        X_mapped_list_per_image.append(mapped_features.reshape(((32-PATCH_SIZE)//STRIDE+1)**2 * filters_final.shape[0]))
    X_mapped = np.asarray(X_mapped_list_per_image)
    return X_mapped

In [32]:
X_mapped_trn = create_patch_features__vectorized(X_trn)

0 / 50000
5000 / 50000
10000 / 50000
15000 / 50000
20000 / 50000
25000 / 50000
30000 / 50000
35000 / 50000
40000 / 50000
45000 / 50000


In [33]:
X_mapped_trn.shape

(50000, 16384)

In [34]:
X_mapped_tst = create_patch_features__vectorized(X_tst)

0 / 10000
5000 / 10000


In [35]:
X_mapped_tst.shape

(10000, 16384)

# Logistic Regression on mapped features

In [36]:
X_mapped_trn_norm = (X_mapped_trn - X_mapped_trn.mean(axis=0))/X_mapped_trn.std(axis=0)

In [37]:
X_mapped_tst_norm = (X_mapped_tst - X_mapped_trn.mean(axis=0))/X_mapped_trn.std(axis=0)

In [3]:
from scipy.stats import ks_2samp
statistic, p_value = ks_2samp(X_mapped_trn_norm, X_mapped_tst_norm)

NameError: name 'X_mapped_trn_norm' is not defined

In [41]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter=200, n_jobs=-1, verbose=False).fit(X_mapped_trn_norm, y_trn.flatten())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Results! 😊

In [42]:
y_trn_pred = clf.predict(X_mapped_trn_norm)
(y_trn.flatten() == y_trn_pred).mean()

0.72046

In [43]:
y_tst_pred = clf.predict(X_mapped_tst_norm)
(y_tst.flatten() == y_tst_pred).mean()

0.557