## Foundations of Data Science - 2020 - Exercise Sheet 11
### Exercise 11.1 (Application of the SVD)
#### In this exercise you shall play with the example from 
#### https://www.engr.uvic.ca/~seng474/svd.pdf  (section 3)

In [141]:
import numpy as np
import matplotlib.pyplot as plt

### (i) Reprogram it, denote by $k$ the used dimension.

In [138]:
def svd(A, k, scaling = True):
    S, sigma, UT = np.linalg.svd(A)
    Sk  = S[:, :k]
    UTk = UT[:k]
    
    if scaling:
        Sk  = Sk * sigma[:k]
        UTk = sigma[:k].reshape(k,-1) * UTk
    
    return Sk, UTk

def cosine(q, UT):
    cosineDist = q @ UT / (np.linalg.norm(q) * np.linalg.norm(UT, axis=0))    
    return cosineDist


def main(A, query, k=2, scaling=True, dist='cosine'):
    S, UT      = svd(A, k, scaling)
    q          = np.array([S[i] for i in query]).mean(axis=0)
    if 'cosine' == dist:
        return cosine(q, UT)
    elif 'euclid' == dist:
        return np.linalg.norm(UT - q.reshape(k, -1), axis=0)
    else:
        return -1

In [133]:
A = np.array([
#   d1,d2,d3,d4,d5
    [1, 0, 1, 0, 0],  # romeo
    [1, 1, 0, 0, 0],  # juliet
    [0, 1, 0, 0, 0],  # happy
    [0, 1, 1, 0, 0],  # dagger   <---
    [0, 0, 0, 1, 0],  # live
    [0, 0, 1, 1, 0],  # die      <---
    [0, 0, 0, 1, 0],  # free
    [0, 0, 0, 1, 1]]) # new - hampshire

A_pruned = np.array([
#   d1,d2,d3,d4,d5
    [1, 0, 1, 0, 0],  # romeo 
    [1, 1, 0, 0, 0],  # juliet
    [0, 1, 1, 0, 0],  # dagger
    [0, 0, 1, 1, 0],  # die
    [0, 0, 0, 1, 1]]) # new - hampshire

query        = [3, 5] # i.e. dagger and die
query_pruned = [3, 4]
D            = np.array(['d1', 'd2', 'd3', 'd4', 'd5'])

### (ii) Examine the resulting ranking if:

#### (a) $k \in \{2,3,4,5\}$.

In [140]:
for k in range(2, 6):
    out = main(A, query, k)
    ind = np.flip(np.argsort(out))
    
    print('\n k =', k)
    print('distances:', out)
    print('ranking  :', D[ind])


 k = 2
distances: [0.77279649 0.73067682 0.98443599 0.61873061 0.48491832]
ranking  : ['d3' 'd1' 'd2' 'd4' 'd5']

 k = 3
distances: [0.77908876 0.49623703 0.9206239  0.57426751 0.31817696]
ranking  : ['d3' 'd1' 'd4' 'd2' 'd5']

 k = 4
distances: [0.25983119 0.52140345 0.91539774 0.51737492 0.14161281]
ranking  : ['d3' 'd2' 'd4' 'd1' 'd5']

 k = 5
distances: [0.25757393 0.52089003 0.91297576 0.50990905 0.11282241]
ranking  : ['d3' 'd2' 'd4' 'd1' 'd5']


#### (b) scaling omitted.

In [135]:
out = main(A, query, scaling=False)
ind = np.flip(np.argsort(out))

print('distances:', out)
print('ranking  :', D[ind])

distances: [0.74094797 0.69742696 0.98037899 0.55485875 0.41876367]
ranking  : ['d3' 'd1' 'd2' 'd4' 'd5']


#### (c) single occurence words omitted.

In [136]:
out = main(A_pruned, query_pruned)
ind = np.flip(np.argsort(out))

print('distances:', out)
print('ranking  :', D[ind])

distances: [0.03616882 0.03616882 0.66694737 0.99287083 0.93362179]
ranking  : ['d4' 'd5' 'd3' 'd2' 'd1']


#### (d) Euclidean distance.

In [139]:
out = main(A, query, dist='euclid')
ind = np.flip(np.argsort(out))

print('distances:', out)
print('ranking  :', D[ind])

distances: [0.72095593 0.97896092 0.3809152  1.54593709 0.96806219]
ranking  : ['d4' 'd2' 'd5' 'd1' 'd3']
