In [5]:
%pylab inline
import glob
import spams
import pandas as pd
from numpy.linalg import pinv as moore_penrose_inv
from numpy.linalg import svd, eig
from tqdm import tqdm

Populating the interactive namespace from numpy and matplotlib


In [6]:
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/'
for f in sorted(glob.glob('{}/*.tsv'.format(prefix))):
    df = pd.read_table(f)
    df = df.drop(columns=['target_id'])
    matrix = df.values
    colsize = matrix.shape[1]
    rank = matrix_rank(matrix)
    print('{} : Array colsize: {} | rank: {}'.format(f.split('/')[-1], colsize, rank))

SRP007412-gallus_gallus.tsv : Array colsize: 12 | rank: 12
SRP007412-gorilla_gorilla.tsv : Array colsize: 11 | rank: 11
SRP007412-homo_sapiens.tsv : Array colsize: 18 | rank: 18
SRP007412-macaca_mulatta.tsv : Array colsize: 13 | rank: 13
SRP007412-monodelphis_domestica.tsv : Array colsize: 12 | rank: 12
SRP007412-mus_musculus.tsv : Array colsize: 17 | rank: 17
SRP007412-ornithorhynchus_anatinus.tsv : Array colsize: 12 | rank: 12
SRP007412-pan_paniscus.tsv : Array colsize: 12 | rank: 12
SRP007412-pan_troglodytes.tsv : Array colsize: 15 | rank: 15
SRP007412-pongo_abelii.tsv : Array colsize: 9 | rank: 9
SRP016501-bos_taurus.tsv : Array colsize: 27 | rank: 27
SRP016501-gallus_gallus.tsv : Array colsize: 27 | rank: 27
SRP016501-macaca_mulatta.tsv : Array colsize: 27 | rank: 27
SRP016501-mus_musculus.tsv : Array colsize: 26 | rank: 26
SRP016501-rattus_norvegicus.tsv : Array colsize: 27 | rank: 27


In [31]:
def einsum_row_norm(a):
    return np.sqrt(np.einsum('ij,ij->i', a, a))

def einsum_col_norm(a):
    return np.sqrt(np.einsum('ij,ij->j', a, a))


def ho_SVD(list_of_X):
    """ Perform higher order SVD
    
    list_of_X: list
               [X_1, X_2, X_3, ..., X_n] size (n x m)
    
    """
    # Step 1
    # A_i = X_i^T X_i
    # A_inv = moore_penrose_inv(A_i)
    A = []
    N = len(list_of_X)
    A_inv = []
    for X in list_of_X:
        A_i = np.dot(X.T, X)
        A.append(A_i)
        A_inv.append(np.linalg.inv((A_i)))
    # Step 2
    # W = 1/N(N-1) * sum_{i=1}^jn sum_{j>i}^N A_i moore_penrose_inv(A_j) + A_j moore_penrose_inv(A_i)
    W = 0
    
    for i in range(N):
        for j in range(i+1, N):
            W = W + (np.dot(A[i], A_inv[j]) + np.dot(A[j], A_inv[i]))
    W =  W/(N*(N-1))
    
    # Step 3
    # Eigen decomposition of W?
    # WV = V\Lambda
    # V= (v_1, v_2, ..., v_n)
    # ||v||_k =1, but V is not necesarrily orthonormal
    # But we don't
    
    eigen_values, V = eig(W)

    idx = eigen_values.argsort()[::-1]   
    eigen_values = eigen_values[idx]
    V = V[:,idx]
    
    
    # Step 4
    # solve for  VB_i = X_i^T
     #Calculate the Bi matrices
   
    B = []
    U = []
    sigmas = []
    for X in list_of_X:
        #B_i = spams.omp( np.asfortranarray(X.T), V, lambda1=0, numThreads = 24)]
        # VB.T = X.T
        B_i = np.dot(np.linalg.inv(V), X.T).T
        B.append(B_i)
        sigma_i  = np.linalg.norm(B_i, axis=0) #einsum_col_norm(B)
        U_i = B_i/sigma_i 
        sigmas.append(sigma_i))
        U.append(U_i)
    angles = np.array()
    return U, sigmas, V
    

In [59]:
df = pd.read_table('/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-rattus_norvegicus.tsv')
df = df.drop(columns=['target_id'])
matrix1 =  np.asfortranarray(df.values)

df = pd.read_table('/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-gallus_gallus.tsv')
df = df.drop(columns=['target_id'])
matrix2 =  np.asfortranarray(df.values)

df = pd.read_table('/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-bos_taurus.tsv')
df = df.drop(columns=['target_id'])
matrix3 =  np.asfortranarray(df.values)

df = pd.read_table('/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-macaca_mulatta.tsv')
df = df.drop(columns=['target_id'])
matrix4 =  np.asfortranarray(df.values)



In [60]:
U, sigmas, V = ho_SVD([matrix1, matrix2, matrix3, matrix4])
sigmas_matrix = [np.diag(sigma) for sigma in sigmas]


In [65]:
df_human = pd.read_table('../PNAS_2000_datasets/Human.txt', header=1)
df_human.loc[:, '2_hr':].head()

Unnamed: 0,2_hr,4_hr,6_hr,8_hr,10_hr,12_hr,14_hr,16_hr,18_hr,20_hr,22_hr,24_hr,26_hr,28_hr,30_hr,32_hr,34_hr,36_hr
0,1.279,0.962,1.4,1.346,1.18,1.366,1.308,1.498,1.448,1.242,1.547,1.372,0.935,1.479,1.526,Null,1.398,1.533
1,0.912,0.855,1.169,0.939,1.17,1.112,1.195,1.31,1.279,1.165,1.194,1.076,1.142,1.58,1.354,Null,Null,1.222
2,3.737,2.737,3.221,3.187,3.269,2.465,2.13,1.964,1.577,1.522,1.587,1.661,1.405,1.925,1.457,Null,1.563,1.66
3,1.075,1.051,1.255,1.364,1.469,1.305,1.451,1.6,1.385,1.155,1.234,1.239,0.948,1.415,1.43,1.184,1.292,1.384
4,1.595,1.198,1.223,1.359,1.285,1.288,1.201,1.276,1.101,1.143,1.452,Null,1.272,1.51,1.431,Null,Null,1.527


In [66]:
df_human.loc[:, '2_hr':].shape

(12056, 18)

In [63]:
df_yeast = pd.read_table('../PNAS_2000_datasets/Yeast.txt', header=1)
df_yeast.loc[:, '0_min':].head()

Unnamed: 0,0_min,7_min,14_min,21_min,28_min,35_min,42_min,49_min,56_min,63_min,70_min,77_min,84_min,91_min,98_min,105_min,112_min,119_min
0,0.705,0.511,0.482,0.562,0.654,0.656,0.657,0.798,0.485,0.729,0.679,1.006,0.546,1.008,0.83,0.947,0.684,Null
1,0.868,0.49,0.791,0.71,0.67,0.631,0.904,1.024,0.799,1.7,0.726,1.384,0.9,1.555,1.077,1.154,1.049,1.096
2,1.063,0.593,0.757,1.002,0.934,1.149,1.022,1.62,0.861,1.247,0.83,1.082,0.702,1.508,1.043,1.004,0.895,0.964
3,0.926,0.574,0.533,0.848,0.933,0.89,1.049,Null,0.799,1.583,0.607,1.357,0.702,1.327,0.82,1.243,0.989,1.002
4,0.667,0.538,0.695,1.111,0.889,1.355,0.99,1.234,0.655,0.868,0.682,1.348,0.929,1.691,1.192,0.937,1.097,0.812


In [64]:
df_yeast.loc[:, '0_min':].shape

(4523, 18)

In [46]:
print(df_yeast.columns)

Index(['YORF', 'NAME', 'PROCESS', 'FUNCTION', 'MICROARRAY_CLASSIFICATION',
       'TRADITIONAL_CLASSIFICATION', '0_min', '7_min', '14_min', '21_min',
       '28_min', '35_min', '42_min', '49_min', '56_min', '63_min', '70_min',
       '77_min', '84_min', '91_min', '98_min', '105_min', '112_min',
       '119_min'],
      dtype='object')


In [45]:
df_human.shape

(12056, 23)

In [47]:
print(df_human.columns)

Index(['CLID', 'SYMBOL', 'NAME', 'MICROARRAY_CLASSIFICATION',
       'TRADITIONAL_CLASSIFICATION', '2_hr', '4_hr', '6_hr', '8_hr', '10_hr',
       '12_hr', '14_hr', '16_hr', '18_hr', '20_hr', '22_hr', '24_hr', '26_hr',
       '28_hr', '30_hr', '32_hr', '34_hr', '36_hr'],
      dtype='object')


In [55]:
matrix1 = np.loadtxt('../hogsvd-python/A.txt')

params = {
    'K': matrix1.shape[1],
    'lambda1': 0.01,
    'mode': 2,
    'modeD': 0,   
    'posD': True,
    'posAlpha': True,
    'verbose': False
}

"""
 param = {
            'K': 2,
            'lambda1': self.lambda1,
            'mode': 2,
            'modeD': 0,
            'posD': True,
            'posAlpha': True,
            'verbose': False
        }
        stain_matrix = spams.trainDL(OD.T, **param).T
def get_concentrations(self, image, stain_matrix):
        OD = RGB2OD(image).reshape((-1, 3))
        if self.maskout_white:
            nonwhite_mask = get_nonwhite_mask(
                image, self.nonwhite_threshold).reshape((-1, ))
            OD = OD[nonwhite_mask]
        coefs = spams.lasso(
            OD.T, D=stain_matrix.T, mode=2, lambda1=self.lambda1,
            pos=True).toarray().T
        return coefs
"""        
        
matrix1_W = spams.trainDL(np.asfortranarray(matrix1), **params)
matrix1_H = spams.lasso(np.asfortranarray(matrix1), D=matrix1_W,
                        mode=2, lambda1=0.01,
                        pos=True).toarray()



In [56]:
matrix1_W.shape

(5, 5)

In [57]:
matrix1_H.shape

(5, 5)

In [58]:
matrix1_H

array([[0.        , 0.        , 0.        , 1.72398019, 0.        ],
       [0.        , 1.5440279 , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.50001829],
       [1.48127535, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.6297156 , 0.        , 0.        ]])

In [33]:
matrix1 = np.loadtxt('../hogsvd-python/A.txt')
matrix2 = np.loadtxt('../hogsvd-python/B.txt')
matrix3 = np.loadtxt('../hogsvd-python/C.txt')


In [34]:
U, sigmas, V = ho_SVD([matrix1, matrix2, matrix3])
sigmas

[array([[3.09280748, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.37274234, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.64861517, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.99819793, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.46047133]]),
 array([[0.0496686 , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 2.54977057, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.5225826 , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.51731124, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.83993335]]),
 array([[1.91481127, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 2.10438386, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.33562861, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.65995043, 0.        ],
        [0.     

In [28]:
sigmas

[array([[1.88529753, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 1.40617122, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 1.1980833 , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.41909373, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.53383005]]),
 array([[1.59284129, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.85229232, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 1.42873097, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.27658053, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.9030575 ]]),
 array([[0.7691295 , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 1.38869249, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.88894246, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.50545459, 0.        ],
        [0.     

In [17]:
def calcHOGSVD(matList):
    #assert len(matList)==3
    #Step 1: calculate Di^T*Di
    aMat = list()
    sMat = list()
    N = len(matList)
    for mat in matList:
        D = mat
        aMatTemp = np.dot(D.T, D)
        aMat.append(aMatTemp)

    #calculate pairwise Sij sums
    for i in range(len(aMat)):
        for j in range(i + 1, len(aMat)):
            print(i, j)
            sMatTemp = 0.5 * (np.dot(aMat[i], np.linalg.inv(aMat[j])) + np.dot(
                aMat[j], np.linalg.inv(aMat[i])))
            sMat.append(sMatTemp)

    S = np.zeros(sMat[0].shape)

    #sum all Si,j matrices to get S
    for s in sMat:
        S = S + s

    #normalize S
    S = (2.0 / (N * (N - 1))) * S

    #Do the eigen-decomposition on S
    L, V = np.linalg.eig(S)

    #Calculate the Bi matrices
    bMatList = list()
    for mat in matList:
        D = mat
        BT = np.dot(np.linalg.inv(V), D.T)
        bMatList.append(BT.T)

    #from Bi matrices calculate U and Sigma matrices
    sigList = list()
    uMatList = list()

    for B in bMatList:
        sig = np.sqrt(np.sum(B * B, axis=0))
        U = B / sig
        SIG = np.diag(sig)
        sigList.append(SIG)
        uMatList.append(U)

    return (uMatList, sigList, V)

In [19]:
U, sigmas, V =  calcHOGSVD([matrix1, matrix2, matrix3])
sigmas

0 1
0 2
1 2


[array([[3.09280748, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.37274234, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.64861517, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.46047133, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.99819793]]),
 array([[0.0496686 , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 2.54977057, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.5225826 , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.83993335, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.51731124]]),
 array([[1.91481127, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 2.10438386, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.33562861, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.7093041 , 0.        ],
        [0.     