In [2]:
# Imports
import pandas as pd
import numpy as np
from warnings import filterwarnings

# Disable warnings from being printed
filterwarnings('ignore')

In [3]:
fileN = 800
fileM = 100000

def read_data(filename):
    data = pd.DataFrame(columns=range(fileM))
    with open(filename, 'r') as datafile:
        lines = datafile.readlines()
        for i in range(len(lines)):
            record = np.fromstring(lines[i], dtype=int, sep=' ')
            record_bool = [0 for i in range(fileM)]
            for col in record:
                record_bool[col-1] = 1
            data.loc[i] = record_bool
    return data

def read_labels(filename):
    labels = []
    with open(filename, 'r') as datafile:
        lines = datafile.readlines()
        for line in lines:
            labels.append(np.fromstring(line[0], dtype=int, sep=' '))
    return labels

# Read the data into dataframe
train_data = read_data("dorothea/dorothea_train.data")

# Get the labels of the train data
train_data_labels = read_labels("dorothea/dorothea_train.labels")

In [10]:
# Compute data which is constant in different runs pf pca
# Center the data around mean
data_centered = train_data - np.mean(train_data, axis=0)

# Compute the covariance matrix (xx' i.e nXn), and find eigenvalues and eigenvectors
cov_matrix = data_centered.transpose().cov()
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Now eigenvectors of x'x matrix can be obtained from these by multiplying by x', eigenvalues remain same
eigenvectors = np.dot(np.transpose(data_centered), eigenvectors)

# Sort the eigenvectors in decreasing order of eigenvalues
sort_order = np.argsort(eigenvalues)[::-1]
new_eigenvectors = np.zeros(eigenvectors.shape)
for i in range(eigenvalues.shape[0]):
    new_eigenvectors[:, i] = eigenvectors[:, sort_order[i]]

In [17]:
# Get data in the new feature space of reduced dimensionality.
def pca_(k):
    # Get first K eigenvectors
    k = 100
    eigenpairs_firstK = new_eigenvectors[:, :k]
    
    # Get data in reduced dimension space
    projected_data = np.dot(data_centered, eigenpairs_firstK)
    
    return projected_data

In [22]:
projected_data

array([[ -6.32850268e+01,  -5.43051881e+01,  -4.80361645e+01, ...,
         -8.80464163e+00,  -1.50051721e+00,  -3.21771842e+01],
       [ -9.25665240e+01,  -3.30053539e+01,  -6.52298515e+01, ...,
          2.19743468e-01,  -1.29216052e+01,   2.59250114e+01],
       [ -9.35394659e+01,  -4.57370007e+01,  -6.79464162e+01, ...,
         -1.80609752e+00,   6.16340601e+00,  -5.43663162e-01],
       ..., 
       [  5.79674847e+02,  -1.86917856e+02,   2.79890305e+02, ...,
          9.16322309e+01,  -6.95559640e+01,  -6.61201140e+01],
       [ -9.24260720e+01,  -2.29182922e+01,  -6.22005122e+01, ...,
         -1.29809935e+01,  -8.96329587e-02,  -1.63433875e+01],
       [ -1.11818493e+02,  -3.62876092e+01,  -4.90356842e+01, ...,
         -3.59491227e+01,   9.69300213e+00,  -1.75719073e+01]])