In [1]:
def parse_arff(file_path):
    """Parse a dense ARFF file into attribute descriptors and raw data rows.

    Blank lines and ``%`` comment lines are skipped, the ``@relation`` line is
    ignored, and keywords are matched case-insensitively.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(attributes, data)``:

        * ``attributes`` - one ``(name, kind, values)`` tuple per
          ``@attribute`` line, in file order.  ``kind`` is ``'nominal'`` when
          the declaration contains a ``{...}`` list (``values`` is then the
          list of labels with surrounding whitespace removed) and
          ``'numeric'`` for every other declaration (``values`` is ``0``).
        * ``data`` - one list of string fields per line following ``@data``;
          each field has surrounding whitespace removed but is otherwise
          unconverted.
    """
    attributes = []
    data = []
    data_started = False

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()

            if not line or line.startswith('%'):
                continue  # Skip comments and empty lines

            keyword = line.lower()

            if keyword.startswith('@relation'):
                continue  # Relation name is not needed

            if keyword.startswith('@attribute'):
                attr_name = line.split()[1].strip()

                if '{' in line:  # Nominal data
                    label_list = line[line.index('{') + 1:line.index('}')]
                    # Strip each label so "{a, b}" yields 'b', not ' b';
                    # otherwise later label lookups silently miss.
                    values = [label.strip() for label in label_list.split(',')]
                    attributes.append((attr_name, 'nominal', values))
                else:  # Anything without a {...} list is treated as numeric
                    attributes.append((attr_name, 'numeric', 0))
                continue

            if keyword.startswith('@data'):
                data_started = True
                continue

            if data_started:
                # Strip each field so "1, a" parses the same as "1,a".
                data.append([field.strip() for field in line.split(',')])

    return attributes, data


file_path = '2017.arff'
attributes, data = parse_arff(file_path)

print("len = ", len(attributes))
print("\nData:")

# Build each nominal column's label -> index table once, instead of
# rebuilding the same dictionary for every cell of every row.
nominal_lookups = {
    column: {value: index for index, value in enumerate(attr_values)}
    for column, (_, attr_type, attr_values) in enumerate(attributes)
    if attr_type == 'nominal'
}

# Encode every row in place: nominal labels become their index in the
# declared label list (None when the label is not declared), numeric fields
# become floats.
for row in data:
    for i, (attr_name, attr_type, attr_values) in enumerate(attributes):
        if attr_type == 'nominal':
            row[i] = nominal_lookups[i].get(row[i])
        elif attr_type == 'numeric':
            try:
                row[i] = float(row[i])
            except ValueError:
                # Unparseable field (e.g. a missing-value marker): use the
                # -1 sentinel rather than aborting the whole conversion.
                row[i] = -1

print(data)
    
# data[0][0] = int(data[0][0])
# print(data[0][0]+10)
import numpy as np
def transpose(matrix):
    """Return the transpose of ``matrix`` as a new list of lists.

    The column count is taken from the first row, so ``matrix`` must contain
    at least one row and every row must be at least that long.
    """
    row_count = len(matrix)
    col_count = len(matrix[0])
    flipped = []
    for c in range(col_count):
        # Column c of the input becomes row c of the output.
        flipped.append([matrix[r][c] for r in range(row_count)])
    return flipped

def dot_product(matrix1, matrix2):
    """Return the matrix product ``matrix1 @ matrix2`` as a list of lists.

    Each output entry is the sum of element-wise products of a row of
    ``matrix1`` with a column of ``matrix2``; pairs are formed with ``zip``,
    so a length mismatch is truncated to the shorter side rather than
    reported.
    """
    product = []
    for left_row in matrix1:
        out_row = []
        # zip(*matrix2) walks matrix2 column by column.
        for right_col in zip(*matrix2):
            out_row.append(sum(a * b for a, b in zip(left_row, right_col)))
        product.append(out_row)
    return product

def scalar_multiply(matrix, scalar):
    """Return a new matrix with every entry of ``matrix`` times ``scalar``."""
    scaled = []
    for row in matrix:
        scaled.append([entry * scalar for entry in row])
    return scaled

def matrix_subtract(matrix1, matrix2):
    """Return the element-wise difference ``matrix1 - matrix2``.

    Rows and entries are paired with ``zip``, so when the operands differ in
    size the result is truncated to the smaller of the two.
    """
    difference = []
    for left_row, right_row in zip(matrix1, matrix2):
        difference.append([x - y for x, y in zip(left_row, right_row)])
    return difference

def center_data(X):
    """Subtract the per-column mean from every row of ``X``.

    Args:
        X: Non-empty list of equally long numeric rows (samples x features).

    Returns:
        A tuple ``(centered_data, mean)`` where ``centered_data`` has the same
        shape as ``X`` and ``mean`` is the list of column means.
    """
    num_rows, num_cols = len(X), len(X[0])
    mean = [sum(row[j] for row in X) / num_rows for j in range(num_cols)]
    # Subtract the mean from each row explicitly: pairing X with the one-row
    # matrix [mean] via zip would stop after the first sample and drop the
    # rest of the data.
    centered_data = [[value - mu for value, mu in zip(row, mean)] for row in X]
    return centered_data, mean

def pca(X, num_components):
    """Project ``X`` onto its leading principal components.

    Args:
        X: List of numeric rows (samples x features).
        num_components: Number of leading components to keep.

    Returns:
        A tuple ``(pca_result, eigenvalues, principal_components, mean)``:
        the projected data as a list of rows, all covariance eigenvalues in
        descending order, the selected eigenvectors (one 1-D NumPy array per
        component) and the column means returned by ``center_data``.
    """
    centered_data, mean = center_data(X)

    # Sample covariance: (Xc^T Xc) / (n_samples - 1).  The divisor is the
    # number of observations (rows), not the number of features; it is
    # clamped to 1 so a single-row input does not divide by zero.
    num_samples = len(centered_data)
    transposed_data = transpose(centered_data)
    covariance_matrix = dot_product(transposed_data, centered_data)
    covariance_matrix = scalar_multiply(covariance_matrix, 1 / max(num_samples - 1, 1))

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, which the general-purpose
    # eig does not.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
    eigenvalues = [eigenvalues[i] for i in sorted_indices]
    eigenvectors = [eigenvectors[:, i] for i in sorted_indices]

    # Select the top k eigenvectors
    principal_components = eigenvectors[:num_components]

    # Project the data onto the principal components (components as columns)
    pca_result = dot_product(centered_data, transpose(principal_components))

    return pca_result, eigenvalues, principal_components, mean

# Expects `data` to already hold the parsed dataset

# How many principal components to keep
num_components = 8

# Run the from-scratch PCA; only the projected data is used here
pca_result, _, _, _ = pca(data, num_components)

# Show the projection
print("PCA Result:", pca_result, sep="\n")

def svd(matrix):
    """Compute a thin SVD of ``matrix`` via the eigendecomposition of A^T A.

    Args:
        matrix: Non-empty 2-D array-like of numbers with shape (m, n).

    Returns:
        A tuple ``(u, singular_values, v_t)`` of NumPy arrays with shapes
        (m, n), (n,) and (n, n) such that ``(u * singular_values) @ v_t``
        reconstructs ``matrix``.  Singular values are in descending order.
        Columns of ``u`` that belong to (numerically) zero singular values
        are left as zero vectors.

    Raises:
        ValueError: If ``matrix`` is empty or not two-dimensional.
    """
    a = np.asarray(matrix, dtype=float)
    if a.ndim != 2 or a.size == 0:
        raise ValueError("svd expects a non-empty 2-D matrix")

    # A^T A is symmetric positive semi-definite, so eigh yields real
    # eigenvalues and orthonormal eigenvectors (the right singular vectors).
    eigenvalues, eigenvectors = np.linalg.eigh(a.T @ a)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    order = np.argsort(eigenvalues)[::-1]
    # Round-off can push a zero eigenvalue slightly negative; clip so the
    # square root below never produces NaN.
    eigenvalues = np.clip(eigenvalues[order], 0.0, None)
    v = eigenvectors[:, order]

    singular_values = np.sqrt(eigenvalues)

    # Left singular vectors: u_i = A v_i / sigma_i.  Skip directions whose
    # eigenvalue is indistinguishable from zero to avoid dividing by ~0.
    tolerance = eigenvalues.max() * max(a.shape) * np.finfo(float).eps
    significant = eigenvalues > tolerance
    u = np.zeros((a.shape[0], a.shape[1]))
    u[:, significant] = (a @ v[:, significant]) / singular_values[significant]

    # Return V transposed so the factors multiply back as U * S * V^T.
    return u, singular_values, v.T


len =  85

Data:
[[10.0, 0, 0.14, 0.53, 0.19, 1.41, 0.33, 0.14, 0.89, 1.08, 0.47, 0.17, 0.31, 0.13, 0.15, 0.27, 1.89, 0.26, 0.13, 0.14, 0.13, 0.14, 0.27, -4.59, 0.52, 6.24, -0.01, 0.3, 2.15, 1.86, 0.0, 1.12, 8.71, 0.54, 0.0, 1.25, 0.13, 9.26, 1.37, 0.17, 0.16, 1.22, 0.46, 1.32, 1.52, 327846.0, 0.3, 0.15, 71.43, 18.38, 2.37, 3.04, 0.17, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0, 1.0, 1.0, 1.0, 0, 0, 1.0], [22.0, 1, 0.01, 0.5, 0.07, 1.4, 0.06, 0.03, 1.01, 0.65, 0.5, 0.02, 0.18, 0.05, 0.03, 0.07, 2.01, 0.04, 0.05, 0.02, 0.02, 0.44, 0.02, 2.6, 0.1, 6.28, 0.65, 0.04, 3.48, 1.28, 0.0, 0.66, 0.74, 0.82, 0.0, 0.67, 0.03, 0.51, 1.27, 0.1, 0.16, 0.51, 0.18, 0.67, 1.1, 138179.7, 0.02, 0.63, 29.12, 5.93, 3.56, 0.87, 0.08, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0, 1, 1, 1, 1, 740.13, 11412.92, 3674.22, 17281.89, 371.62, 498.88, 1.0, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 0, 1, 1.0], [27.0, 0, 0.03, 0.74, 0.01, 1.02, 0.0, 0.03, 

In [2]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import load_iris

# Use the iris measurements as a small reference dataset
iris = load_iris()
X = iris.data

# Configure both two-component reducers up front
pca_sklearn = PCA(n_components=2)
svd_sklearn = TruncatedSVD(n_components=2)

# Fit each reducer and project the samples (PCA first, then truncated SVD)
X_pca_sklearn = pca_sklearn.fit_transform(X)
X_svd_sklearn = svd_sklearn.fit_transform(X)

# Show the library projections
print("PCA scikit-learn result:", X_pca_sklearn, sep="\n")
print("\nSVD scikit-learn result:", X_svd_sklearn, sep="\n")

PCA scikit-learn result:
[[-2.68412563  0.31939725]
 [-2.71414169 -0.17700123]
 [-2.88899057 -0.14494943]
 [-2.74534286 -0.31829898]
 [-2.72871654  0.32675451]
 [-2.28085963  0.74133045]
 [-2.82053775 -0.08946138]
 [-2.62614497  0.16338496]
 [-2.88638273 -0.57831175]
 [-2.6727558  -0.11377425]
 [-2.50694709  0.6450689 ]
 [-2.61275523  0.01472994]
 [-2.78610927 -0.235112  ]
 [-3.22380374 -0.51139459]
 [-2.64475039  1.17876464]
 [-2.38603903  1.33806233]
 [-2.62352788  0.81067951]
 [-2.64829671  0.31184914]
 [-2.19982032  0.87283904]
 [-2.5879864   0.51356031]
 [-2.31025622  0.39134594]
 [-2.54370523  0.43299606]
 [-3.21593942  0.13346807]
 [-2.30273318  0.09870885]
 [-2.35575405 -0.03728186]
 [-2.50666891 -0.14601688]
 [-2.46882007  0.13095149]
 [-2.56231991  0.36771886]
 [-2.63953472  0.31203998]
 [-2.63198939 -0.19696122]
 [-2.58739848 -0.20431849]
 [-2.4099325   0.41092426]
 [-2.64886233  0.81336382]
 [-2.59873675  1.09314576]
 [-2.63692688 -0.12132235]
 [-2.86624165  0.06936447]
 [-

In [None]:
The results of the from-scratch implementation differ from those produced with the sklearn library.
One reason is that sklearn's PCA and SVD implementations are highly optimized and use efficient
algorithms for matrix decomposition and eigenvector computation. Furthermore, sklearn uses high-precision
arithmetic and rounding techniques to ensure consistent results, whereas the from-scratch implementation
may work at a different precision level. When implementing PCA and/or SVD from scratch, we may not have
access to those optimized algorithms, which leads to numerical differences in the results. Note also that
the two cells above run on different datasets (2017.arff versus iris), so their outputs are not directly
comparable to begin with.