# CS 3101: Pre Finals
### Sheena Stella A. Salde

In [9]:
# Function to load ARFF file into a list of dictionaries
def load_arff(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    data_start = lines.index('@data\n') + 1
    attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]
    data_list = []

    for line in lines[data_start:]:
        values = line.strip().split(',')
        data_dict = {attr: val if val != 'm' else None for attr, val in zip(attributes, values)}
        data_list.append(data_dict)

    return data_list

def preprocess_data(data_list):
    for entry in data_list:
        for key, value in entry.items():
            if value == 'm':
                entry[key] = None

    return data_list

def linear_interpolation(data_list):
    for i in range(1, len(data_list) - 1):
        for key, value in data_list[i].items():
            if value is None and data_list[i - 1][key] is not None and data_list[i + 1][key] is not None:
                data_list[i][key] = (float(data_list[i - 1][key]) + float(data_list[i + 1][key])) / 2

    return data_list

def z_score_standardization(matrix):
    for i in range(2, len(matrix[0])):
        column = [float(row[i]) for row in matrix if row[i] is not None and row[i] != 'm']

        if len(set(column)) == 1:
            continue

        mean_val = sum(column) / len(column)

        if len(column) > 1:
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5
        else:
            std_dev = 0

        for row in matrix:
            if row[i] is not None and row[i] != 'm':
                if std_dev != 0:
                    row[i] = (float(row[i]) - mean_val) / std_dev
                else:
                    row[i] = 0  

    return matrix

def dot_product(v1, v2):
    result = sum(x * y for x, y in zip(v1, v2) if isinstance(x, (int, float)) and isinstance(y, (int, float)))
    return result

def subtract(v1, v2):
    return [x - y for x, y in zip(v1, v2)]

def scale(vector, scalar):
    return [x * scalar for x in vector]

def multiply_matrix_vector(matrix, vector):
    return [dot_product(row, vector) for row in matrix]

def multiply_matrix(matrix1, matrix2):
    result = []
    for row in matrix1:
        new_row = []
        for col in transpose(matrix2):
            element = dot_product(row, col)
            new_row.append(element)
        result.append(new_row)
    return result

def transpose(matrix):
    return [[row[i] for row in matrix] for i in range(len(matrix[0]))]

def mean(column):
    values = [float(val) for val in column if val is not None]
    return sum(values) / len(values) if values else 0

def covariance_matrix(matrix):
    n = len(matrix)
    num_features = len(matrix[0])
    transposed_matrix = transpose(matrix)
    cov_matrix = [[0] * num_features for _ in range(num_features)]

    for i in range(num_features):
        for j in range(num_features):
            mean_i = mean(matrix[i])
            mean_j = mean(matrix[j])
            values_i = [float(val) for val in matrix[i] if val is not None]
            values_j = [float(val) for val in matrix[j] if val is not None]
            cov_matrix[i][j] = sum((val_i - mean_i) * (val_j - mean_j) for val_i, val_j in zip(values_i, values_j)) / (n - 1)

    return cov_matrix

def custom_random():
    seed = 1
    while True:
        seed = (seed * 1103515245 + 12345) & 0x7FFFFFFF
        yield seed / 0x7FFFFFFF


# Function to perform PCA
def pca(data_matrix, num_components):
    # Calculate the covariance matrix
    cov_matrix = covariance_matrix(data_matrix)

    # Calculate the eigenvalues and eigenvectors using power iteration
    num_features = len(data_matrix[0])
    eigenvalues = [0] * num_features
    eigenvectors = [[0] * num_features for _ in range(num_features)]

    random_generator = custom_random()

    for i in range(num_features):
        # Use a simple random number generator as the initial approximation
        vector = [next(random_generator) for _ in range(num_features)]

        for _ in range(1000):  # Adjust the number of iterations as needed
            new_vector = multiply_matrix_vector(cov_matrix, vector)
            magnitude = sum(x ** 2 for x in new_vector) ** 0.5
            vector = scale(new_vector, 1 / magnitude)

        eigenvalues[i] = dot_product(new_vector, vector)
        eigenvectors[i] = vector

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = sorted(range(num_features), key=lambda k: eigenvalues[k], reverse=True)
    eigenvalues = [eigenvalues[i] for i in sorted_indices]
    eigenvectors = [[eigenvectors[j][i] for j in sorted_indices] for i in range(num_features)]

    # Select the top 'num_components' eigenvectors
    top_eigenvectors = eigenvectors[:num_components]

    # Project the data onto the new subspace defined by the top eigenvectors
    pca_result = multiply_matrix(data_matrix, transpose(top_eigenvectors))

    return pca_result



def display_data_table(data_list):
    # Get the attribute names
    attributes = list(data_list[0].keys())

    # Calculate the maximum width for each column
    column_widths = {attr: max(len(attr), max(len(str(entry[attr])) for entry in data_list)) for attr in attributes}

    # Print header
    header = "|".join(f"{attr:^{column_widths[attr]}}" for attr in attributes)
    print(header)
    print("-" * sum(column_widths.values()))

    # Print data rows
    for entry in data_list:
        row = "|".join(f"{str(entry[attr]):^{column_widths[attr]}}" if entry[attr] is not None else 'm' for attr in attributes)
        print(row)


def display_matrix(matrix):
    for row in matrix:
        print("|".join(f"{str(cell):^10}" for cell in row))


# load and preprocess data for each year
file_path_2017 = r'.\datas\2017.arff' 
file_path_2018 = r'.\datas\2018.arff'
file_path_2019 = r'.\datas\2019.arff'
file_path_2020 = r'.\datas\2020.arff'
file_path_2021 = r'.\datas\2021 Q1.arff'

data_2017 = load_arff(file_path_2017)
data_2018 = load_arff(file_path_2018)
data_2019 = load_arff(file_path_2019)
data_2020 = load_arff(file_path_2020)
data_2021 = load_arff(file_path_2021)

data_2017_preprocessed = preprocess_data(data_2017)
data_2018_preprocessed = preprocess_data(data_2018)
data_2019_preprocessed = preprocess_data(data_2019)
data_2020_preprocessed = preprocess_data(data_2020)
data_2021_preprocessed = preprocess_data(data_2021)

# perform linear interpolation for each year
data_2017_preprocessed = linear_interpolation(data_2017_preprocessed)
data_2018_preprocessed = linear_interpolation(data_2018_preprocessed)
data_2019_preprocessed = linear_interpolation(data_2019_preprocessed)
data_2020_preprocessed = linear_interpolation(data_2020_preprocessed)
data_2021_preprocessed = linear_interpolation(data_2021_preprocessed)


# Combine data from all quarters
data_combined = data_2017_preprocessed + data_2018_preprocessed + data_2019_preprocessed + data_2020_preprocessed + data_2021_preprocessed
print(type(data_combined))


# Extract attributes
attributes = list(data_combined[0].keys())

# Initialize a list to represent the matrix
matrix = []

# matrix conversion
for entry in data_combined:
    row = [entry[attr] for attr in attributes[2:]] # excludes 'year' and 'quarter' attributes
    matrix.append(row)

# display_matrix(matrix)

# perform data standardization before doing PCA & SVD 
standardized_data = z_score_standardization(matrix)
display_matrix(standardized_data)

num_components = 2  # Set the desired number of principal components
pca_result = pca(standardized_data, num_components) # perform PCA

print("\n\nPCA Result:")
display_matrix(pca_result)

<class 'list'>
   0.14   |   0.53   |0.003259382386424055|-0.06252327663467024|-0.020881677680633433|0.022734887627655645|-0.06936234604686263|-0.021460625441756115|-0.021453892997257893|0.022749673598383917|0.022087332575265496|0.021161474760888747|0.07091888000841777|0.021723614943488276|-0.0673037991176156|0.02237452472872079|0.023383172367977618|0.022748493031902906|0.008821507983960707|-0.02146877234809736|0.00845946899201447|-0.028042159572898264|0.010508989247314674|1.3848389024033827|-0.030875230736728366|-0.03393433396433728|-0.024818714881902102|-0.021969014055400558|-0.5009008133044031|-0.021460683374329582|-0.07985577848437338|-0.021456890346374668|-0.8150940340438491|-0.013239293762316891|0.027529848549243174|0.05096137360720327|-0.04531879613888555|-0.021182030685392667|0.0242133079608533|-0.05076359518698327|-0.022687922991604104|0.011853753066978995|0.010978544785754352|2.1530090937779844|-0.02783723602316123|-0.07808046100377672|0.16728291163605796|0.3744852090722345|-

2 minutes 5.8 seconds



In [25]:
import numpy as np
from sklearn.decomposition import PCA

# Function to load ARFF file into a list of dictionaries
def load_arff(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    data_start = lines.index('@data\n') + 1
    attributes = [line.split()[1] for line in lines if line.startswith('@attribute')]
    data_list = []

    for line in lines[data_start:]:
        values = line.strip().split(',')
        data_dict = {attr: val if val != 'm' else None for attr, val in zip(attributes, values)}
        data_list.append(data_dict)

    return data_list

def preprocess_data(data_list):
    for entry in data_list:
        for key, value in entry.items():
            if value == 'm':
                entry[key] = None

    return data_list

def linear_interpolation(data_list):
    for i in range(1, len(data_list) - 1):
        for key, value in data_list[i].items():
            if value is None and data_list[i - 1][key] is not None and data_list[i + 1][key] is not None:
                data_list[i][key] = (float(data_list[i - 1][key]) + float(data_list[i + 1][key])) / 2

    return data_list

# ... (other functions)

# Assuming standardized_data is already loaded and standardized
# Ensure that it's a numpy array for compatibility with scikit-learn
standardized_data_np = np.array(standardized_data)

# Set the desired number of principal components
num_components = 2

# Create a PCA object
pca_sklearn = PCA(n_components=num_components)

# Fit the PCA model and transform the data
pca_result_sklearn = pca_sklearn.fit_transform(standardized_data_np)

# Display the result using your custom display_matrix function
print("\nPCA Result (Scikit-Learn):")
display_matrix(pca_result_sklearn)

# Compare results
if np.allclose(pca_result, pca_result_sklearn, rtol=1e-5, atol=1e-8):
    print("\nResults are similar!")
else:
    print("\nResults are dissimilar!")

# Load and preprocess data for each year
file_path_2017 = r'.\datas\2017.arff'
file_path_2018 = r'.\datas\2018.arff'
file_path_2019 = r'.\datas\2019.arff'
file_path_2020 = r'.\datas\2020.arff'
file_path_2021 = r'.\datas\2021 Q1.arff'

data_2017 = load_arff(file_path_2017)
data_2018 = load_arff(file_path_2018)
data_2019 = load_arff(file_path_2019)
data_2020 = load_arff(file_path_2020)
data_2021 = load_arff(file_path_2021)

data_2017_preprocessed = preprocess_data(data_2017)
data_2018_preprocessed = preprocess_data(data_2018)
data_2019_preprocessed = preprocess_data(data_2019)
data_2020_preprocessed = preprocess_data(data_2020)
data_2021_preprocessed = preprocess_data(data_2021)

# Perform linear interpolation for each year
data_2017_preprocessed = linear_interpolation(data_2017_preprocessed)
data_2018_preprocessed = linear_interpolation(data_2018_preprocessed)
data_2019_preprocessed = linear_interpolation(data_2019_preprocessed)
data_2020_preprocessed = linear_interpolation(data_2020_preprocessed)
data_2021_preprocessed = linear_interpolation(data_2021_preprocessed)

# Combine data from all quarters
data_combined = data_2017_preprocessed + data_2018_preprocessed + data_2019_preprocessed + data_2020_preprocessed + data_2021_preprocessed
print(type(data_combined))

# Display the combined data using your custom display_data_table function
print("\nCombined Data:")
display_data_table(data_combined)


ModuleNotFoundError: No module named 'sklearn'