## Data Visualization Course - HCMUS
### Personal Lab 01 - Principal Component Analysis Visualization
##### Oct 23, 2024.
#### Class: 22_21
#### Student ID: 22120412.
#### Student Name: Nguyen Anh Tuong

### Load the libraries

In [1]:
#Load the library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_sample_image

### Section 1: **Study about PCA:** motivation, problem statement, PCA algorithms, numerical demo with Python.

In [None]:

# Numerical demo: run scikit-learn's PCA on a small random table and save the
# original features side by side with their principal-component scores.

# Fixed seed so the random table is the same on every run.
np.random.seed(0)

# 10 samples (rows) x 15 features (columns).
data = np.random.rand(10, 15)

columns = [f'Feature_{i}' for i in range(1, 16)]
df = pd.DataFrame(data, columns=columns)

# Standardize each feature before PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Keep the first five principal components.
pca = PCA(n_components=5)
pca_data = pca.fit_transform(scaled_data)

pca_df = pd.DataFrame(
    pca_data,
    columns=[f'Principal Component {i}' for i in range(1, 6)],
)

# Put the original features and the PCA scores in one table, row for row.
original_and_pca_df = pd.concat([df, pca_df], axis=1)

print(original_and_pca_df)

# NOTE: the file is tab-separated even though it is named ".csv".
original_and_pca_df.to_csv("output.csv", sep='\t', header=True)


In [None]:
#Read the dataset
def read_wine_csv(file_path):
    """Load a comma-separated file into a DataFrame and preview it.

    The first row of the file is used as the column header. The first rows
    of the loaded table are printed as a quick sanity check.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame holding the full contents of the file.
    """
    wine_frame = pd.read_csv(file_path, sep=',', header=0)
    preview = wine_frame.head()
    print(preview)
    return wine_frame

# Path of the wine CSV, given relative to the current working directory.
file_path = 'Wine.csv'

# Load the file once; the PCA cells further down read `wine_dataset`.
wine_dataset = read_wine_csv(file_path)

### Section 3: Applying PCA to my chosen dataset.

In [None]:
# Wine dataset: separate the class label from the feature columns.
y = wine_dataset['Customer_Segment']               # Label
X = wine_dataset.drop(columns='Customer_Segment')  # Data

# Standardize the features, then project them onto the first two
# principal components with scikit-learn.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Show the first 50 projected rows.
print(X_pca[:50])

# Scatter plot of the two components, colored by customer segment.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA on Wine Dataset')
plt.show()

### Subsection: Implementing PCA from Scratch with NumPy.

In [None]:
# PCA implemented step by step with NumPy on the wine features.

# Input processing: separate the class label from the feature columns.
X = wine_dataset.drop('Customer_Segment', axis=1)  # Data.
y = wine_dataset['Customer_Segment']  # Label.

# Center every feature on its column mean.
X_mean = np.sum(X, axis=0) / X.shape[0]
X_mean_standardized = X - X_mean

# Scale every feature to unit variance using the population standard
# deviation (ddof=0).
feature_std = X.std(axis=0, ddof=0)
# A constant feature has a standard deviation of 0. Dividing by it would fill
# the column with NaN and break the eigendecomposition below, so such features
# are divided by 1 instead and simply stay at 0 after centering.
feature_std = feature_std.where(feature_std != 0, 1.0)
X_std = X_mean_standardized / feature_std

# Covariance matrix of the standardized features (features x features).
cov_matrix = np.dot(X_std.T, X_std) / (X_std.shape[0] - 1)

# Eigendecomposition. `eigh` is the routine for symmetric matrices and
# returns the eigenvalues in ascending order.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so that the direction with the largest variance comes first.
desc_indexing = np.argsort(eigenvalues)[::-1]
desc_eigenvalues = eigenvalues[desc_indexing]
desc_eigenvectors = eigenvectors[:, desc_indexing]

# Keep the first two principal components (one eigenvector per column).
# The sign of an eigenvector is arbitrary, so an axis of this projection may
# come out mirrored compared with another PCA implementation.
needed_eignvectors = desc_eigenvectors[:, :2]
X_pca = np.dot(X_std, needed_eignvectors)

# Show the first 50 projected rows.
print(X_pca[:50])

# Scatter plot of the two components, colored by customer segment.
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA on Wine Dataset')
plt.show()

### Section 4: Study its application in data preprocessing (e.g., handling missing values)

In [None]:
def show_pca_compression(image_name, n_components=50):
    """Compress a scikit-learn sample image with PCA and plot before/after.

    The image is converted to grayscale by averaging its color channels.
    PCA is then fitted on the grayscale matrix, treating each pixel row as one
    sample, and the image is rebuilt from the reduced representation. Two
    figures are shown: the grayscale original and the reconstruction.

    Parameters
    ----------
    image_name : name passed to `load_sample_image`, e.g. "flower.jpg".
    n_components : number of principal components to keep.

    Returns
    -------
    tuple of (image, gray_image, pca_model, transformed, reconstructed).
    """
    image = load_sample_image(image_name)
    gray_image = np.mean(image, axis=2)

    plt.figure(figsize=(6, 6))
    plt.imshow(gray_image, cmap='gray')
    plt.title("Ảnh gốc")
    plt.show()

    pca_model = PCA(n_components=n_components)
    transformed = pca_model.fit_transform(gray_image)

    # Map the reduced representation back to pixel space.
    reconstructed = pca_model.inverse_transform(transformed)

    plt.figure(figsize=(6, 6))
    plt.imshow(reconstructed, cmap='gray')
    plt.title("Ảnh sau khi nén bằng PCA")
    plt.show()

    return image, gray_image, pca_model, transformed, reconstructed


# Flower.jpg
(flower, gray_flower, pca,
 transformed_data, reconstructed_image) = show_pca_compression("flower.jpg")

# ------------------------------------------------------------

# china.jpg
(china, gray_china, pca,
 transformed_data_china, reconstructed_image_china) = show_pca_compression("china.jpg")