In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
# 1. Data: Generate random data
# --------------------------
np.random.seed(42)
# Create a dataset with 100 samples and 5 features
data = np.random.rand(100, 5)
df = pd.DataFrame(data, columns=[f"feature_{i}" for i in range(1, 6)])
print("Sample Data:\n", df.head())

Sample Data:
    feature_1  feature_2  feature_3  feature_4  feature_5
0   0.374540   0.950714   0.731994   0.598658   0.156019
1   0.155995   0.058084   0.866176   0.601115   0.708073
2   0.020584   0.969910   0.832443   0.212339   0.181825
3   0.183405   0.304242   0.524756   0.431945   0.291229
4   0.611853   0.139494   0.292145   0.366362   0.456070


In [3]:
# 2. Preprocessing: Standardize the data
# --------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

In [4]:
# 3. Train-Test Split: Split the data into training and testing sets
# --------------------------
X_train, X_test = train_test_split(scaled_data, test_size=0.2, random_state=42)
print("\nTrain set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Train set shape: (80, 5)
Test set shape: (20, 5)


In [5]:
# 4. Train Base Model (PCA): Fit PCA on the training data
# --------------------------
# Let's reduce to 2 principal components
n_components = 2
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
print("\nExplained Variance Ratio:", pca.explained_variance_ratio_)


Explained Variance Ratio: [0.28687079 0.22520992]


In [6]:
# 5. Evaluate: Transform test data and calculate reconstruction error
# --------------------------
X_test_pca = pca.transform(X_test)
# Reconstruct the test data from the PCA components
X_test_reconstructed = pca.inverse_transform(X_test_pca)
# Compute the mean squared reconstruction error
reconstruction_error = np.mean((X_test - X_test_reconstructed) ** 2)
print("\nReconstruction Error on Test Set:", reconstruction_error)


Reconstruction Error on Test Set: 0.43472748733459565


In [8]:
# 6. Predict New Data: Apply PCA to new random data
# --------------------------
# Create the new data as a DataFrame with the same feature names as the original data
new_data_df = pd.DataFrame(new_data, columns=[f"feature_{i}" for i in range(1, 6)])
# Standardize the new data
new_data_scaled = scaler.transform(new_data_df)
# Apply PCA transformation
new_data_pca = pca.transform(new_data_scaled)
print("\nNew Data (PCA-transformed):\n", new_data_pca)


New Data (PCA-transformed):
 [[ 1.38263021  0.31812199]
 [-0.39891812  1.97544428]
 [-0.21922292 -0.47125882]
 [-0.50097225  0.90058394]
 [ 0.51726726  1.51414827]]
