<a href="https://colab.research.google.com/github/praveendhac/aiml-iisc-assignments/blob/main/assignment2/120261627_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Principal Component Analysis

In [13]:
import pandas as pd
from scipy import stats
import missingno as msno
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the dataset and print a quick structural overview of it.
df_dataset = pd.read_csv('Dataset.csv')
print('Shape:', df_dataset.shape)
print('Head:', df_dataset.head().to_string())

# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" after the report.
print('Info: ')
df_dataset.info()

# describe is a method: it has to be called to obtain the summary statistics
# (printing the bare attribute only shows the bound-method repr).
print('Describe: ', df_dataset.describe())

print('Columns: ', df_dataset.columns)
print('Total Columns: ', len(df_dataset.columns))

Shape: (21263, 82)
Head:    number_of_elements  mean_atomic_mass  wtd_mean_atomic_mass  gmean_atomic_mass  wtd_gmean_atomic_mass  entropy_atomic_mass  wtd_entropy_atomic_mass  range_atomic_mass  wtd_range_atomic_mass  std_atomic_mass  wtd_std_atomic_mass  mean_fie  wtd_mean_fie   gmean_fie  wtd_gmean_fie  entropy_fie  wtd_entropy_fie  range_fie  wtd_range_fie     std_fie  wtd_std_fie  mean_atomic_radius  wtd_mean_atomic_radius  gmean_atomic_radius  wtd_gmean_atomic_radius  entropy_atomic_radius  wtd_entropy_atomic_radius  range_atomic_radius  wtd_range_atomic_radius  std_atomic_radius  wtd_std_atomic_radius  mean_Density  wtd_mean_Density  gmean_Density  wtd_gmean_Density  entropy_Density  wtd_entropy_Density  range_Density  wtd_range_Density  std_Density  wtd_std_Density  mean_ElectronAffinity  wtd_mean_ElectronAffinity  gmean_ElectronAffinity  wtd_gmean_ElectronAffinity  entropy_ElectronAffinity  wtd_entropy_ElectronAffinity  range_ElectronAffinity  wtd_range_ElectronAffinity  std_El

`StandardScaler()` performs Z-score normalization. For every data point in a column, it applies the following formula:

$$z = \frac{x - \mu}{\sigma}$$

- $z$ is the standardized value
- $x$ is the original value
- $\mu$ is the mean of the column
- $\sigma$ is the standard deviation of the column

After this transformation, every column in your dataset will have a mean of 0 and a standard deviation of 1. If you don't use StandardScaler, PCA will be biased toward the variables with the largest variance, which are usually the ones measured on the largest numeric scale (e.g., "Income in Dollars" vs "Age in Years"), giving you a misleading component count.

In [10]:
# Q. How many principal components are required to explain 90% of the total variance?

# Z-score every column first so that no feature dominates PCA through scale alone.
scaler = StandardScaler()
scaler.fit(df_dataset)
scaled_df = scaler.transform(df_dataset)

# A fractional n_components asks scikit-learn to keep just enough components
# to reach that share of the total variance.
pca = PCA(n_components=0.90).fit(scaled_df)

print(f"Number of components to explain 90% variance: {pca.n_components_}")

Number of components to explain 90% variance: 12


In [19]:
# Cross-check of the 90% answer using the cumulative explained-variance curve.

# Numeric columns only, with rows containing missing values removed.
X = (
    df_dataset
    .select_dtypes(include='number')
    .dropna()
)

# Z-score the features.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit a full PCA (all components kept) so the whole variance spectrum is available.
pca = PCA().fit(X_scaled)

# Running total of the per-component variance ratios.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# argmax on the boolean mask gives the first index that reaches the threshold;
# adding 1 converts that zero-based index into a component count.
n_components_90 = (cumulative_variance >= 0.90).argmax() + 1

print("Number of components explaining 90% variance:", n_components_90)

Number of components explaining 90% variance: 12


In [16]:
# Q. If you reduce the dataset from 'n' features to 10 principal components, what percentage of variance is lost?

# Standardize the raw data with a scaler that is local to this cell.
# Previously this cell re-standardized the already standardized `scaled_df`
# using a `scaler` object left over from another cell: that was redundant,
# overwrote the scaler's fitted statistics, and only worked when the cells
# happened to be executed in a particular order.
scaled_data = StandardScaler().fit_transform(df_dataset)

# Keep only the first 10 principal components.
pca_10 = PCA(n_components=10)
pca_10.fit(scaled_data)

print("explained_variance_ratio_: ", pca_10.explained_variance_ratio_, "len:", len(pca_10.explained_variance_ratio_))

# Share of the total variance captured by the 10 retained components, in percent.
variance_retained = pca_10.explained_variance_ratio_.sum() * 100

# Whatever is not retained is lost by the reduction.
variance_lost = 100 - variance_retained

print(f"Variance Retained: {variance_retained:.2f}%")
print(f"Variance Lost: {variance_lost:.2f}%")


explained_variance_ratio_:  [0.38992711 0.10393016 0.09452416 0.07822089 0.05845559 0.03784435
 0.03579677 0.03070171 0.02341304 0.01953779] len: 10
Variance Retained: 87.24%
Variance Lost: 12.76%


In [17]:
# Q. What are the eigenvalues corresponding to the first 5 Principal Components (PCs) obtained from the PCA analysis?

# Z-score the columns so the eigenvalues are computed on comparable scales.
scaler = StandardScaler()
scaled_df5 = scaler.fit(df_dataset).transform(df_dataset)

# Only the five leading components are needed for this question.
pca = PCA(n_components=5).fit(scaled_df5)

# scikit-learn exposes the eigenvalues of the covariance matrix of the fitted
# data as `explained_variance_`, largest first.
eigenvalues = pca.explained_variance_
print("Eigen Values: ", eigenvalues)

# One line per component, labelled PC1..PC5.
print("Eigenvalues for the first 5 Principal Components:")
for i, eig in enumerate(eigenvalues):
    print(f"PC{i+1}: {eig:.4f}")

Eigen Values:  [31.97552647  8.52267362  7.7513456   6.41441455  4.79358382]
Eigenvalues for the first 5 Principal Components:
PC1: 31.9755
PC2: 8.5227
PC3: 7.7513
PC4: 6.4144
PC5: 4.7936


In [21]:

# Cross-check of the five leading eigenvalues on the numeric-only, NaN-free data.

# Numeric columns only, with rows containing missing values removed.
X5 = (
    df_dataset
    .select_dtypes(include='number')
    .dropna()
)

# Z-score the features.
scaler = StandardScaler()
X5_scaled = scaler.fit(X5).transform(X5)

# Fit PCA restricted to the first five components.
pca5 = PCA(n_components=5).fit(X5_scaled)

# Eigenvalues (variance explained by each retained component), largest first.
eigenvalues5 = pca5.explained_variance_

print("Five Eigen Values:", eigenvalues5)

Five Eigen Values: [31.97552647  8.52267362  7.7513456   6.41441455  4.79358382]


In [18]:
# Q. What is the percentage of variance explained by the first two principal components?

# Z-score the columns before PCA.
scaler = StandardScaler()
scaled_df2 = scaler.fit(df_dataset).transform(df_dataset)

# Only the two leading components are required here.
pca = PCA(n_components=2).fit(scaled_df2)

# Convert the per-component variance ratios (fractions) into percentages.
vratios = pca.explained_variance_ratio_
pc1_var, pc2_var = vratios[0] * 100, vratios[1] * 100
total_2_pc = vratios.sum() * 100

print(f"Variance explained by PC1: {pc1_var:.2f}%")
print(f"Variance explained by PC2: {pc2_var:.2f}%")
print(f"Total variance explained by first two PCs: {total_2_pc:.2f}%")

Variance explained by PC1: 38.99%
Variance explained by PC2: 10.39%
Total variance explained by first two PCs: 49.39%
