PCA

In [1]:
# =====================================
# PCA on a Normal Dataset (customers.csv)
# =====================================

# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# ---------------------------
# 1. LOAD DATA
# ---------------------------
# NOTE(review): the absolute '/content/...' path looks like a Colab upload
# location -- adjust it when running elsewhere.
data = pd.read_csv('/content/customers.csv')
print("First 5 rows:")
print(data.head())

# ---------------------------
# 2. DATA CLEANING
# ---------------------------
# Drop exact duplicate rows, then impute missing numeric values with the
# column mean.
# numeric_only=True matters: on pandas >= 2.0, DataFrame.mean() raises a
# TypeError if the frame contains text columns. fillna() with a Series only
# fills the columns named in its index, so non-numeric columns are left as-is.
data = data.drop_duplicates()
data = data.fillna(data.mean(numeric_only=True))

# ---------------------------
# 3. FEATURE SELECTION
# ---------------------------
# The target is selected by name; every other column is a candidate feature.
y = data['Spending_Score']
candidate_features = data.drop('Spending_Score', axis=1)

# StandardScaler and PCA operate on numeric arrays only, so keep just the
# numeric/bool columns and report anything that had to be left out instead of
# failing later with "could not convert string to float".
X = candidate_features.select_dtypes(include=['number', 'bool'])
excluded_columns = [
    column for column in candidate_features.columns if column not in X.columns
]
if excluded_columns:
    print("Non-numeric columns excluded from PCA:", excluded_columns)

# ---------------------------
# 4. SCALING (Important!)
# ---------------------------
# PCA is variance-based, so features are standardised (zero mean, unit
# variance) to stop large-scale columns from dominating the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 5. APPLY PCA
# ---------------------------
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)

print("\nExplained Variance Ratio:", pca.explained_variance_ratio_)
print("Total Variance Captured:", sum(pca.explained_variance_ratio_))

# ---------------------------
# 6. VISUALIZATION
# ---------------------------
# Scatter the samples in the plane of the first two principal components,
# coloured by the value of the 'Spending_Score' column.
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA on Normal Dataset')
plt.colorbar(label='Target Class')
plt.show()