In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the dataset CSV (the path is relative to the current working directory)
# and preview the first rows.
csv_path = "./PCA_Dataset_Iris_Flowers.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Split the frame into labels (the 'species' column) and input features
# (every remaining column).
label_col = 'species'
y = df[label_col]                  # labels
X = df.drop(columns=[label_col])   # input features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the input features: fit the scaler on X and transform X in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Preview the first five standardized rows.
X_scaled[:5]


In [None]:
# Covariance matrix of the standardized data:
#   C = E[(X - E[X])(X - E[X])^T]
# The data is transposed to (n_features, n_samples) and passed with
# rowvar=True, i.e. every row of `features` is treated as one variable.
features = X_scaled.T
cov_matrix = np.cov(features, rowvar=True)

# Preview (up to) the first five rows of the covariance matrix.
cov_matrix[:5]

In [None]:
# Eigen-decomposition of the covariance matrix.
#
# The covariance matrix is symmetric, so np.linalg.eigh is used instead of
# np.linalg.eig: eigh returns real results with the eigenvalues in ascending
# order, whereas eig gives no ordering guarantee.
# The later projection cell takes the first two eigenvectors as the leading
# components, so the results are reversed to descending order here: column i of
# `vectors` is the eigenvector belonging to values[i], largest eigenvalue first.
# NOTE: the sign of an eigenvector is arbitrary, so projections may come out
# mirrored compared with another solver.
eig_values_asc, eig_vectors_asc = np.linalg.eigh(cov_matrix)
values = eig_values_asc[::-1]
vectors = eig_vectors_asc[:, ::-1]

print(values[:5])
print(vectors[:, :5])  # eigenvectors are the COLUMNS of `vectors`

In [None]:
# Explained variance of a principal component: the share of the total variance
# that the component accounts for. It is the component's eigenvalue divided by
# the sum of all eigenvalues, multiplied by 100 to give a percentage.

total_variance = np.sum(values)  # loop-invariant, so computed once
explained_variances = [(value / total_variance) * 100 for value in values]

print("explained variance of each principal component : ", explained_variances)

In [None]:
import matplotlib.pyplot as plt

# Bar plot of the explained variance per component: the x-axis is the component
# index and the y-axis is the percentage of explained variance.
#
# The number of bars is taken from `explained_variances` itself rather than
# hard-coded, so the cell keeps working if the number of input features changes.

n_components = len(explained_variances)

plt.figure(figsize=(5, 5))
plt.bar(range(n_components), explained_variances)
plt.xlabel("dimensions")
plt.ylabel("percentage of explained variance")
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Project the standardized features onto the first two eigenvectors.
# Column i of `vectors` (equivalently row i of `vectors.T`) is used as the
# i-th projection axis.
pc1_axis = vectors[:, 0]
pc2_axis = vectors[:, 1]
projected_1 = X_scaled @ pc1_axis
projected_2 = X_scaled @ pc2_axis

# Collect both projections in a data frame and attach the labels (y) as column 'Y'.
res = pd.DataFrame({'PC1': projected_1, 'PC2': projected_2}).assign(Y=y)

res.head()


In [None]:
# Scatter plot of the two projections, built with Seaborn's FacetGrid.
#
# hue="Y" colors the points by the label column of `res`, and height=6 sets the
# height of the plot. `map` draws PC1 against PC2 with Matplotlib's scatter
# function, and `add_legend` adds the legend for the hue levels.

import seaborn as sns

grid = sns.FacetGrid(res, hue="Y", height=6)
grid.map(plt.scatter, 'PC1', 'PC2')
grid.add_legend()
plt.show()