In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
# create a dataset with 40 samples
np.random.seed(42)

mu_vec1 = np.array([0, 0, 0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample,columns=['feature1','feature2','feature3'])
df['target'] = 1

mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample,columns=['feature1','feature2','feature3'])

df1['target'] = 0

df = pd.concat([df, df1],ignore_index=True)

df = df.sample(40)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, 32 to 16
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   feature1  40 non-null     float64
 1   feature2  40 non-null     float64
 2   feature3  40 non-null     float64
 3   target    40 non-null     int64  
dtypes: float64(3), int64(1)
memory usage: 1.6 KB


In [23]:
df.head()

Unnamed: 0,feature1,feature2,feature3,target
32,1.29612,1.261055,1.005113,0
36,1.25755,0.925554,-0.918771,0
35,1.404051,2.886186,1.174578,0
22,0.92799,2.003533,1.361636,0
11,-1.057711,0.822545,-1.220844,1


In [25]:
import plotly.express as px
#y_train_trf = y_train.astype(str)
fig = px.scatter_3d(df, x=df['feature1'], y=df['feature2'], z=df['feature3'],
              color=df['target'].astype('str'))
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))



ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [26]:
# Step 1 - Apply standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])

In [27]:
# Step 2 - Find Covariance Matrix
covariance_matrix = np.cov([df.iloc[:,0],df.iloc[:,1],df.iloc[:,2]])
print('Covariance Matrix:\n', covariance_matrix)

Covariance Matrix:
 [[1.02564103 0.35175    0.07987862]
 [0.35175    1.02564103 0.28328008]
 [0.07987862 0.28328008 1.02564103]]


In [28]:
# Step 3 - Finding EV and EVs
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

In [29]:
eigen_values, eigen_vectors

(array([1.51825759, 0.94771609, 0.6109494 ]),
 array([[-0.56014874, -0.62132881,  0.5478904 ],
        [-0.67561935, -0.04005698, -0.73616162],
        [-0.47934526,  0.78252536,  0.3973439 ]]))

In [30]:
%pylab inline

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from matplotlib.patches import FancyArrowPatch


class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)

fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111, projection='3d')

ax.plot(df['feature1'], df['feature2'], df['feature3'], 'o', markersize=8, color='blue', alpha=0.2)
ax.plot([df['feature1'].mean()], [df['feature2'].mean()], [df['feature3'].mean()], 'o', markersize=10, color='red', alpha=0.5)
for v in eigen_vectors.T:
    a = Arrow3D([df['feature1'].mean(), v[0]], [df['feature2'].mean(), v[1]], [df['feature3'].mean(), v[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')

plt.title('Eigenvectors')

plt.show()
     

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


AttributeError: 'Arrow3D' object has no attribute 'do_3d_projection'

<Figure size 700x700 with 1 Axes>

In [31]:
pc = eigen_vectors[0:2]
pc

array([[-0.56014874, -0.62132881,  0.5478904 ],
       [-0.67561935, -0.04005698, -0.73616162]])

In [32]:
transformed_df = np.dot(df.iloc[:,0:3],pc.T)
# 40,3 - 3,2
new_df = pd.DataFrame(transformed_df,columns=['PC1','PC2'])
new_df['target'] = df['target'].values
new_df.head()
     

Unnamed: 0,PC1,PC2,target
0,-0.730348,-1.19579,0
1,-1.406492,0.025739,0
2,-1.556794,-1.45031,0
3,-0.666888,-1.103748,0
4,0.250435,2.314996,1


In [34]:
new_df['target'] = new_df['target'].astype('str')
fig = px.scatter(x=new_df['PC1'],
                 y=new_df['PC2'],
                 color=new_df['target'],
                 color_discrete_sequence=px.colors.qualitative.G10
                )

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

#### further code

https://www.kaggle.com/code/premkumar001/pca-demo-1/edit