<a href="https://colab.research.google.com/github/proteus21/DATA-SCIENCE-STUDY/blob/main/Machine%20Learning/07_dimensional%20reduce/07_pca_math_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### UCZENIE NIENADZOROWANE /  UNSUPERVISED LEARNING

# Dimensionality reduction - PCA method

scikit-learn is a basic library for machine learning in Python.
To install the scikit-learn library, use the command below:
```
!pip install scikit-learn
```
To update to the latest version of the scikit-learn library, use the command below:
```
!pip install --upgrade scikit-learn
```

### Contents:
1. [Import libraries](#0)
2. [Data generation ](#1)
3. [Data visualization](#2)
4. [Standardization](#3)
5. [PCA implementation](#4)
6. [PCA scikit-learn](#5)


### <a name='0'></a> Import libraries

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's legacy global RNG so any random draws repeat between runs.
np.random.seed(42)
# Print arrays with 8 decimals, no scientific notation, and wide lines so
# small matrices stay on a single row.
np.set_printoptions(linewidth=200, edgeitems=5, suppress=True, precision=8)
# Scale seaborn's default font sizes up by 30 %.
sns.set(font_scale=1.3)


### <a name='1'></a> Data generation

In [None]:
from sklearn.datasets import load_iris


# Load the Iris dataset bundled with scikit-learn and pull out the feature
# matrix together with the numeric class labels.
raw_data = load_iris()
data, target = raw_data['data'], raw_data['target']

# Clean the column names: spaces become underscores, then the last five
# characters (the unit suffix, e.g. '_(cm)') are cut off.
feature_names = [label.replace(' ', '_')[:-5] for label in raw_data['feature_names']]

# Features and target side by side in one frame; stacking them into a single
# array makes the class codes floats, hence the float keys in the mapping.
df = pd.DataFrame(
    data=np.column_stack((data, target)),
    columns=feature_names + ['class'],
)
df['class'] = df['class'].map({
    0.0: 'setosa',
    1.0: 'versicolor',
    2.0: 'virginica',
})
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Interactive 3D view of three of the four measurements, one colour and one
# marker symbol per species.
px.scatter_3d(
    df,
    x='sepal_length',
    y='petal_length',
    z='petal_width',
    color='class',
    symbol='class',
    opacity=0.5,
    title="Iris data-visualisation 3D",
    template='plotly_dark',
    width=900,
    height=700,
)

### <a name='3'></a> Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Features used for PCA: the same three measurements plotted in the 3D scatter
# above. Selecting by label (instead of positions 0, 2, 3 and -1) keeps the
# cell correct even if the column order of `df` changes.
X = df[['sepal_length', 'petal_length', 'petal_width']]
y = df['class']

# Rescale every feature to zero mean and unit variance so that no single
# measurement dominates the covariance matrix.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std[:5]

array([[-0.90068117, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.34022653, -1.3154443 ],
       [-1.38535265, -1.39706395, -1.3154443 ],
       [-1.50652052, -1.2833891 , -1.3154443 ],
       [-1.02184904, -1.34022653, -1.3154443 ]])

### <a name='4'></a> PCA implementation

In [None]:
# Covariance matrix of the standardized features.
# rowvar=False: each COLUMN of X_std is a variable, each row an observation.
# np.cov divides by N-1 by default while StandardScaler scales with the
# population standard deviation, so the diagonal is N/(N-1) rather than 1.
cov_mat = np.cov(X_std, rowvar=False)
cov_mat


array([[1.00671141, 0.87760447, 0.82343066],
       [0.87760447, 1.00671141, 0.96932762],
       [0.82343066, 0.96932762, 1.00671141]])

In [None]:
# Eigen-decomposition of the covariance matrix.
# eig_vals[i] is the eigenvalue belonging to the eigenvector stored in the
# i-th COLUMN of eig_vecs (eig_vecs[:, i]).
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# One print call; the newline separator reproduces the blank line that two
# separate print statements would leave between the sections.
print(
    f'Eigenvalues:\n{eig_vals}\n',
    f'Eigenvectors:\n{eig_vecs}\n',
    sep='\n',
)

Eigenvalues:
[2.78833033 0.20075012 0.03105378]

Eigenvectors:
[[ 0.55964149  0.81270446  0.16221241]
 [ 0.59148855 -0.2546058  -0.76506024]
 [ 0.58046765 -0.52410624  0.62319335]]



In [49]:
# Pair each eigenvalue with its eigenvector and order the pairs from the
# largest eigenvalue to the smallest.
# eig_vecs.T iterates over the COLUMNS of eig_vecs, i.e. the eigenvectors.
eig_pairs = [
    (eig_val, eig_vec)
    for eig_val, eig_vec in zip(np.abs(eig_vals), eig_vecs.T)
]
# Sort on the eigenvalue only. Sorting the raw tuples would fall through to
# comparing the NumPy arrays whenever two eigenvalues are equal, which raises
# "ValueError: The truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eig_pairs

[(2.7883303296752913, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343807, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449190244, array([ 0.16221241, -0.76506024,  0.62319335]))]

In [50]:
# Fraction of the total variance explained by each component, ordered from
# the largest eigenvalue to the smallest.
total = sum(eig_vals)
explained_variance_ratio = [
    eig_val / total
    for eig_val in sorted(eig_vals, reverse=True)
]
explained_variance_ratio

[0.9232471536035964, 0.06647059464767172, 0.010282251748731881]

In [51]:
# Running total of the explained-variance ratios: entry k is the share of
# variance retained when keeping the first k+1 components.
cumulative_explained_variance = np.asarray(explained_variance_ratio).cumsum()
cumulative_explained_variance

array([0.92324715, 0.98971775, 1.        ])

In [52]:
# Summary table: per-component ratio, its running total, and a 1-based
# component number (row index + 1) to use as the plot's x axis.
results = (
    pd.DataFrame({'explained_variance_ratio': explained_variance_ratio})
    .assign(
        cumulative=lambda frame: frame['explained_variance_ratio'].cumsum(),
        component=lambda frame: frame.index + 1,
    )
)
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.923247,0.923247,1
1,0.066471,0.989718,2
2,0.010282,1.0,3


In [53]:
# Scree-style chart: bars show each component's explained-variance ratio,
# the line shows the cumulative total.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=results['component'],
        y=results['explained_variance_ratio'],
        name='explained variance ratio',
    )
)
fig.add_trace(
    go.Scatter(
        x=results['component'],
        y=results['cumulative'],
        name='cumulative explained variance',
    )
)
fig.update_layout(title='PCA - 3 components', width=950, template='plotly_dark')
fig.show()

In [54]:
# Re-display the (eigenvalue, eigenvector) pairs, sorted by descending
# eigenvalue, as a reference for building the projection matrix W below.
eig_pairs

[(2.7883303296752913, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343807, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449190244, array([ 0.16221241, -0.76506024,  0.62319335]))]

In [56]:
# Projection matrix W for 2 components: the two eigenvectors with the largest
# eigenvalues (the first two entries of the sorted eig_pairs), placed side by
# side as columns -> shape (n_features, 2).
# np.column_stack turns each 1-D eigenvector into a column on its own, so the
# number of features no longer has to be hard-coded in a reshape(3, 1).
W = np.column_stack([eig_vec for _, eig_vec in eig_pairs[:2]])
W

array([[ 0.55964149,  0.81270446],
       [ 0.59148855, -0.2546058 ],
       [ 0.58046765, -0.52410624]])

In [57]:
# Project the standardized samples onto the two principal axes held in W.
X_pca = X_std @ W
pca_df = pd.DataFrame(X_pca, columns=['pca_1', 'pca_2'])
# The sign of an eigenvector is arbitrary, so a component may come out
# mirrored. NOTE(review): this flip presumably aligns pca_2 with the
# orientation scikit-learn produces in the next section -- confirm.
pca_df['pca_2'] = -pca_df['pca_2']
pca_df['class'] = df['class']
pca_df

Unnamed: 0,pca_1,pca_2,class
0,-2.060360,-0.298674,setosa
1,-2.195981,-0.101727,setosa
2,-2.365221,0.080749,setosa
3,-2.365794,0.208165,setosa
4,-2.128171,-0.200201,setosa
...,...,...,...
145,1.906692,0.124424,virginica
146,1.262579,0.213420,virginica
147,1.541846,0.114404,virginica
148,1.634876,0.645735,virginica


In [59]:
# 2D scatter of the hand-computed principal components, coloured by species.
px.scatter(
    pca_df,
    x='pca_1',
    y='pca_2',
    color='class',
    template='plotly_dark',
    width=950,
)

### <a name='5'></a> PCA - scikit-learn

In [60]:
from sklearn.decomposition import PCA

# Same reduction done by scikit-learn: fit a 2-component PCA on the
# standardized features and project the samples in a single step.
# Note: this overwrites the X_pca / pca_df built by hand in the previous section.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)
pca_df = (
    pd.DataFrame(X_pca, columns=['pca_1', 'pca_2'])
    .assign(**{'class': df['class']})
)
pca_df

Unnamed: 0,pca_1,pca_2,class
0,-2.060360,-0.298674,setosa
1,-2.195981,-0.101727,setosa
2,-2.365221,0.080749,setosa
3,-2.365794,0.208165,setosa
4,-2.128171,-0.200201,setosa
...,...,...,...
145,1.906692,0.124424,virginica
146,1.262579,0.213420,virginica
147,1.541846,0.114404,virginica
148,1.634876,0.645735,virginica


In [61]:
# 2D scatter of the scikit-learn principal components, coloured by species.
px.scatter(
    pca_df,
    x='pca_1',
    y='pca_2',
    color='class',
    template='plotly_dark',
    width=950,
)
     