## PCA sklearn - real data

In [None]:
# pandas and numpy imports
import pandas as pd
import numpy as np

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

# import PCA
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# apply seaborn's default plot theme (no pandas display options are changed here)
sns.set_theme()


## Iris data demo

In [None]:
# Fetch the bundled iris data
iris = load_iris()

# Measurements only, one named column per feature
numeric_iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

# Same measurements plus the species name looked up through the target index
iris_df = numeric_iris_df.assign(label=iris['target_names'][iris['target']])


In [None]:
# Standardise every feature with a fitted StandardScaler.
# Note: numeric_iris_df is rebound to the transformer's output from here on.
standard_scaler = StandardScaler().fit(numeric_iris_df)
numeric_iris_df = standard_scaler.transform(numeric_iris_df)

In [None]:
# Fit PCA with default settings, then project the same data onto the components
pca = PCA().fit(numeric_iris_df)
pca_transformed_data = pca.transform(numeric_iris_df)

In [None]:
# Dimensions of the projected data
pca_transformed_data.shape

In [None]:
# Share of the total variance captured by each component, as a percentage
[ratio * 100 for ratio in pca.explained_variance_ratio_]

In [None]:
# Keep the first two components and attach the species label for plotting
iris_df_pca = pd.DataFrame(
    pca_transformed_data[:, :2],
    columns=['PC1', 'PC2'],
).assign(label=iris_df['label'])

In [None]:
# Scatter of the first two principal components, coloured by species.
# The axis percentages are read from the fitted model rather than typed in by
# hand, so they always match the variance ratios of the current fit.
# NOTE: assumes `pca` still holds the iris fit (it is rebound for wine later).
pc1_pct, pc2_pct = 100 * pca.explained_variance_ratio_[:2]

sns.scatterplot(iris_df_pca, x='PC1', y='PC2', hue='label')
plt.xlabel(f'PC1 ({pc1_pct:.1f} % variance)')
plt.ylabel(f'PC2 ({pc2_pct:.1f} % variance)')
plt.title('Iris dataset')
# Equal aspect keeps one unit the same length on both axes
plt.gca().set_aspect('equal', adjustable='box')

In [None]:
# Component matrix of the fitted PCA (row = component, column = input feature)
pca.components_

In [None]:
# First rows of the labelled iris frame, flipped so features read as rows
iris_df.head().transpose()

In [None]:
# Biplot: samples in the PC1/PC2 plane plus one arrow per original feature.
# NOTE: assumes `pca` still holds the iris fit (it is rebound for wine later).
plt.figure(figsize=(8, 8))
sns.scatterplot(iris_df_pca, x='PC1', y='PC2', hue='label')

# Only the measured features have a column in pca.components_; the 'label'
# column of iris_df does not, so drop it explicitly instead of relying on
# zip() silently truncating the longer sequence.
feat_names = iris_df.columns.drop('label')
colors = ['red', 'black', 'purple', 'brown']

# Rows are components, columns follow the feature order of the fitted data.
loadings = pca.components_

# One arrow per feature from the origin: x = weight on PC1, y = weight on PC2.
for feat_name, color, pc1_weight, pc2_weight in zip(
    feat_names, colors, loadings[0], loadings[1]
):
    plt.quiver(
        0, 0,
        pc1_weight,
        pc2_weight,
        angles='xy', scale_units='xy',
        scale=1, color=color,
        label=feat_name,
        width=0.002,
    )

# Axis percentages come from the fitted model rather than hand-typed numbers.
pc1_pct, pc2_pct = 100 * pca.explained_variance_ratio_[:2]
plt.xlabel(f'PC1 ({pc1_pct:.1f} % variance)')
plt.ylabel(f'PC2 ({pc2_pct:.1f} % variance)')
plt.legend()
plt.gca().set_aspect('equal', adjustable='box')
plt.title('Iris dataset')


### Iris - differences between solvers

In [None]:
# Fit one PCA per SVD solver on the same iris data, in a fixed order
pca_full, pca_arpack, pca_random = (
    PCA(svd_solver=solver).fit(numeric_iris_df)
    for solver in ('full', 'arpack', 'randomized')
)

In [None]:
# Variance ratios per solver, printed in the order full / arpack / randomized
for fitted_pca in (pca_full, pca_arpack, pca_random):
    print(fitted_pca.explained_variance_ratio_)

In [None]:
# Component matrices per solver, printed in the order full / arpack / randomized
for fitted_pca in (pca_full, pca_arpack, pca_random):
    print(fitted_pca.components_)

## Wine dataset

The Wine dataset is a classic multivariate dataset used for classification tasks in machine learning. 
- It consists of 178 samples of wine from three different cultivars (classes) in the same region in Italy. 
- Each sample has 13 continuous attributes (features) that are the result of a chemical analysis of the wines. 
- The goal of using this dataset is usually to build a classifier that can predict the cultivar of a wine based on its chemical composition.

The dataset contains the following features:

- **Alcohol**: The alcohol content in the wine, measured in percentage.
- **Malic Acid**: The amount of malic acid in the wine, affecting its tartness.
- **Ash**: The measure of the non-aqueous residue remaining after heating, affecting wine stability.
- **Alcalinity of Ash**: A measure of the alkalinity of the ash formed post-combustion, impacting wine's acidity.
- **Magnesium**: The amount of magnesium in the wine, a cofactor in enzyme reactions.
- **Total Phenols**: The total amount of phenolic compounds, contributing to taste and mouthfeel.
- **Flavanoids**: A subset of phenolic compounds known to have antioxidant properties.
- **Nonflavanoid Phenols**: Phenolic compounds that are not flavanoids, contributing to color and flavor.
- **Proanthocyanins**: A class of phenolic compounds affecting color and astringency.
- **Color Intensity**: The intensity of the wine's color, measured optically.
- **Hue**: The color attribute that describes a pure color, usually measured via spectrophotometry.
- **OD280/OD315 of diluted wines**: The absorbance ratio at 280 and 315 nm, indicating protein content and wine stability.
- **Proline**: The amount of the amino acid proline, often linked to wine quality.

In [None]:
from sklearn.datasets import load_wine

# Fetch the bundled wine data
wine_data = load_wine()

# Feature values with one named column per feature
wine_df_numeric = pd.DataFrame(
    data=wine_data['data'],
    columns=wine_data['feature_names'],
)

# Same features plus the target value of each sample as 'label'
wine_df = wine_df_numeric.assign(label=wine_data['target'])

In [None]:
# Dimensions of the labelled wine frame (features plus the 'label' column)
wine_df.shape

In [None]:
# Names of the target classes shipped with the dataset
target_names = wine_data['target_names']
target_names

In [None]:
# Preview the first rows of the labelled wine frame
wine_df.head()

In [None]:
# Missing-value count per column (transposing the resulting Series is a no-op)
wine_df.isna().sum()

In [None]:
# Summary statistics, flipped so each feature is a row
wine_df.describe().transpose()

In [None]:
# Histogram of each feature column (one subplot per feature, not feature pairs)
_ = wine_df_numeric.hist(figsize=(9,9))
plt.tight_layout()


In [None]:
# Box plot of every feature column side by side on one axis
plt.figure(figsize=(9, 7))
g = sns.boxplot(data=wine_df_numeric)
g.set_title('Wine features')
# Rotate the feature names so they do not overlap
_ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

### Run PCA without scaling

In [None]:
# Fit PCA on the wine features as they are, without scaling them first
pca = PCA().fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

# Share of the total variance captured by each component, as a percentage
[ratio * 100 for ratio in pca.explained_variance_ratio_]

In [None]:
# Per-feature weights of the first component, multiplied by 100 for readability
[weight * 100 for weight in pca.components_[0]]

In [None]:
# First two components with the class label attached
wine_df_pca = pd.DataFrame(
    pca_transformed_data[:, :2],
    columns=['PC1', 'PC2'],
).assign(label=wine_df['label'])

# Scatter in the PC1/PC2 plane, coloured by class
plt.figure(figsize=(20, 20))
sns.scatterplot(data=wine_df_pca, x='PC1', y='PC2', hue='label')
plt.title('Top 2 PCs - unscaled data')
plt.gca().set_aspect('equal', adjustable='box')

In [None]:
# Proline on its own against the class label, drawn as a wide, short strip
plt.figure(figsize=(20, 2))

sns.scatterplot(wine_df, x='proline', y='label', hue='label')
# Title spelling corrected: these are wine classes
plt.title('Proline feature vs wine types')

### Run PCA with scaling

In [None]:
# Standardise the wine features and wrap the result back into a DataFrame that
# reuses the original column names. Note: this rebinds wine_df_numeric, so the
# cells that follow see the scaled values.
standard_scaler = StandardScaler().fit(wine_df_numeric)
wine_df_numeric_scaled = standard_scaler.transform(wine_df_numeric)
wine_df_numeric = pd.DataFrame(
    data=wine_df_numeric_scaled,
    columns=wine_df_numeric.columns,
)

# Box plot of every scaled feature side by side on one axis
plt.figure(figsize=(9, 7))
g = sns.boxplot(data=wine_df_numeric)
g.set_title('Wine scaled features')
_ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
# Histogram of each feature column (one subplot per feature, not feature pairs)
_ = wine_df_numeric.hist(figsize=(9,9))
plt.tight_layout()

In [None]:
# Refit PCA on the current wine_df_numeric and project the data onto it
pca = PCA().fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

In [None]:
# Share of the total variance captured by each component, as a percentage
[ratio * 100 for ratio in pca.explained_variance_ratio_]

In [None]:
# Per-feature weights of the first component, multiplied by 100 for readability
[weight * 100 for weight in pca.components_[0]]

In [None]:
# Keep the first two components and attach the class label for plotting
wine_df_pca = pd.DataFrame(
    pca_transformed_data[:, :2],
    columns=['PC1', 'PC2'],
).assign(label=wine_df['label'])

In [None]:
# Projection onto the first two components, coloured by class
plt.figure(figsize=(7, 7))
sns.scatterplot(wine_df_pca, x='PC1', y='PC2', hue='label')

# Axis percentages are read from the fitted model rather than typed in by hand.
# The y axis shows the second component, so it is labelled PC2.
pc1_pct, pc2_pct = 100 * pca.explained_variance_ratio_[:2]
plt.xlabel(f'PC1 [{pc1_pct:.0f} % variance]')
plt.ylabel(f'PC2 [{pc2_pct:.0f} % variance]')
