# 2D&3D PCA, t-SNE, and UMAP on Wine Dataset

In this notebook I explore three dimensionality reduction techniques: PCA, t-SNE, and UMAP. Although the methods differ, they all aim to reduce the dimensionality of the data by replacing the original features with just two or three derived components.

# Importing Libraries

In [None]:
# Importing all necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Data

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Location of the Wine CSV inside the Kaggle input directory.
data_path = "/kaggle/input/wineuci/Wine.csv"

# Descriptive column names: the class label first, then the 13 features.
columns = [
    'class',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]

# header=0 treats the first row of the file as a header and replaces it with `columns`.
# NOTE(review): if Wine.csv has no header row, this silently drops the first sample - confirm.
df = pd.read_csv(data_path, header=0, names=columns)

In [None]:
# Preview the first five rows to confirm the column names were applied.
df.head(n=5)

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns 1-13 hold the features (column 0 is the class label). Standardize each
# feature to zero mean and unit variance so that none dominates purely because of
# its measurement scale. The results are floats and are not clipped to any range.
feature_values = df.iloc[:, 1:14].values
X = StandardScaler().fit_transform(feature_values)

In [None]:
# Shape of the standardized feature matrix as (rows, columns).
X.shape

In [None]:
# Preview only the first five standardized rows instead of dumping the whole array.
X[:5]

# 2D Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardized features and keep the projected
# coordinates (one row per sample, one column per principal component).
pca = PCA(n_components=2)
principalComponents1 = pca.fit_transform(X)

In [None]:
# Preview only the first five projected rows instead of dumping the whole array.
principalComponents1[:5]

In [None]:
# Wrap the 2-D PCA scores in a DataFrame with one named column per component.
pca_2d_columns = ['component1', 'component2']
PCA_dataset1 = pd.DataFrame(principalComponents1, columns=pca_2d_columns)
PCA_dataset1.head()

In [None]:
# Keep each 2-D PCA component as its own Series for convenient access.
principal_component1, principal_component2 = (
    PCA_dataset1['component1'],
    PCA_dataset1['component2'],
)

In [None]:
# Scatter plot of the samples in the plane of the two principal components.
# Exactly one figure is created; a bare plt.figure() call beforehand would only
# add an empty, unused figure to the cell output.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(PCA_dataset1['component1'], PCA_dataset1['component2'])
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('2 Component PCA')
plt.show()

# 3D Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Fit a three-component PCA on the standardized features and keep the projected
# coordinates. Note that `pca` is rebound here, so from this cell on it refers to
# the three-component model rather than the two-component one.
pca = PCA(n_components=3)
principalComponents2 = pca.fit_transform(X)

In [None]:
# Preview only the first five projected rows instead of dumping the whole array.
principalComponents2[:5]

In [None]:
# Wrap the 3-D PCA scores in a DataFrame with one named column per component.
# NOTE: despite the names component3..component5, these columns hold the first,
# second and third components of the three-component fit.
pca_3d_columns = ['component3', 'component4', 'component5']
PCA_dataset2 = pd.DataFrame(principalComponents2, columns=pca_3d_columns)
PCA_dataset2.head()

In [None]:
# Keep each 3-D PCA component as its own Series; the 3-D plot uses these directly.
principal_component3, principal_component4, principal_component5 = (
    PCA_dataset2['component3'],
    PCA_dataset2['component4'],
    PCA_dataset2['component5'],
)

In [None]:
# 3-D scatter plot of the three PCA components.
# Figure.gca() no longer accepts keyword arguments such as projection= in
# Matplotlib 3.6+, so the 3-D axes are created with add_subplot instead.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.set_title('3D Principal Component Analysis (PCA)')
ax.scatter(
    xs=principal_component3,
    ys=principal_component4,
    zs=principal_component5,
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

# 2D T-distributed Stochastic Neighbour Embedding

In [None]:
# Implementing t-SNE.
import inspect

from sklearn.manifold import TSNE

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` (the old name
# was removed later), so pass whichever keyword the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# t-SNE is stochastic: a fixed random_state makes the embedding reproducible
# across kernel restarts.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,
    **{tsne_iter_kwarg: 300},
)
tsne_results1 = tsne.fit_transform(X)

In [None]:
# Preview only the first five embedded rows instead of dumping the whole array.
tsne_results1[:5]

In [None]:
# Wrap the 2-D t-SNE embedding in a DataFrame with one named column per dimension.
tsne_2d_columns = ['component1', 'component2']
tsne_dataset1 = pd.DataFrame(tsne_results1, columns=tsne_2d_columns)
tsne_dataset1.head()

In [None]:
# Keep each 2-D t-SNE dimension as its own Series; the plot uses these directly.
tsne_component1, tsne_component2 = (
    tsne_dataset1['component1'],
    tsne_dataset1['component2'],
)

In [None]:
# Scatter plot of the samples in the two-dimensional t-SNE embedding.
# Exactly one figure is created; a bare plt.figure() call beforehand would only
# add an empty, unused figure to the cell output.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(tsne_component1, tsne_component2)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('2 Component TSNE')
plt.show()

# 3D T-distributed Stochastic Neighbour Embedding

In [None]:
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` (the old name
# was removed later), so pass whichever keyword the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# Three-dimensional t-SNE embedding. t-SNE is stochastic, so random_state is
# fixed to make the result reproducible across kernel restarts.
tsne = TSNE(
    n_components=3,
    verbose=1,
    perplexity=40,
    random_state=42,
    **{tsne_iter_kwarg: 300},
)
tsne_results2 = tsne.fit_transform(X)

In [None]:
# Preview only the first five embedded rows instead of dumping the whole array.
tsne_results2[:5]

In [None]:
# Wrap the 3-D t-SNE embedding in a DataFrame with one named column per dimension.
# NOTE: despite the names component3..component5, these columns hold the first,
# second and third dimensions of the three-dimensional embedding.
tsne_3d_columns = ['component3', 'component4', 'component5']
tsne_dataset2 = pd.DataFrame(tsne_results2, columns=tsne_3d_columns)
tsne_dataset2.head()

In [None]:
# Keep each 3-D t-SNE dimension as its own Series; the 3-D plot uses these directly.
tsne_component3, tsne_component4, tsne_component5 = (
    tsne_dataset2['component3'],
    tsne_dataset2['component4'],
    tsne_dataset2['component5'],
)

In [None]:
# 3-D scatter plot of the three-dimensional t-SNE embedding.
# Figure.gca() no longer accepts keyword arguments such as projection= in
# Matplotlib 3.6+, so the 3-D axes are created with add_subplot instead.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.set_title('3D T-distributed Stochastic Neighbor Embedding (TSNE)')
ax.scatter(
    xs=tsne_component3,
    ys=tsne_component4,
    zs=tsne_component5,
)
ax.set_xlabel('tsne-one')
ax.set_ylabel('tsne-two')
ax.set_zlabel('tsne-three')
plt.show()

# 2D Uniform Manifold Approximation and Projection 

In [None]:
# Implementing UMAP.
import umap

# n_components is not set, so the library default applies (2 according to the
# umap-learn documentation). UMAP is stochastic, so random_state is fixed to
# make the embedding reproducible across kernel restarts.
embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0.3,
    metric='correlation',
    random_state=42,
).fit_transform(X)

In [None]:
# Split the 2-D UMAP embedding into one array per output dimension.
umap_component1, umap_component2 = embedding[:, 0], embedding[:, 1]

In [None]:
# Scatter plot of the samples in the two-dimensional UMAP embedding.
# Exactly one figure is created; a bare plt.figure() call beforehand would only
# add an empty, unused figure to the cell output.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(umap_component1, umap_component2)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('2 Component UMAP')
plt.show()

# 3D Uniform Manifold Approximation and Projection 

In [None]:
import umap

# Three-dimensional UMAP embedding with the same neighbourhood settings as the
# 2-D run. UMAP is stochastic, so random_state is fixed to make the embedding
# reproducible across kernel restarts.
embedding2 = umap.UMAP(
    n_components=3,
    n_neighbors=50,
    min_dist=0.3,
    metric='correlation',
    random_state=42,
).fit_transform(X)

In [None]:
# Split the 3-D UMAP embedding into one array per output dimension.
umap_component3, umap_component4, umap_component5 = (
    embedding2[:, 0],
    embedding2[:, 1],
    embedding2[:, 2],
)

In [None]:
# 3-D scatter plot of the three-dimensional UMAP embedding.
# Figure.gca() no longer accepts keyword arguments such as projection= in
# Matplotlib 3.6+, so the 3-D axes are created with add_subplot instead.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.set_title('3D Uniform Manifold Approximation and Projection (UMAP)')
ax.scatter(
    xs=umap_component3,
    ys=umap_component4,
    zs=umap_component5,
)
ax.set_xlabel('umap-3d-one')
ax.set_ylabel('umap-3d-two')
ax.set_zlabel('umap-3d-three')
plt.show()