In [None]:
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install umap-learn
!pip install pandas

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE, Isomap, trustworthiness
from sklearn.metrics import pairwise_distances
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:


# Dimensions of the synthetic data set. n_features is reused further down
# when the DataFrame columns are named.
n_samples = 10_000
n_features = 10
n_informative = 8
n_redundant = 2
n_classes = 2

# Seeded two-class problem with two clusters per class; the fixed
# random_state makes the generated samples reproducible across runs.
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_classes=n_classes,
    n_clusters_per_class=2,
    class_sep=1.5,
    random_state=42,
)



In [4]:
# Column labels feature_1 ... feature_<n_features>.
feature_names = ["feature_%d" % idx for idx in range(1, n_features + 1)]

# Feature-only frame, kept separate from the labelled frame below.
df_features = pd.DataFrame(data=X, columns=feature_names)

# assign() returns a new frame, so df_features itself is left without a
# 'target' column.
df = df_features.assign(target=y)
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,target
0,-2.482531,0.525135,-0.598951,-2.410959,-0.990930,-3.112120,-2.840507,-0.572464,3.171115,-0.782214,1
1,-2.842556,-0.564530,0.096068,-0.815907,3.120619,-0.256869,-5.424947,0.809551,0.058091,8.011134,1
2,3.080255,1.789750,1.771519,2.924127,-1.811381,-1.350302,2.976814,-0.158628,2.734609,-7.104737,0
3,-0.647513,-3.790428,2.014160,-0.585143,1.626927,-5.255379,0.320723,3.078251,0.862170,-3.282414,0
4,-0.731565,0.873713,1.726333,2.005599,-1.218096,0.728442,3.282940,-2.123311,3.492206,-8.392934,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.999463,-2.826460,-0.703037,2.727000,-2.420999,-0.465184,0.605673,-0.721480,3.900589,-9.932152,0
9996,3.959000,1.900724,1.585159,2.672286,-4.300224,-3.274783,1.397944,-3.899414,-0.424083,-8.202205,0
9997,-3.021406,-2.273686,4.344588,-1.683138,3.043479,-3.205070,-2.935192,4.993852,2.917857,2.610618,0
9998,0.969112,3.597363,3.037797,0.119398,0.094230,-1.240834,0.838637,-2.383345,1.121974,-1.066480,0


In [5]:
# Count how many samples carry each class label.
df["target"].value_counts()

target
0    5001
1    4999
Name: count, dtype: int64

In [6]:
# Standardize the features: the scaler is fit on X and then applied to the
# same X, producing the matrix every embedding below is computed from.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

## Applying t-SNE

In [7]:
# Three-component t-SNE embedding of the standardized features, seeded for
# reproducibility and allowed to use every available core.
tsne = TSNE(
    n_components=3,
    n_jobs=-1,
    random_state=42,
)
X_tsne = tsne.fit_transform(X_scaled)

## Applying UMAP

In [8]:
# Three-component UMAP embedding of the standardized features.
# UMAP forces single-threaded execution whenever random_state is set, so
# n_jobs=-1 was silently overridden and only produced a warning. Asking for
# n_jobs=1 explicitly yields the same seeded run without that warning.
umap = UMAP(n_components=3, random_state=42, n_jobs=1)
X_umap = umap.fit_transform(X_scaled)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


## Applying Isomap


In [9]:
# Three-component Isomap embedding of the standardized features, allowed to
# use every available core. No seed is passed to this estimator.
isomap = Isomap(
    n_jobs=-1,
    n_components=3,
)
X_isomap = isomap.fit_transform(X_scaled)

## Comparing t-SNE, Isomap, and UMAP using scikit-learn trustworthiness

In [10]:
# Score each 3-D embedding against the standardized input with scikit-learn's
# trustworthiness metric (imported in the notebook's top import cell).
# The printed labels now spell the metric name correctly.
print("t-SNE trustworthiness", trustworthiness(X_scaled, X_tsne))
print("UMAP trustworthiness", trustworthiness(X_scaled, X_umap))
print("Isomap trustworthiness", trustworthiness(X_scaled, X_isomap))

t-SNE truthworthiness 0.9986795376301041
UMAP truthworthiness 0.9839488130504404
Isomap truthworthiness 0.930199245396317


In [11]:
# Peek at the first t-SNE component of every sample.
X_tsne[:,0]

array([ -8.667209, -18.230059,  16.932846, ..., -17.157583,  13.028165,
        16.258146], dtype=float32)

In [12]:
# Interactive 3-D scatter of the t-SNE embedding, one point per sample.
# The class label is cast to str so Plotly draws a discrete two-entry legend
# instead of a continuous color bar for the binary target; the title and axis
# labels let the figure be told apart from the other embeddings' plots.
# (px is imported in the notebook's top import cell.)
px.scatter_3d(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    z=X_tsne[:, 2],
    color=df["target"].astype(str),
    labels={"x": "component 1", "y": "component 2", "z": "component 3"},
    title="t-SNE embedding (3 components)",
)

In [13]:
# Interactive 3-D scatter of the UMAP embedding, one point per sample.
# The class label is cast to str so Plotly draws a discrete two-entry legend
# instead of a continuous color bar for the binary target; the title and axis
# labels let the figure be told apart from the other embeddings' plots.
# (px is imported in the notebook's top import cell.)
px.scatter_3d(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    z=X_umap[:, 2],
    color=df["target"].astype(str),
    labels={"x": "component 1", "y": "component 2", "z": "component 3"},
    title="UMAP embedding (3 components)",
)

In [14]:
# Interactive 3-D scatter of the Isomap embedding, one point per sample.
# The class label is cast to str so Plotly draws a discrete two-entry legend
# instead of a continuous color bar for the binary target; the title and axis
# labels let the figure be told apart from the other embeddings' plots.
# (px is imported in the notebook's top import cell.)
px.scatter_3d(
    x=X_isomap[:, 0],
    y=X_isomap[:, 1],
    z=X_isomap[:, 2],
    color=df["target"].astype(str),
    labels={"x": "component 1", "y": "component 2", "z": "component 3"},
    title="Isomap embedding (3 components)",
)

In [15]:
# Display the label array that the classifiers below are fit against.
y

array([1, 1, 0, ..., 0, 0, 0])

In [16]:
# Prediction comparison: a single logistic-regression classifier with default
# settings, refit in each of the following cells on a different representation
# of the same samples. LogisticRegression is imported in the notebook's top
# import cell rather than mid-notebook.
lreg = LogisticRegression()

In [17]:
# Baseline on the standardized 10-feature input. The score is computed on the
# same samples the model was fit on, i.e. it is training accuracy.
lreg.fit(X_scaled, y).score(X_scaled, y)

0.8102

In [18]:
# t-SNE: refit the shared classifier on the 3-D embedding and report its
# accuracy on those same samples (training accuracy).
lreg.fit(X_tsne, y).score(X_tsne, y)

0.8635

In [19]:
# Isomap: refit the shared classifier on the 3-D embedding and report its
# accuracy on those same samples (training accuracy).
lreg.fit(X_isomap, y).score(X_isomap, y)

0.9094

In [20]:
# UMAP: refit the shared classifier on the 3-D embedding and report its
# accuracy on those same samples (training accuracy).
lreg.fit(X_umap, y).score(X_umap, y)

0.963