<a href="https://colab.research.google.com/github/scharnk/Unsupervised-learning-in-python/blob/master/CH02_Visualization_with_hierarchical_clustering_and_t_SNE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Visualizations

* t-SNE -- 2-D map of dataset
* Hierarchical clustering
> * clusters are vertical lines
> * joining of vertical lines means a merging of clusters

In [0]:
# Imports for hierarchical clustering and plotting
# (later cells reuse linkage, dendrogram and plt from here)
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Build the merge tree of the samples with complete linkage
mergings = linkage(samples, 'complete')

# Draw the tree; each leaf is labelled with its variety, rotated to vertical
# and shrunk so that neighbouring labels do not overlap
dendrogram(mergings, labels=varieties, leaf_rotation=90, leaf_font_size=6)
plt.show()


In [0]:
# normalize comes from scikit-learn's preprocessing module
from sklearn.preprocessing import normalize

# Normalized copy of the movements; a later t-SNE cell reads this name too
normalized_movements = normalize(movements)

# Complete-linkage merge tree of the normalized movements
mergings = linkage(normalized_movements, 'complete')

# Dendrogram with one leaf per company
dendrogram(
    mergings,
    labels=companies,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()


* cluster labels can be used for cross-tabulations

# Distance in Hierarchical Dendrograms

* height on dendrogram specifies max distance between merging clusters
* Distance between clusters is determined by linkage method used
* In 'complete' linkage, the distance between clusters is the maximum distance between their samples (the farthest pair of points)
* In single linkage, the distance between clusters is the distance between the closest points of the clusters

# SciPy clustering labels start at 1
# sklearn clustering labels are 0-indexed

In [0]:
# Imports for hierarchical clustering and plotting
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Single linkage this time (the earlier dendrograms used 'complete')
mergings = linkage(samples, 'single')

# Dendrogram with one leaf per country name
dendrogram(
    mergings,
    labels=country_names,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()


In [0]:
# Cut the hierarchical clustering into flat clusters and compare the cluster
# labels with the known varieties.
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage

# Rebuild the complete-linkage tree here instead of reading the global
# `mergings`: the cell directly above reassigns `mergings` to a single-linkage
# tree plotted with country_names, whereas this cell cross-tabulates against
# `varieties`, whose dendrogram was built with method='complete'.
# NOTE(review): assumes `samples` holds the measurements that `varieties`
# labels -- confirm against the data-loading cell.
variety_mergings = linkage(samples, method='complete')

# Use fcluster to extract labels: cut the tree at height 6
labels = fcluster(variety_mergings, 6, criterion='distance')

# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Create crosstab (rows: cluster label, columns: variety): ct
ct = pd.crosstab(df['labels'], df['varieties'])

# Display ct
print(ct)


# t-SNE (t-distributed stochastic neighbor embedding)

* maps samples from higher dimensional space into 2D or 3D 
* preserves nearness of samples
* great to inspect datasets


# t-SNE only has a fit_transform() method

* so simultaneously fits & transforms data
* cannot run fit or transform separately
* can't extend map to include new data samples
* meaning you have to start over each time

# t-SNE learning rate

* if wrong choice, then all points will bunch together
* typically = 50 to 200

# t-SNE features will be different each time

* but the clusters keep the same positions relative to one another

In [0]:
# TSNE lives in scikit-learn's manifold module
from sklearn.manifold import TSNE

# t-SNE model with a learning rate of 200
model = TSNE(learning_rate=200)

# Fit and embed the samples in a single step
tsne_features = model.fit_transform(samples)

# Column 0 gives the x coordinates, column 1 the y coordinates
xs, ys = tsne_features[:, 0], tsne_features[:, 1]

# Scatter plot of the embedding, one colour per variety number
plt.scatter(xs, ys, c=variety_numbers)
plt.show()


In [0]:
# TSNE lives in scikit-learn's manifold module
from sklearn.manifold import TSNE

# t-SNE model with a learning rate of 50
model = TSNE(learning_rate=50)

# Fit and embed the normalized movements in a single step
tsne_features = model.fit_transform(normalized_movements)

# Column 0 gives the x coordinates, column 1 the y coordinates
xs, ys = tsne_features[:, 0], tsne_features[:, 1]

# Semi-transparent scatter plot of the embedding
plt.scatter(xs, ys, alpha=0.5)

# Write each company's name next to its point
for company, x, y in zip(companies, xs, ys):
    plt.annotate(company, xy=(x, y), fontsize=5, alpha=0.75)
plt.show()
