# Module 5: Unsupervised

In [None]:
# Setup the matplotlib styling
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import pandas as pd
import numpy as np

try:
    # Try to use the BI style sheet for plots
    line1 = (0/256, 224/256, 170/256)
    line2 = (96/256, 126/256, 229/256)
    line3 = (136/256, 76/256, 255/256)
    # Loads the style sheet named 'matplotlibrc'; expected to raise when no
    # such style sheet can be found, which sends us to the fallback below.
    plt.style.use('matplotlibrc')
    plt.rcParams['axes.prop_cycle'] = plt.cycler(color=[(136/256, 76/256, 255/256), (60/256, 170/256, 207/256), (12/256, 229/256, 177/256)])

    # Stops of the BI gradient. Kept under its own name so that `colors`
    # always means "one RGB row per cluster" in the rest of the notebook.
    cmap_colors = [(0.53125, 0.296875, 0.99609375), (0.453125, 0.3984375, 0.9453125), (0.375, 0.4921875, 0.89453125), (0.3046875, 0.578125, 0.8515625), (0.234375, 0.6640625, 0.80859375), (0.16015625, 0.75390625, 0.76171875), (0.09375, 0.8359375, 0.72265625), (0.046875, 0.89453125, 0.69140625), (0.0, 0.875, 0.6640625)]
    # NOTE: N=len(cmap_colors) quantizes the colormap into 9 discrete levels,
    # so data with more than 9 distinct values cannot get one color each.
    bicmap = LinearSegmentedColormap.from_list(name='BIcmp',
                                                colors=cmap_colors,
                                                N=len(cmap_colors))
    cm_bright = ListedColormap([(0.53125, 0.296875, 0.99609375), (12/256, 229/256, 177/256)])
    # One RGB row per cluster; a numpy array so that it can be indexed with an
    # array of predicted cluster labels, e.g. colors[kmeans.predict(data)].
    colors = np.array([line1, line2, line3])
except Exception as style_error:
    # Best-effort fallback: keep the notebook usable without the BI styling,
    # but say so instead of failing silently. `Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    print(f"BI plot style not available ({style_error!r}); using default matplotlib colors.")
    bicmap = plt.cm.BuGn
    # Defined here as well so that both branches provide the same names.
    cm_bright = ListedColormap([(1.0, 0.0, 0.0), (0.0, 0.0, 1.0)])
    # RGB values of matplotlib's 'r', 'g' and 'b'. Stored as an array (not a
    # list of strings) so that label indexing and np.append(..., axis=0) work
    # exactly as they do with the BI colors above.
    colors = np.array([(1.0, 0.0, 0.0), (0.0, 0.5, 0.0), (0.0, 0.0, 1.0)])

## **Exercise 5.1: Clustering**

Apply different clustering algorithms to the data set in the data folder: use k-means, Gaussian mixture models, hierarchical clustering and DBSCAN.

In [None]:
data = pd.read_csv('data/blobs.csv')
data.head()

Use the scatter plot function of the pandas module to visualize the data.

In [None]:
# Currently we are using only one color of the colors array
data.plot.scatter(x='feature_1', y='feature_2', color=colors[0])

### **Exercise 5.1.1: KMeans**

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Initialize the KMeans algorithm
# TODO
# Fit the model on the data
# TODO

In [None]:
# Visualize the prediction of the algorithm with scatter() by indexing the colors array with the predicted cluster assignment
# Tip: colors[kmeans.predict(data)]
# TODO

**How do we choose the correct number of clusters (e.g. when we have more than 3 features and cannot simply plot the data)?**

We want to choose the number of clusters in a way that they maximize the silhouette score.

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Collect one silhouette score per tested number of clusters
s_score = []
# Test each number of clusters d from 2 to 20 (both inclusive)
num_clusters = list(range(2, 20+1))

for d in num_clusters:
    # Initialize a new KMeans model with n_clusters=d
    # TODO
    # Fit the KMeans model
    # TODO
    # Calculate the silhouette score with (data, kmeans.predict(data))
    score = None # TODO: replace None; until then s_score only collects None placeholders
    # Append the calculated score to the list (same order as num_clusters)
    s_score.append(score)

In [None]:
# Plot the silhouette score for every tested number of clusters, then select
# the number of clusters at which the score is highest
plt.plot(num_clusters, s_score)

score_axes = plt.gca()
score_axes.set_xlabel('#clusters')
score_axes.set_ylabel('silhouette_score')
# One labelled tick per tested number of clusters
score_axes.set_xticks(num_clusters)
score_axes.set_xticklabels(num_clusters)
plt.show()

**Rerun your code with the optimal number of clusters**

In [None]:
# TODO

### **Exercise 5.1.2: Gaussian Mixture**

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Initialize the GaussianMixture model with n_components=3
# TODO
# Fit the model on the data
# TODO

In [None]:
# Visualize the prediction as you have done before.
# TODO

### **Exercise 5.1.3: Hierarchical Clustering**

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
from scipy.cluster.hierarchy import dendrogram

# Helper function to plot the dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    Builds a linkage matrix from the model's ``children_``, ``distances_``
    and ``labels_`` attributes and passes it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model providing ``children_`` (one row of two child node ids
        per merge), ``distances_`` (one value per merge) and ``labels_``
        (one entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``truncate_mode``).
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples below each merge node. Ids smaller than
    # leaf_total are single samples; larger ids refer to an earlier merge,
    # whose size has already been stored at position (id - leaf_total).
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    # Columns: first child, second child, merge distance, sample count
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Initialize the AgglomerativeClustering algorithm with distance_threshold=0, n_clusters=None
model = None # TODO
# Fit the clustering algorithm
# TODO

In [None]:
# Draw the dendrogram of the fitted model on the current axes
dendrogram_axes = plt.gca()
dendrogram_axes.set_title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model, truncate_mode='level')
dendrogram_axes.set_xlabel("Data points")
# Hide the individual leaf labels on the x-axis
dendrogram_axes.set_xticklabels([])
plt.show()

**Based on the dendrogram, how many clusters should you choose?**

3! Because when three clusters remain, the distance until the next merge happens is the largest.

**Rerun the code. This time specify the number of clusters and plot the result**

In [None]:
# Initialize the AgglomerativeClustering algorithm with n_clusters=? (?=your chosen number of clusters)
# TODO
# You don't need to fit the algorithm

In [None]:
# Visualize the prediction as you have done before.
# This time instead of predict use fit_predict(data)
# TODO

### **Exercise 5.1.4: DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Initialize dbscan
# TODO
# You don't need to fit the algorithm

In [None]:
# Add black as an extra, last row of the color table for outliers.
# NOTE(review): DBSCAN is expected to label outliers with -1, which would
# select exactly this last row when indexing with the labels - confirm in
# the scikit-learn documentation.
extended_colors = np.concatenate((colors, [[0, 0, 0]]), axis=0)

In [None]:
# Visualize the prediction as you have done before.
# This time instead of predict use fit_predict(data)
# Use extended_colors to visualize outliers
# TODO

**Change the hyperparameters so that DBSCAN finds all three clusters, while ignoring the outliers.**

## **Exercise 5.2: Dimensionality Reduction**

Have a look at the digits data set. What does the data set contain? Perform PCA, MDS and t-SNE on the digits data set and visualize the results.

In [None]:
from sklearn.datasets import load_digits
digits = load_digits()

In [None]:
plt.imshow(digits["images"][0], cmap="Greys")
plt.show()

plt.figure()
plt.imshow(digits["images"][13], cmap="Greys")
plt.show()

### **Exercise 5.2.1: PCA**

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Initialize PCA with n_components=2
# TODO
# Project the digits into the two dimensional space using fit_transform(digits.data)
projected = None # TODO

In [None]:
# Visualize the results
plt.figure()
plt.scatter(projected[:, 0], projected[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=bicmap)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar()
plt.show()

### **Exercise 5.2.2: MDS**

In [None]:
from sklearn.manifold import MDS

In [None]:
# Initialize MDS with n_components=2
# TODO
# Reduce the number of dimensions with fit_transform
projected = None # TODO

In [None]:
# Visualize the results
plt.figure()
plt.scatter(projected[:, 0], projected[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=bicmap)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar()
plt.show()

### **Exercise 5.2.3: t-SNE**

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Initialize tsne with n_components=2
# TODO
# Reduce the number of dimensions with fit_transform
projected = None # TODO

In [None]:
# Visualize the results
plt.figure()
plt.scatter(projected[:, 0], projected[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=bicmap)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar()
plt.show()

## **Exercise 5.3: Outlier Detection**

We want to find abnormal transactions in credit card payments. For that you find data for the credit card transactions of a person in the file data/credit_card_data.csv. 

In [None]:
transactions = pd.read_csv('data/credit_card_data.csv')
transactions.head()

### **Exercise 5.3.1**

Visualize the data using pandas scatter. Do you have an explanation for what you see? Which transactions do you think are outliers?

In [None]:
transactions.plot.scatter(x='AMOUNT_SPEND', y='DISTANCE_TO_HOME')

### **Exercise 5.3.2**

Find outliers using the Isolation Forest algorithm. Is the result as you would expect?

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Initialize an IsolationForest object
# TODO
# Fit the model on the data
# TODO

In [None]:
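# Visualize the outlier prediction
# To color each transaction by the IsolationForest prediction use
# cmap=bicmap and c=isolationforest.predict(transactions)
# (predict returns 1 for inliers and -1 for outliers; for a continuous
# outlier score use isolationforest.decision_function(transactions) instead)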
# TODO
plt.show()

**Is the result what you expected?**

### **Exercise 5.3.3**

What could you do to handle the outliers in the two separate regions of the data separately?

Think about how you could transform the data to get a more intuitive result. Hint: The logarithm might help.

In [None]:
# Initialize a kmeans object with (n_clusters=2)
# TODO
# Fit kmeans on the transactions
# TODO

In [None]:
# Based on the kmeans algorithm divide the data into two chunks
# Tip: You can index rows in a DataFrame with transactions[cond]
# Tip: You can create conditions by performing boolean operations on a numpy array

# Save the predicted cluster assignment of kmeans in a new variable
# TODO

# Create the condition prediction == 0 and save it in a variable
# TODO
# Index transactions with the condition and save the view on the transactions in a new variable
# TODO
# We also need to scale the transactions in this chunk with np.log()
# TODO

# Create the condition prediction == 1 and save it in a variable
# TODO
# Index transactions with the condition and save the view on the transactions in a new variable
# TODO
# We also need to scale the transactions in this chunk with np.log()
# TODO

In [None]:
# Fit one isolation forest on each chunk of transactions
# This time also add contamination=0.002 to the construction of the isolation forest object

# Initialize the isolationforest0
# TODO
# Fit the isolationforest0 on the first chunk
# TODO

# Initialize the isolationforest1
# TODO
# Fit the isolationforest1 on the second chunk
# TODO

In [None]:
# Finally, visualize the results
# Tip: Use the code you used before for plotting
# Tip: To connect both plots save the ax object returned from the first .plot.scatter() and then pass it to the second .plot.scatter(ax=ax)
# TODO

**Do the results now make more sense?**

### **Exercise 5.3.4**

New transactions come in. They are given in the file data/credit_card_new_transactions.csv. How would you detect whether these are abnormal or not?

In [None]:
new_transactions = pd.read_csv('data/credit_card_new_transactions.csv')
new_transactions.head()

In [None]:
# First separate the samples into the two clusters
# TODO

# Create the condition new_prediction == 0 and save it in a variable
# TODO
# Index new_transactions with the condition and save the view on the new_transactions in a new variable
# TODO
# We also need to scale the new_transactions in this chunk with np.log
# TODO

# Create the condition new_prediction == 1 and save it in a variable
# TODO
# Index new_transactions with the condition and save the view on the new_transactions in a new variable
# TODO
# We also need to scale the new_transactions in this chunk with np.log
# TODO

In [None]:
# Classify the samples as outliers or not
# TODO

## **Exercise 5.4**

Load the cancer data set from sklearn.

In [None]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
cancer_data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
cancer_data['target'] = cancer.target
cancer_data.head()

Have a look at the data. How many samples do you have? How many features? What are the features? What is the target? Is this a classification or regression problem?

Use a pair plot (e.g. seaborn's `pairplot` or, as in the cell below, `pd.plotting.scatter_matrix`) to visualize the features worst texture, worst symmetry, mean concave points and mean radius together. What do you notice?

In [None]:
features = ['worst texture', 'worst symmetry', 'mean concave points', 'mean radius']

In [None]:
pd.plotting.scatter_matrix(cancer_data[features], figsize=(10, 10))
plt.show()

### **Exercise 5.4.1**

Perform a PCA on the data set. Remember: PCA needs normalization before applying it in a useful manner. Use sklearn.preprocessing.StandardScaler to normalize the data. Then perform a PCA with sklearn.decomposition.PCA and three components. Print out the explained variances.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# TODO

### **Exercise 5.4.2**
Visualize the first two components together with the target. What do you notice?

In [None]:
# TODO