# Scotch Exploration

Playing with the scotch dataset. Seeing what we we could do with it.

References

* https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt
* http://wonkviz.tumblr.com/post/72400253092/whiskey-data-sleuthing-with-help-from-reddit
* http://blog.revolutionanalytics.com/2013/12/k-means-clustering-86-single-malt-scotch-whiskies.html

In [1]:
%matplotlib inline

In [2]:
from __future__ import division
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 25)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 1024)

Get the whiskey data from https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt.

In [None]:
#!wget https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt

In [3]:
df = pd.read_csv('datasets/whiskies.txt')

In [4]:
df.head(1)

Unnamed: 0,RowID,Distillery,Body,Sweetness,Smoky,Medicinal,Tobacco,Honey,Spicy,Winey,Nutty,Malty,Fruity,Floral,Postcode,Latitude,Longitude
0,1,Aberfeldy,2,2,2,0,0,2,1,2,2,2,2,2,\tPH15 2EB,286580,749680


## PCA

In [5]:
import sklearn.datasets
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.preprocessing import StandardScaler

In [6]:
X = df.drop(['RowID', 'Distillery', 'Postcode', ' Latitude', ' Longitude'], axis=1)

In [7]:
X.describe()

Unnamed: 0,Body,Sweetness,Smoky,Medicinal,Tobacco,Honey,Spicy,Winey,Nutty,Malty,Fruity,Floral
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,2.069767,2.290698,1.534884,0.546512,0.116279,1.244186,1.383721,0.976744,1.465116,1.802326,1.802326,1.697674
std,0.93041,0.717287,0.863613,0.990032,0.322439,0.853175,0.784686,0.93276,0.82173,0.629094,0.779438,0.855017
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
50%,2.0,2.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0
75%,2.0,3.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
max,4.0,4.0,4.0,4.0,1.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0


In [8]:
# X_std = StandardScaler().fit_transform(X)

In [9]:
# pd.DataFrame(X_std, columns=X.columns).describe()

In [10]:
n_components = 5

In [11]:
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

In [12]:
X.shape, X_pca.shape

((86, 12), (86, 5))

In [13]:
weights = np.round(pca.components_, 3)
ev = np.round(pca.explained_variance_ratio_, 3)

In [14]:
ev

array([0.301, 0.192, 0.096, 0.083, 0.072])

In [15]:
pca_df = pd.DataFrame(weights, columns=X.columns)

In [16]:
pca_df

Unnamed: 0,Body,Sweetness,Smoky,Medicinal,Tobacco,Honey,Spicy,Winey,Nutty,Malty,Fruity,Floral
0,0.361,-0.203,0.478,0.575,0.092,-0.221,0.058,-0.037,-0.048,-0.128,-0.202,-0.384
1,0.491,0.047,0.069,-0.161,-0.02,0.418,0.175,0.64,0.26,0.103,0.124,-0.131
2,-0.03,0.264,-0.219,-0.043,0.001,-0.11,-0.699,0.233,0.179,-0.108,-0.403,-0.343
3,0.075,0.371,-0.089,-0.082,0.033,-0.033,0.172,0.226,-0.851,-0.072,-0.095,-0.149
4,-0.227,-0.009,0.202,0.033,0.009,0.597,0.134,-0.111,-0.025,0.105,-0.703,0.12


In [17]:
import warnings
warnings.simplefilter(action = "ignore")

In [18]:
component_names = []
pca_df_t = pca_df.T
for col in pca_df_t:
    component = pca_df_t[col]
    order = component.abs().order(ascending=False)
    top = order.head(3)
    component_name = [name if component[name] > 0 else ('neg-' + name) for name in top.index]
    component_names.append('/'.join(component_name))

AttributeError: 'Series' object has no attribute 'order'

In [19]:
pca_df.index = component_names
pca_df

ValueError: Length mismatch: Expected axis has 5 elements, new values have 0 elements

## k-Means

In [None]:
inertia = [np.NaN]
for i in range(1,20):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

In [None]:
ax = plt.subplot(111)
ax.plot(inertia, 'o-')
ax.set_ylabel('inertia')
ax.set_xlabel('# clusters')

In [None]:
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

In [None]:
X['cluster'] = kmeans.labels_

In [None]:
X.cluster.value_counts()

## Plot PCA Points w/ Cluster Info

In [None]:
X_pca_clustered = np.insert(X_pca, n_components, values=kmeans.labels_, axis=1)

In [None]:
pca_cluster_df = pd.DataFrame(X_pca_clustered, columns=component_names + ['cluster'])

In [None]:
g = sns.PairGrid(pca_cluster_df, hue='cluster', vars=component_names, size=3)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()

## Similarities

In [None]:
dist = sklearn.metrics.pairwise.euclidean_distances(X)

In [None]:
sim = sklearn.metrics.pairwise.cosine_similarity(X)

In [None]:
dist_df = pd.DataFrame(dist, columns=df.Distillery, index=df.Distillery)

In [None]:
sim_df = pd.DataFrame(sim, columns=df.Distillery, index=df.Distillery)

In [None]:
cluster_s = X.cluster
cluster_s.index = df.Distillery

In [None]:
sim_df['Ardbeg'].order(ascending=False)

In [None]:
features_df = df.ix[:,1:13].set_index('Distillery')

## Persist

Add the cluster info to the features DataFrame so we only have to persist one file for both.

In [None]:
features_df['cluster'] = cluster_s

In [None]:
features_df.to_pickle('datasets/features.dataframe')

In [None]:
sim_df.to_pickle('datasets/sims.dataframe')