In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cosine, pdist, squareform
from sklearn.manifold import MDS

import pandas as pd
import numpy as np

In [11]:
orig_data = 'data/all_drinks.csv'
df = pd.read_csv(orig_data)
df.head()

Unnamed: 0,name,alcohol,sugar,acid,oz,style,glass,melt,ingredients
0,Angostura Fizz,0.012531,0.075125,0.015161,3.99,fizz,coupe,0.33,Angostura bitters|lime juice|simple syrup|gren...
1,Blonde Redhead,0.03556,0.089224,0.004203,9.28,,,0.28,aperol|grapefruit juice|lemon juice|club soda
2,Trident,0.091026,0.033333,0.0,3.9,stirred,coupe,0.3,dry sherry|Cynar|aquavit|Fee Brothers peach bi...
3,Mojito,0.098765,0.077901,0.007407,6.075,shaken,highball,0.35,white rum|mint leaves|simple syrup|lime juice|...
4,Cricket Ball,0.099174,0.027548,0.0,4.5375,bubbly,flute,0.1,sugar|Peychaud's bitters|Fee Brothers rhubarb ...


In [34]:
def combine(row):
    return'|'.join([str(x).replace(' ', '|') for x in row[['name', 'style', 'glass', 'ingredients']]])

df['all_text'] = df.apply(lambda row: combine(row), axis=1)
df.head()

Unnamed: 0,name,alcohol,sugar,acid,oz,style,glass,melt,ingredients,all_text,x,y
0,Angostura Fizz,0.012531,0.075125,0.015161,3.99,fizz,coupe,0.33,Angostura bitters|lime juice|simple syrup|gren...,Angostura|Fizz|fizz|coupe|Angostura|bitters|li...,0.784468,0.11369
1,Blonde Redhead,0.03556,0.089224,0.004203,9.28,,,0.28,aperol|grapefruit juice|lemon juice|club soda,Blonde|Redhead|nan|nan|aperol|grapefruit|juice...,0.417615,0.700092
2,Trident,0.091026,0.033333,0.0,3.9,stirred,coupe,0.3,dry sherry|Cynar|aquavit|Fee Brothers peach bi...,Trident|stirred|coupe|dry|sherry|Cynar|aquavit...,-0.362929,-0.028648
3,Mojito,0.098765,0.077901,0.007407,6.075,shaken,highball,0.35,white rum|mint leaves|simple syrup|lime juice|...,Mojito|shaken|highball|white|rum|mint|leaves|s...,2.091919,-0.526404
4,Cricket Ball,0.099174,0.027548,0.0,4.5375,bubbly,flute,0.1,sugar|Peychaud's bitters|Fee Brothers rhubarb ...,Cricket|Ball|bubbly|flute|sugar|Peychaud's|bit...,-1.175427,0.385564


In [42]:
def cluster_XY(df, X, mode):
    """
    X {str} - must be 'all_text', 'ingredients' or 'prep'
    mode {str} - must be 'pca' or 'tsne' or 'mds'
    """
    
    
    def combine(row):
        if X == 'all_text':
            picks = ['name', 'style', 'glass', 'ingredients']
        elif X == 'ingredients':
            picks = ['ingredients']
        elif X == 'prep':
            picks = ['name', 'style', 'glass']
        else:
            raise ValueError("must be 'all_text', 'ingredients' or 'prep'")
        return'|'.join([str(x).replace(' ', '|') for x in row[picks]])

    df['features'] = df.apply(lambda row: combine(row), axis=1)

    try:
        docs = [c.replace('|', ' ') for c in list(df['features'])]
    except AttributeError:
        print("FAILURE: df['ingredients'] doesn't exist. Or something else went wrong.")
        return None
    #
    vectorizer = CountVectorizer(max_df=0.5, min_df=0)
    vectors = vectorizer.fit_transform(docs)
    print("vocab shape: {}".format(vectors.shape))
    X = vectors.toarray()

    if mode == 'pca':
        X_pca = PCA(n_components=2).fit(X).transform(X)
        XY = pd.DataFrame(X_pca, columns = ['x', 'y'])
        print("loaded PCA data")
    else:
        X_pca = PCA(n_components=50).fit(X).transform(X)
        if mode == 'tsne':
            X_tsne = TSNE().fit(X_pca)
            XY = pd.DataFrame(X_tsne.embedding_, columns=['x', 'y'])
            print("loaded t-sne data")
        elif mode == 'mds':
            dists = pdist(X_pca, cosine)
            dist_matrix = pd.DataFrame(squareform(dists), 
                                   columns=df['name'], 
                                   index=df['name'])

            scaler = MDS(dissimilarity='precomputed', random_state=123)
            XY = pd.DataFrame(scaler.fit_transform(dist_matrix))
            XY.rename(index=str, columns={0:'x', 1:'y'}, inplace=True)
            print("loaded MDS data")
        else:
            print("mode must be one of 'pca' or 'tsne' or 'mds'")
            return None
    #
    df['x'] = list(XY['x'])
    df['y'] = list(XY['y'])
    return df

# only ingredients

In [32]:
X = 'ingredients'

In [44]:
pick = 'pca'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded PCA data


In [45]:
pick = 'tsne'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded t-sne data


In [46]:
pick = 'mds'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded MDS data


# all_text

In [48]:
X = 'all_text'

In [49]:
pick = 'pca'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded PCA data


In [50]:
pick = 'tsne'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded t-sne data


In [51]:
pick = 'mds'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 256)
loaded MDS data


# prep

In [52]:
X = 'prep'

In [53]:
pick = 'pca'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 155)
loaded PCA data


In [54]:
pick = 'tsne'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 155)
loaded t-sne data


In [55]:
pick = 'mds'
transform_df = cluster_XY(df, X, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-{}.csv'.format(pick, X)), index=False)
#transform_df.head()

vocab shape: (103, 155)
loaded MDS data


In [30]:
docs = [c.replace('|', ' ') for c in list(df['all_text'])]
docs[0]

'Angostura Fizz fizz coupe Angostura bitters lime juice simple syrup grenadine heavy cream egg white soda water'

In [None]:
#
vectorizer = CountVectorizer(max_df=0.5, min_df=0)
vectors = vectorizer.fit_transform(docs)
print("vocab shape: {}".format(vectors.shape))
X = vectors.toarray()
return X

In [24]:
def cluster_all_XY(df, mode):
    """mode must be 'pca' or 'tsne' or 'mds'
    """
    try:
        docs = [c.replace('|', ' ') for c in list(df['ingredients'])]
    except AttributeError:
        print("FAILURE: df['ingredients'] doesn't exist. Or something else went wrong.")
        return None
    #
    vectorizer = CountVectorizer(max_df=0.5, min_df=0)
    vectors = vectorizer.fit_transform(docs)
    print("vocab shape: {}".format(vectors.shape))
    X = vectors.toarray()

    if mode == 'pca':
        X_pca = PCA(n_components=2).fit(X).transform(X)
        XY = pd.DataFrame(X_pca, columns = ['x', 'y'])
        print("loaded PCA data")
    else:
        X_pca = PCA(n_components=50).fit(X).transform(X)
        if mode == 'tsne':
            X_tsne = TSNE().fit(X_pca)
            XY = pd.DataFrame(X_tsne.embedding_, columns=['x', 'y'])
            print("loaded t-sne data")
        elif mode == 'mds':
            dists = pdist(X_pca, cosine)
            dist_matrix = pd.DataFrame(squareform(dists), 
                                   columns=df['name'], 
                                   index=df['name'])

            scaler = MDS(dissimilarity='precomputed', random_state=123)
            XY = pd.DataFrame(scaler.fit_transform(dist_matrix))
            XY.rename(index=str, columns={0:'x', 1:'y'}, inplace=True)
            print("loaded MDS data")
        else:
            print("mode must be one of 'pca' or 'tsne' or 'mds'")
            return None
    #
    df['x'] = list(XY['x'])
    df['y'] = list(XY['y'])
    return df

In [18]:
pick = 'pca'
transform_df = cluster_all_XY(df, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-all.csv'.format(pick)), index=False)
print(transform_df.columns)
transform_df.head()

vocab shape: (103, 122)
loaded PCA data
Index(['name', 'alcohol', 'sugar', 'acid', 'oz', 'style', 'glass', 'melt',
       'ingredients', 'x', 'y'],
      dtype='object')


Unnamed: 0,name,alcohol,sugar,acid,oz,style,glass,melt,ingredients,x,y
0,Angostura Fizz,0.012531,0.075125,0.015161,3.99,fizz,coupe,0.33,Angostura bitters|lime juice|simple syrup|gren...,0.784468,0.11369
1,Blonde Redhead,0.03556,0.089224,0.004203,9.28,,,0.28,aperol|grapefruit juice|lemon juice|club soda,0.417615,0.700092
2,Trident,0.091026,0.033333,0.0,3.9,stirred,coupe,0.3,dry sherry|Cynar|aquavit|Fee Brothers peach bi...,-0.362929,-0.028648
3,Mojito,0.098765,0.077901,0.007407,6.075,shaken,highball,0.35,white rum|mint leaves|simple syrup|lime juice|...,2.091919,-0.526404
4,Cricket Ball,0.099174,0.027548,0.0,4.5375,bubbly,flute,0.1,sugar|Peychaud's bitters|Fee Brothers rhubarb ...,-1.175427,0.385564


In [20]:
pick = 'tsne'
transform_df = cluster_all_XY(df, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-all.csv'.format(pick)), index=False)
print(transform_df.columns)
transform_df.head()

vocab shape: (103, 122)
loaded t-sne data
Index(['name', 'alcohol', 'sugar', 'acid', 'oz', 'style', 'glass', 'melt',
       'ingredients', 'x', 'y'],
      dtype='object')


Unnamed: 0,name,alcohol,sugar,acid,oz,style,glass,melt,ingredients,x,y
0,Angostura Fizz,0.012531,0.075125,0.015161,3.99,fizz,coupe,0.33,Angostura bitters|lime juice|simple syrup|gren...,9.198922,-36.22045
1,Blonde Redhead,0.03556,0.089224,0.004203,9.28,,,0.28,aperol|grapefruit juice|lemon juice|club soda,61.481007,-38.168635
2,Trident,0.091026,0.033333,0.0,3.9,stirred,coupe,0.3,dry sherry|Cynar|aquavit|Fee Brothers peach bi...,-52.318292,70.429747
3,Mojito,0.098765,0.077901,0.007407,6.075,shaken,highball,0.35,white rum|mint leaves|simple syrup|lime juice|...,-10.58989,2.765133
4,Cricket Ball,0.099174,0.027548,0.0,4.5375,bubbly,flute,0.1,sugar|Peychaud's bitters|Fee Brothers rhubarb ...,42.484675,11.122907


In [26]:
pick = 'mds'
transform_df = cluster_all_XY(df, pick)
transform_df.to_csv(orig_data.replace('data/', 'data/clustering/').replace('.csv', '-{}-all.csv'.format(pick)), index=False)
print(transform_df.columns)
transform_df.head()

vocab shape: (103, 122)
loaded MDS data
Index(['name', 'alcohol', 'sugar', 'acid', 'oz', 'style', 'glass', 'melt',
       'ingredients', 'x', 'y'],
      dtype='object')


Unnamed: 0,name,alcohol,sugar,acid,oz,style,glass,melt,ingredients,x,y
0,Angostura Fizz,0.012531,0.075125,0.015161,3.99,fizz,coupe,0.33,Angostura bitters|lime juice|simple syrup|gren...,0.761187,-0.070207
1,Blonde Redhead,0.03556,0.089224,0.004203,9.28,,,0.28,aperol|grapefruit juice|lemon juice|club soda,0.040795,-0.359459
2,Trident,0.091026,0.033333,0.0,3.9,stirred,coupe,0.3,dry sherry|Cynar|aquavit|Fee Brothers peach bi...,-0.100427,0.259957
3,Mojito,0.098765,0.077901,0.007407,6.075,shaken,highball,0.35,white rum|mint leaves|simple syrup|lime juice|...,0.201327,-0.791106
4,Cricket Ball,0.099174,0.027548,0.0,4.5375,bubbly,flute,0.1,sugar|Peychaud's bitters|Fee Brothers rhubarb ...,-0.131905,0.473933
