# Preprocessing Scotch Notes

In [29]:
import pandas as pd
import numpy as np
from hashlib import md5

## Adding IDs and removing duplicates

In [30]:
df = pd.read_csv("scotch.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Type,Name,Description,Nose,Palate,Finish,Price,Size,Abv,URL
0,0,blended malt scotch,Monkey Shoulder Blended Malt Scotch Whisky,Monkey Shoulder Scotch is a superb blended mal...,"An elegant, stylish nose of marmalade, Crema C...","Very malty, creamy delivery with a suggestion ...","Medium length, spicy oak and a hint of pepperm...",25.94,70.0,40.0,https://www.masterofmalt.com/whiskies/monkey-s...
1,1,blended malt scotch,Johnnie Walker Green Label 15 Year Old,"One of those harder-to-find whiskies, Johnnie ...",,,,38.95,70.0,43.0,https://www.masterofmalt.com/whiskies/johnnie-...
2,2,blended malt scotch,The Naked Grouse,An interesting addition to the Famous Grouse r...,"Smooth and oily with notes of cherry compote, ...","Sherried and thick with notes of sultanas, sti...","Medium, with notes of cocoa, oak and just a so...",26.49,70.0,40.0,https://www.masterofmalt.com/whiskies/naked-gr...
3,3,blended malt scotch,Scallywag,Big Peat's gone and got himself a trusty sidek...,Sweetness jumps up like an excited puppy. Icin...,"The sweetness surprisingly retreats, revealing...",A pinch of oak spice joins the vanilla and she...,38.75,70.0,46.0,https://www.masterofmalt.com/whiskies/douglas-...
4,4,blended malt scotch,Monkey Shoulder Smokey Monkey,A peaty variant of the excellent Monkey Should...,"Honeydew melon, flamed orange peel, a touch of...","Vanilla sits at the core, its earthy notes bol...",Toffee Crisp bars and the last wafts of drying...,27.44,70.0,40.0,https://www.masterofmalt.com/whiskies/monkey-s...


In [31]:
# Creating an ID for each whisky
def hashEl(name, url):
    """
    MD5 hash of Name and URL
    Hash each individually, use max/min functions to ensure hash 2 happens in same order irrespective of which get's input first.
    """
    h1 = md5(name.encode()).hexdigest()
    h2 = md5(url.encode()).hexdigest()
    h3 = max(h1, h2) + min(h1, h2)
    h4 = md5(h3.encode()).hexdigest()
    return h3

In [32]:
# Creating ID, dropping duplicates
df["ID"] = df.apply((lambda x: hashEl(x.Name, x.URL)), axis=1)
df = df.drop_duplicates(subset="ID", keep="last")
df= df.reset_index()
cols = ["ID", "Type", "Name", "Description", "Nose", "Palate", "Finish", "Price", "Size", "Abv","URL"]
df = df[cols]
df.head()


Unnamed: 0,ID,Type,Name,Description,Nose,Palate,Finish,Price,Size,Abv,URL
0,495334d7384f4c9a933a156cb57639770cd9c8bca00ac7...,blended malt scotch,Monkey Shoulder Blended Malt Scotch Whisky,Monkey Shoulder Scotch is a superb blended mal...,"An elegant, stylish nose of marmalade, Crema C...","Very malty, creamy delivery with a suggestion ...","Medium length, spicy oak and a hint of pepperm...",25.94,70.0,40.0,https://www.masterofmalt.com/whiskies/monkey-s...
1,e193fa8dee0bb9422054efd5dfb7f2c2628815243b584b...,blended malt scotch,Johnnie Walker Green Label 15 Year Old,"One of those harder-to-find whiskies, Johnnie ...",,,,38.95,70.0,43.0,https://www.masterofmalt.com/whiskies/johnnie-...
2,d3ba34da7b98276f2da2ab313ff9e6cdc5d476d253a05e...,blended malt scotch,The Naked Grouse,An interesting addition to the Famous Grouse r...,"Smooth and oily with notes of cherry compote, ...","Sherried and thick with notes of sultanas, sti...","Medium, with notes of cocoa, oak and just a so...",26.49,70.0,40.0,https://www.masterofmalt.com/whiskies/naked-gr...
3,b74f75c04d65218b3b094130583f53b58b58d331f47d44...,blended malt scotch,Scallywag,Big Peat's gone and got himself a trusty sidek...,Sweetness jumps up like an excited puppy. Icin...,"The sweetness surprisingly retreats, revealing...",A pinch of oak spice joins the vanilla and she...,38.75,70.0,46.0,https://www.masterofmalt.com/whiskies/douglas-...
4,a6f33f754e0dbb178b5fef3ff0d031f470918b9477703c...,blended malt scotch,Monkey Shoulder Smokey Monkey,A peaty variant of the excellent Monkey Should...,"Honeydew melon, flamed orange peel, a touch of...","Vanilla sits at the core, its earthy notes bol...",Toffee Crisp bars and the last wafts of drying...,27.44,70.0,40.0,https://www.masterofmalt.com/whiskies/monkey-s...


In [33]:
df.to_csv("scotch-no-dupes.csv", index=False)

## Extracting to graph
An issue with whisky tasting notes, is each document usually doesn't consist of one word more than once.  Means traditional processing techniques don't necessarily extract the right keywords.
We create a network graph of keywords, where each word is a node, with edges being co-occurences in individual tasting notes, with each edge weighted by number of co-occurences.

Another issue, is that words have different meanings, such as `peat`.  In normal english, peat is a fuel, and a synonym might be grass, in whisky peat and grass are about as far from each other as can be possible.

A wordnet lemmatizer fails to lemmatize many words - need to build own lemmatizer.

### Functions for making corpus

In [34]:
# Make Corpus Function

# Removing Punctuation
import string
punct = string.punctuation+'’'

def makeCorpus(lst):
    out = ''
    for el in lst:
        out = out + el + '  '
    return out

def makeList(df, col):
    """
    Extracts 
    """
    out = []
    for row in range(len(df.index)-1):
        # Extracting cell
        row_str = df[col][row].lower()
        # Removing punctuation
        row_str = row_str.translate(str.maketrans(' ',' ',punct))
        out.append(row_str)
    return out

### Lemmatizer

In [35]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

# Extracting stop words
from nltk.corpus import stopwords
whisky_stopwords = ["nose", "palate", "finish", "doesnt", "eye", "touch", "note", "hint", "good","linger","lingers","alongside","mar"]
swords = set(stopwords.words("english") + whisky_stopwords)


class WhiskyLemmatizer(WordNetLemmatizer):
    '''
    An extension on the WordNet Lemmatizer with added context for whisky
    '''
        
    def __init__(self):
        self.whisky_words = {
            "peated": "peat",
            "peaty": "peat",
            "smokey": "smoke",
            "smoky": "smoke",
            "sherried": "sherry"
    }
    
    def lemmatize(self, word):
        # Caches lemmatized words to avoid lookups
        if word in self.whisky_words:
            out = self.whisky_words[word]
        else:
            tag = self.tag(word)
            out = super().lemmatize(word, pos=tag)
            self.whisky_words[word] = out
        return out
    
    def lemmatizeList(self, lst):
        return [self.lemmatize(w) for w in lst]
    
    def whiskySub(self, word):
        if word in self.whisky_words:
            return self.whisky_words[word]
        else:
            return word
    
    def tag(self, word):
        tag = pos_tag([word])
        tag = pos_tag([word])[0][1][0].lower()
        if tag == "v":
            return "v"
        if tag == "j":
            return "a"
        else:
            return "n"
        
        
    def __repr__(self):
        return "<WhiskyLemmatizer>"
    
lemmatizer = WhiskyLemmatizer()

def tokenFilter(corpus):
    tokens = lemmatizer.lemmatizeList(word_tokenize(corpus))
    filtered = [w for w in tokens if (not w in swords)]
    return filtered


### Graph making functions

In [36]:
def makeNodes(corpus):
    filtered = tokenFilter(corpus)
    nodes_dict = {}
    for word in filtered:
        if word in nodes_dict:
            nodes_dict[word] += 1
        else:
            nodes_dict[word] = 1
    
    node_names = list(nodes_dict.keys())
    nodes = []
    for name in node_names:
        nodes.append({
            "name":name,
            "degree":int(nodes_dict[name])
        })
    return nodes, node_names
        

def incrementEdge(edges, from_idx, to_idx):
    from_s, to_s = str(from_idx), str(to_idx)
    if from_s in edges:
        if to_s in edges[from_s]:
            edges[from_s][to_s] += 1
        else:
            edges[from_s][to_s] = 1
    else:
        edges[from_s] = {to_s : 1}
    return


def makeNoteEdges(note, nodes, edges):
    descs = tokenFilter(note)
    descs = [d for d in descs if d in nodes]
    node_idxs = [nodes.index(w) for w in descs if w in nodes]
    n_descs = len(descs)
    for note_idx1 in range(n_descs - 1):
        for note_idx2 in range(note_idx1+1, n_descs):
            if note_idx1 != note_idx2:
                node1 = node_idxs[note_idx1]
                node2 = node_idxs[note_idx2]
                from_idx = min(node1, node2)
                to_idx = max(node1, node2)
                incrementEdge(edges, from_idx, to_idx)
    return

        
def initialMakeEdges(lst, nodes):
    edges = {}
    for note in lst:
        makeNoteEdges(note, nodes, edges)
    return edges

def makeEdges(lst, names, verbose=False):
    init_edges = initialMakeEdges(lst, names)
    edges = []
    for start in init_edges.keys():
        for end in init_edges[start].keys():
            start_int, end_int = int(start), int(end)
            edge = {
                "from": start_int,
                "to": end_int,
                "weight": init_edges[start][end],
            }
            if verbose:
                # Add english description to edge.
                desc = {
                    "from": names[start_int],
                    "to": names[end_int]
                }
                edge["english"] = desc
            edges.append(edge)
    return edges
    

def makeGraph(corpus_list, verbose_edges=False, ):
    corpus = makeCorpus(corpus_list)
    
    nodes, names = makeNodes(corpus)
    edges = makeEdges(corpus_list, names, verbose_edges)
    
    graph = {
        "nodes": nodes,
        "edges": edges,
        "node-names": names
    }
    return graph

### Graph anal functions

In [37]:
def sortNodesByDegree(graph):
    """
    Sorts nodes by degree : requires each node to know its degree.  
    This should be the case as its degree is included when making graph
    """
    # Extracting nodes to data frame
    nodes = pd.DataFrame(graph["nodes"])
    
    # Renaming columns and sorting graph
    nodes.columns = ["Descriptor", "Degree"]
    nodes = nodes.sort_values("Degree",ascending=False)
    nodes = nodes.reset_index()
    return nodes[["Descriptor", "Degree"]]

def getUnrepresentedCount(corpus_list, nodes):
    """
    Function to find all descriptors which aren't on graph
    """
    descriptors = list(nodes["Descriptor"])
    
    n_unrepresented = 0
    
    for t_note in corpus_list:
        desc = tokenFilter(t_note)
        matches = [w in descriptors for w in desc]
        if True not in matches:
            n_unrepresented += 1
    
    return n_unrepresented

### Functions to add words to lemmatizer


In [38]:
def addToLemmatizer(nodes, depth):
    """
    Adds a rough stemming of each of the first `depth` nodes to the lemmatizer.
    """
    # getting unlemmatized common words
    list_descriptors = list(nodes["Descriptor"])
    describers_d = list_descriptors[:depth]
    idx1 = 0
    while idx1 < len(describers_d):
        idx2 = idx1 + 1
        word1 = describers_d[idx1]
        while idx2 < len(describers_d):
            word2 = describers_d[idx2]
            if word2[:len(word1)] == word1:
                describers_d.pop(idx2)
                break
            idx2 += 1
        idx1 += 1


    for word in describers_d:
        unlemma = [w for w in list_descriptors if w[:len(word)]==word]
        for unlemma_word in unlemma:
            lemmatizer.whisky_words[unlemma_word] = word

## Tasting Note Analysis
We follow the following process to analyse the tasting notes:

- Make preliminary graph based on hardcoded lemmatizer

- Based on degree, take each term and add a rough stem to the lemmatizer

- Remake the graph with improved lemmatizer

### Tasting Note Analysis : Nose

In [39]:
# Extracting list of tasting notes from word
nose_list = makeList(df.dropna().reset_index(), "Nose")

# Making a preliminary graph and extracting words
nose_graph_prelim = makeGraph(nose_list, verbose_edges=True)
nose_deg = sortNodesByDegree(nose_graph_prelim)

In [40]:
## Noticed that when using len_deg we got some intersting 
## results - instead looking at short words, all seem 
## weird - adding to stopwords
for word in list(nose_deg["Descriptor"]):
    if len(word) < 3:
        print(word)
        
for word in list(nose_deg["Descriptor"]):
    if len(word) < 3:
        swords.add(word)
        
# Improving lemmatizer based on 500 stemmed common words
addToLemmatizer(nose_deg, 200)

–
au
de
le
go
px
u
se
9
‘
18
ol
10
ba
oh
ii
15
n
12
30


In [41]:
# Remaking nose graph
nose_graph = makeGraph(nose_list, verbose_edges=True)
nose_degree = sortNodesByDegree(nose_graph)

Now that we have a list of lemmatized nose descriptors (minus stopwords), we can apply the same processs to palate and finish
### Tasting Note Analysis : Palate

In [42]:
# Extracting list of tasting notes from word
palate_list = makeList(df.dropna().reset_index(), "Palate")

# Making a preliminary graph and extracting words
palate_graph_prelim = makeGraph(palate_list, verbose_edges=True)
palate_deg = sortNodesByDegree(palate_graph_prelim)

# Improving lemmatizer based on 500 stemmed common words
addToLemmatizer(palate_deg, 200)

In [43]:
for word in list(palate_deg["Descriptor"]):
    if len(word) < 3:
        print(word)
for word in list(palate_deg["Descriptor"]):
    if len(word) < 3:
        swords.add(word)

mr
8
90
42
ra
el
“
”


In [44]:
# Remaking palate graph
palate_graph = makeGraph(palate_list, verbose_edges=True)
palate_degree = sortNodesByDegree(palate_graph)

### Tasting Note Analysis : Finish

In [45]:
# Extracting list of tasting notes from word
finish_list = makeList(df.dropna().reset_index(), "Finish")

# Making a preliminary graph and extracting words
finish_graph_prelim = makeGraph(finish_list, verbose_edges=True)
finish_deg = sortNodesByDegree(finish_graph_prelim)

In [46]:
## Noticed that when using len_deg we got some intersting 
## results - instead looking at short words, all seem 
## weird - adding to stopwords
for word in list(finish_deg["Descriptor"]):
    if len(word) < 3:
        print(word)

In [47]:
addToLemmatizer(finish_deg, 200)

In [48]:
# Remaking finish graph
finish_graph = makeGraph(finish_list, verbose_edges=True)
finish_degree = sortNodesByDegree(finish_graph)

### Saving lemmatizer dictionary to json


In [49]:
import json
ldict = lemmatizer.whisky_words
ldict["pepper"] = "pepper"
ldict["peppermint"] = "peppermint"
lemmatizer.whisky_words = ldict
ldict
with open("whiskynlp/whisky_lemmatizer_dict.json", "w") as out:
    json.dump(ldict, out)

In [50]:
swords_l = list(swords)
swords_j = {"swords":swords_l}
with open("whiskynlp/stopwords.json", "w") as out:
    json.dump(swords_j, out)

### Tasting Note Analysis : Remaking graphs with updated lemmatizer

In [51]:
# Remaking nose graph
nose_graph = makeGraph(nose_list, verbose_edges=True)
nose_degree = sortNodesByDegree(nose_graph)

# Remaking palate graph
palate_graph = makeGraph(palate_list, verbose_edges=True)
palate_degree = sortNodesByDegree(palate_graph)

# Remaking finish graph
finish_graph = makeGraph(finish_list, verbose_edges=True)
finish_degree = sortNodesByDegree(finish_graph)

In [52]:
cuts = np.arange(10, 160, 10)

len_nose = len(nose_list)
len_palate = len(palate_list)
len_finish = len(finish_list)

for n in cuts:
    nose_val = getUnrepresentedCount(nose_list, nose_degree[:n])
    palate_val = getUnrepresentedCount(palate_list, palate_degree[:n])
    finish_val = getUnrepresentedCount(finish_list, finish_degree[:n])
    print(f"N of Words: {n}")
    print(f"Nose : {nose_val} unrepresented ({100 * nose_val / len_nose}%)")
    print(f"Palate : {palate_val} unrepresented ({100 * palate_val / len_palate}%)")
    print(f"Finish : {finish_val} unrepresented ({100 * finish_val / len_finish}%)")
    print()

N of Words: 10
Nose : 920 unrepresented (18.81775414195132%)
Palate : 752 unrepresented (15.381468602986295%)
Finish : 1628 unrepresented (33.29924319901821%)

N of Words: 20
Nose : 280 unrepresented (5.727142564941706%)
Palate : 196 unrepresented (4.008999795459194%)
Finish : 733 unrepresented (14.992841071793823%)

N of Words: 30
Nose : 128 unrepresented (2.618122315401923%)
Palate : 77 unrepresented (1.5749642053589692%)
Finish : 457 unrepresented (9.347514829208427%)

N of Words: 40
Nose : 62 unrepresented (1.2681529965228062%)
Palate : 45 unrepresented (0.9204336265084885%)
Finish : 310 unrepresented (6.340764982614031%)

N of Words: 50
Nose : 30 unrepresented (0.6136224176723256%)
Palate : 26 unrepresented (0.5318060953160155%)
Finish : 212 unrepresented (4.336265084884435%)

N of Words: 60
Nose : 21 unrepresented (0.42953569237062794%)
Palate : 21 unrepresented (0.42953569237062794%)
Finish : 164 unrepresented (3.3544692166087136%)

N of Words: 70
Nose : 15 unrepresented (0.3068

Based on a quick look, 100 words for each seems to get a very good coverage of dataset

## Building a vectoriser

In [53]:
from whiskynlp.WhiskyLemmatizer import WhiskyLemmatizer

lsta = nose_list[:6]
features = ["marmalade", "ginger", "malt", "peated"]

class ListFeatureVectorizer:
    def __init__(self, features):
        self.Lemmatizer = WhiskyLemmatizer()
        self.features = self.Lemmatizer.lemmatizeList(features)
    
    def fit(self, input_list):
        list_vec = []
        
        
        for doc in input_list:
            tokenized = self.Lemmatizer.tokenFilter(doc)
            list_vec.append(
                [tokenized.count(w) for w in self.features]
            )
            
        outdf = pd.DataFrame(list_vec, columns=self.features)
        return outdf
    
    def __repr__(self):
        return "<ListFeatureVectorizer>"

In [54]:
lfv = ListFeatureVectorizer(features)
lfv.fit(lsta)

Unnamed: 0,marmalade,ginger,malt,peat
0,1,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,0,0,0
4,0,0,0,0
5,0,0,0,1


## Naive HCA Clustering with little preprocessing beyond vectorising
Naive exploration.  Will amalgamate nose, palate and finish as one.

In [55]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
# Getting features - 207 features
n = 150

nose_features = list(nose_degree["Descriptor"])
palate_features = list(palate_degree["Descriptor"])
finish_features = list(finish_degree["Descriptor"])

naive_features = list(
    set(
        nose_features[:n] +
        palate_features[:n] +
        finish_features[:n]

    )
)

In [56]:
# Processing data frame to get nose, palate and finish all in one column.
df["All"] = df.Nose + df.Palate + df.Finish
n_df = df.dropna(subset=["All"]).reset_index()
all_notes = makeList(n_df, "All")


In [57]:
naive_lfv = ListFeatureVectorizer(naive_features)
vectorised = naive_lfv.fit(all_notes)

In [58]:
vectorised["ID"] = n_df.ID
vectorised["Name"] = n_df.Name
vectorised_features = vectorised.drop(["ID","Name"], axis=1)

In [None]:
hca =  AgglomerativeClustering(
    n_clusters=12
   
)
hca.fit(vectorised_features)

In [None]:
hca.labels_

In [None]:
vectorised["HCA"] = hca.labels_

In [None]:
vectorised.head()

In [None]:
def bottleDetailsFromURL(URL, alg):
    """
    Function for pure laziness... Because apparently I can't remember an md5 hash off the top of my head...
    """
    bottle_id = n_df[n_df["URL"] == URL]["ID"].values[0]
    name = n_df[n_df["URL"] == URL]["Name"].values[0]
    cluster = vectorised[vectorised.ID == bottle_id][alg].values[0]
    out = {
        "id":bottle_id,
        "name":name,
        "cluster":cluster
    }
    return out

def getClusterNames(cluster, alg):
    return list(vectorised[vectorised[alg] == cluster].Name)

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/laphroaig-10-year-old-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/ardbeg/ardbeg-10-year-old-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/highland-park/highland-park-12-year-old-viking-honour-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/springbank/springbank-12-year-old-cask-strength-56-3-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/talisker/talisker-10-year-old-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/talisker/talisker-storm-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/the-macallan-12-year-old-sherry-oak-whisky/","HCA")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/glenfiddich-12-year-old-whisky/","HCA")

In [None]:
getClusterNames(1,"HCA")

In [None]:
getClusterNames(5,"HCA")

## Naive kmeans Clustering with little preprocessing beyond vectorising
Naive exploration.  Will amalgamate nose, palate and finish as one.

In [None]:
from sklearn.cluster import KMeans
kmeans =  KMeans(
    n_clusters=8,
    verbose=1,
    n_init=100,
    max_iter=1000
)
kmeans.fit(vectorised_features)

In [None]:
vectorised["kmeans"] = kmeans.labels_
vectorised.head()

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/laphroaig-10-year-old-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/ardbeg/ardbeg-10-year-old-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/talisker/talisker-10-year-old-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/talisker/talisker-skye-whisky/", "kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/talisker/talisker-storm-whisky/", "kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/springbank/springbank-12-year-old-cask-strength-56-3-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/highland-park/highland-park-12-year-old-viking-honour-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/the-macallan-12-year-old-sherry-oak-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/glenfiddich-12-year-old-whisky/","kmeans")

In [None]:
bottleDetailsFromURL("https://www.masterofmalt.com/whiskies/balvenie/balvenie-doublewood-12-year-old-whisky/","kmeans")

In [None]:
hca_islay = getClusterNames(1,"HCA")
km_islay = getClusterNames(6,"kmeans")
print(f"Smoky HCA : {len(hca_islay)}")
print(f"Smoky km : {len(km_islay)}")

In [None]:
km_islay

In [None]:
getClusterNames(0,"kmeans")

## Attempt at Elbow Method optimising k-means

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='darkgrid')

## Gaussian Mixture

In [None]:
from sklearn.cluster import KMeans
kmeans =  KMeans(
    n_clusters=8,
    verbose=1,
    n_init=100,
    max_iter=1000
)
kmeans.fit(vectorised_features)
vectorised["kmeans"] = kmeans.labels_
vectorised.head()