# Clustering

How can we "cluster" texts together based on similar vocabulary, or other features we calculate?

## Preliminaries

In [None]:
# import some things
import os
import pandas as pd
from textblob import TextBlob
from matplotlib import pyplot as plt
pd.set_option("display.max_rows", 20)

In [None]:
# Set text folder and metadata path

# Directory holding the text files; make_dtm_from_df joins each filename onto this path
text_folder = '../corpora/tropic_of_orange/texts'
# Excel spreadsheet with one metadata row per text, loaded below with pd.read_excel
path_to_metadata='../corpora/tropic_of_orange/metadata.xls'

In [None]:
# Load metadata
df_meta = pd.read_excel(path_to_metadata)
df_meta

In [None]:
# Build a short human-readable label for every row:
# the narrator's first name followed by the chapter in parentheses
labels = [
    '%s (%s)' % (row['narrator'].split()[0], str(row['chapter']))
    for _, row in df_meta.iterrows()
]
df_meta['label'] = labels
df_meta

In [None]:
# Let's also set the 'label' as the index
df_meta=df_meta.set_index('label',drop=False)    # drop=False means that 'label' is preserved as a column, as well as acting as the new index
df_meta

In [None]:
# A function to make a document-term matrix
# FROM a df_meta object

def make_dtm_from_df(df_meta,n_top_words=1000,normalize=True,filename_col='fn',no_cap_words=True,exclude_words=None,text_dir=None,tokenizer=None):
    """Build a document-term matrix with one row per text listed in df_meta.

    Parameters:
        df_meta: DataFrame whose `filename_col` column holds text filenames;
            its index values become the row labels of the result.
        n_top_words: keep only this many of the corpus-wide most frequent words.
        normalize: if True, cells are term frequencies per 1000 tokens
            (count / tokens in the text * 1000); if False, cells are raw counts.
        filename_col: name of the column holding the filenames.
        no_cap_words: if True, skip every token that contains an uppercase letter.
        exclude_words: iterable of (lowercase) words to leave out; None means none.
        text_dir: folder containing the files; None falls back to the
            notebook-level `text_folder`.
        tokenizer: callable turning the raw text into a sequence of tokens;
            None falls back to `TextBlob(txt).words`.

    Returns:
        DataFrame indexed by the df_meta labels (index name 'index') with one
        column per retained word; words missing from a text are 0.

    Rows whose filename does not end in '.txt' are skipped.
    """
    from collections import Counter

    # None instead of a mutable [] default; a set gives O(1) membership tests
    stopwords = set(exclude_words) if exclude_words is not None else set()
    if text_dir is None:
        text_dir = text_folder
    if tokenizer is None:
        tokenize = lambda txt: TextBlob(txt).words
    else:
        tokenize = tokenizer

    # Row labels are kept apart from the word dictionaries: a text containing
    # the word "index" or "fn" must not overwrite the row's identity.
    row_labels = []
    rows = []

    # corpus-wide counts, used to pick the top words
    all_counts = Counter()

    # for each filename
    for i,fn in enumerate(df_meta[filename_col]):
        if not i%10: print('>> looping through #',i,'of',len(df_meta),'files:',fn)
        # make sure is a text file
        if not fn.endswith('.txt'): continue

        # read the text
        full_path = os.path.join(text_dir,fn)
        with open(full_path) as file:
            txt=file.read()

        tokens = list(tokenize(txt))
        num_words = len(tokens)

        # counts for this text, keyed by lowercase word
        doc_counts = Counter()
        for token,count in Counter(tokens).items():
            # skip empty tokens and punctuation/numbers
            if not token or not token[0].isalpha(): continue

            word = token.lower()

            # skip capitalized words?
            if no_cap_words and token!=word: continue

            # stopwords are matched on the lowercase form, so "The" is caught too
            if word in stopwords: continue

            # accumulate, so that "Cat" and "cat" add up instead of overwriting
            doc_counts[word]+=count

        all_counts.update(doc_counts)

        if normalize:
            # term frequency: count divided by number of tokens in this text
            row = {word: count / num_words for word,count in doc_counts.items()}
        else:
            row = dict(doc_counts)

        row_labels.append(df_meta.index[i])
        rows.append(row)

    # Get the most frequent words across all texts
    word_columns = [word for word,_ in all_counts.most_common(n_top_words)]
    print('>> top',n_top_words,'words:',word_columns)

    # Make dataframe
    df = pd.DataFrame(rows, index=row_labels, columns=word_columns).fillna(0)
    df.index.name = 'index'

    # Only frequencies are rescaled to "per 1000 words"; raw counts stay raw
    if normalize:
        df = df * 1000

    return df

In [None]:
# get stopwords
from nltk.corpus import stopwords
stopword_list=stopwords.words('english')

In [None]:
# add anything to stopwords?
stopword_list.append('us')

In [None]:
# Make the document term matrix
# Rows are labelled by df_meta's index; columns are the 500 most frequent words
# left after dropping stopwords and any token containing an uppercase letter.
# With normalize=True the values are term frequencies scaled by 1000.
dtm = make_dtm_from_df(df_meta,normalize=True,n_top_words=500,exclude_words=stopword_list,no_cap_words=True)

In [None]:
dtm

In [None]:
# Merge with metadata
# lsuffix='_meta' is appended to any metadata column whose name collides with a word column of the DTM
dtm_meta = df_meta.join(dtm,lsuffix='_meta')           # join because both indices are identical
dtm_meta

## Distance matrix

We can think about "distance" between documents in the DTM space.

### 2-D distance

In [None]:
# Source: https://stackoverflow.com/questions/15910019/annotate-data-points-while-plotting-from-pandas-dataframe/15911372#15911372

def label_point(x, y, val, ax):
    """Annotate every point on `ax` with its label.

    `x`, `y` and `val` are pandas Series; they are aligned on their index and,
    for each resulting row, str(val) is written at position (x, y) via ax.text.
    """
    points = pd.concat([x, y, val], axis=1, keys=['x', 'y', 'val'])
    for _, row in points.iterrows():
        ax.text(row['x'], row['y'], str(row['val']))

In [None]:
def biplot(df, x_col, y_col, label_col=None):
    """Scatter-plot column `y_col` against column `x_col` of `df`.

    Both axes run from 0 to the largest value found in the respective column.
    When `label_col` is given, each point is annotated with that column's value.
    """
    x_limits = (0, max(df[x_col]))
    y_limits = (0, max(df[y_col]))
    axes = df.plot(
        kind='scatter',
        x=x_col,
        y=y_col,
        xlim=x_limits,
        ylim=y_limits,
        figsize=(10,10),
    )
    if not label_col:
        return
    label_point(df[x_col], df[y_col], df[label_col], axes)

In [None]:
biplot(dtm_meta,'said','knew','label')

In [None]:
def biplot_groups(df, x_col, y_col, group_col=None, label_col=None, figsize=(10,10)):
    """Scatter-plot `y_col` against `x_col`, one marker series per group.

    Parameters:
        df: DataFrame holding the data.
        x_col, y_col: names of the columns plotted on the x and y axes.
        group_col: column whose values define the groups (one legend entry
            each); None plots all rows as a single series without a legend.
        label_col: column whose values annotate the points; None draws no
            annotations.
        figsize: matplotlib figure size in inches.
    """
    # groupby(None) raises, so an absent group column means "one group: everything"
    if group_col is None:
        groups = [(None, df)]
    else:
        groups = df.groupby(group_col)

    # Plot
    fig, ax = plt.subplots(figsize=figsize)
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
    for name, group in groups:
        ax.plot(group[x_col], group[y_col], marker='o', linestyle='', ms=8, label=name)
        if label_col is not None:
            label_point(group[x_col], group[y_col], group[label_col], ax)

    # a legend only makes sense when there are named groups
    if group_col is not None:
        ax.legend()
    ax.set_xlabel(x_col, fontsize=16)
    ax.set_ylabel(y_col, fontsize=16)

    plt.show()
    

In [None]:
biplot_groups(dtm_meta, 'said', 'knew', 'narrator', 'label')

In [None]:
dtm_2d = dtm[['said','knew']]
dtm_2d

In [None]:
from scipy.spatial.distance import squareform, pdist
from scipy.stats import zscore

In [None]:
# Getting distances:
# for more info on pdist: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

#pdist(dtm)
#squareform(pdist(dtm))
#pd.DataFrame(squareform(pdist(dtm)))
#pd.DataFrame(squareform(pdist(dtm)), columns=dtm.index, index=dtm.index)

In [None]:
def make_dist(X_dtm,dist_metric='euclidean',standardize=False):
    """Return the square matrix of pairwise distances between the rows of X_dtm.

    Parameters:
        X_dtm: DataFrame with one row per document and one column per feature.
        dist_metric: any metric name accepted by scipy's pdist.
        standardize: if True, z-score every column (mean 0, population standard
            deviation 1) before measuring distances, so that no single
            high-magnitude feature dominates. Columns with zero variance
            become all zeros; missing values are also treated as 0.

    Returns:
        DataFrame whose index and columns are both X_dtm.index.
    """
    features = X_dtm
    if standardize:
        centered = X_dtm - X_dtm.mean()
        # constant columns give 0/0 = NaN; they carry no information, so use 0
        features = (centered / X_dtm.std(ddof=0)).fillna(0)
    distmatrix=pdist(features,metric=dist_metric)
    return pd.DataFrame(squareform(distmatrix), columns=X_dtm.index, index=X_dtm.index)

In [None]:
dtm_2d_dist = make_dist(dtm_2d)

In [None]:
dtm_2d_dist['Rafaela (1)'].sort_values()

### N-D Distance

In [None]:
# How many dimensions?
len(dtm.columns)

In [None]:
# Distance matrix

dtm_dist = make_dist(dtm)
dtm_dist

In [None]:
dtm_dist['Rafaela (1)'].sort_values()

In [None]:
dtm_dist['Bobby (2)'].sort_values()

In [None]:
dtm_dist['Manzanar (19)'].sort_values()

In [None]:
dtm_dist['Manzanar (5)'].sort_values()

## Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Here it is: in a single line, compute a hierarchical clustering of the DTM

hclust = linkage(dtm,method='complete')
hclust

In [None]:
def fancy_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram and annotate its merge points.

    Accepts everything `dendrogram` accepts, plus two extra keyword arguments:
        max_d: cut-off height; drawn as a horizontal line and, unless
            `color_threshold` is given explicitly, also used as colour threshold.
        annotate_above: only merges higher than this value get a marker and
            a distance annotation (default 0).

    Returns the dictionary produced by `dendrogram`.
    """
    cutoff = kwargs.pop('max_d', None)
    min_height = kwargs.pop('annotate_above', 0)
    if cutoff and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = cutoff

    ddata = dendrogram(*args, **kwargs)

    # nothing to decorate when the caller asked for data only
    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')

    merges = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in merges:
        height = ys[1]
        if not height > min_height:
            continue
        # horizontal middle of the bar joining the two merged branches
        middle = 0.5 * (xs[1] + xs[2])
        plt.plot(middle, height, 'o', c=colour)
        plt.annotate("%.3g" % height, (middle, height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')

    if cutoff:
        plt.axhline(y=cutoff, c='k')
    return ddata

In [None]:
def plot_dendrogram(dtm,linkage_method='complete',save_path='hclust_dendrogram.pdf',figsize=(20, 8)):
    """Cluster the rows of `dtm` hierarchically and draw the dendrogram.

    Parameters:
        dtm: DataFrame with one row per document; its index labels the leaves.
        linkage_method: linkage criterion passed to scipy's `linkage`.
        save_path: file the figure is written to before it is shown;
            pass None (or '') to skip saving.
        figsize: matplotlib figure size in inches.

    The title and axis labels are the ones set by `fancy_dendrogram`.
    """
    hclust = linkage(dtm,method=linkage_method)

    # fancy_dendrogram sets title and axis labels itself, so none are set here
    plt.subplots(figsize=figsize)
    fancy_dendrogram(
        hclust,
        show_leaf_counts=False,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
        # NOTE(review): a plain list is passed because some SciPy versions appear
        # to truth-test `labels`, which is ambiguous for a pandas Index - confirm
        labels=list(dtm.index),
    )
    if save_path:
        plt.savefig(save_path)
    plt.show()


In [None]:
plot_dendrogram(dtm)

In [None]:
# TFIDF function
def to_tfidf(dtm):
    """Convert a document-term matrix into tf-idf weights.

    Parameters:
        dtm: DataFrame with one row per document and one column per word,
            holding term frequencies (or counts).

    Returns:
        A new DataFrame of the same shape, index and columns in which every
        cell is tf * idf, with idf = ln(number of documents / number of
        documents where the word's value is > 0). A word that occurs in no
        document gets idf 0 (instead of a division by zero), so its column
        is all zeros.
    """
    import numpy as np

    num_docs = len(dtm)

    # per word: in how many documents does it occur at all?
    docs_with_word = (dtm > 0).sum(axis=0)

    # mask zero document frequencies so the division yields NaN, then use 0
    idf = np.log(num_docs / docs_with_word.where(docs_with_word > 0)).fillna(0.0)

    # scale every column by its idf in one vectorized step
    return dtm.mul(idf, axis='columns')

In [None]:
dtm_tfidf = to_tfidf(dtm)
dtm_tfidf

In [None]:
dtm_tfidf.loc['Manzanar (46)'].nlargest(20)

In [None]:
dtm_tfidf.loc['Arcangel (36)'].nlargest(20)

In [None]:
dtm_tfidf.loc['Bobby (49)'].nlargest(20)

In [None]:
dtm_tfidf.loc['Gabriel (6)'].nlargest(20)

In [None]:
dtm_tfidf.loc['Gabriel (17)'].nlargest(20)

## t-SNE

[t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) is a popular method of [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction). For more information, [see here](https://www.datacamp.com/community/tutorials/introduction-t-sne).

In [None]:
def tsne(datadf,df_dist=None,n_components=2,resultdf=None):
    """Embed the rows of `datadf` with t-SNE and return the coordinates.

    Parameters:
        datadf: DataFrame with one row per item; its index labels the result.
        df_dist: precomputed distance DataFrame; when None it is computed
            from `datadf` with `make_dist`.
        n_components: number of embedding dimensions.
        resultdf: DataFrame to receive the coordinate columns (modified in
            place); when None a new frame indexed like `datadf` is created.

    Returns:
        `resultdf` with columns 'tsne_V1', 'tsne_V2', ... added.
    """
    if df_dist is None:
        df_dist = make_dist(datadf)

    from sklearn.manifold import TSNE
    reducer = TSNE(n_components=n_components, random_state=0)
    # the rows of the distance matrix are used as the feature vectors
    embedding = reducer.fit_transform(df_dist.values)

    # one embedded point per entry of datadf.index, in index order
    points = [embedding[pos] for pos in range(len(datadf.index))]

    if resultdf is None:
        resultdf = pd.DataFrame(index=datadf.index)

    # transpose the points into one coordinate list per dimension
    for dim, coords in enumerate(zip(*points), start=1):
        resultdf['tsne_V' + str(dim)] = list(coords)
    return resultdf

In [None]:
dtm_tsne = tsne(dtm)

In [None]:
dtm_tsne_meta = dtm_tsne.join(dtm_meta)

In [None]:
biplot_groups(dtm_tsne_meta, 'tsne_V1', 'tsne_V2', 'narrator', 'label')