# Text Clustering
In this document, I will cluster text data for violation from across many cities in the state to see if it creates a reasonable categories to use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

import spacy
import en_core_web_lg
from collections import Counter
import nltk
from nltk.corpus import stopwords
import string
import re

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('precision', 4)
pd.options.display.max_seq_items = 100
pd.options.display.max_columns = 50
plt.style.use('fivethirtyeight')

  import pandas.util.testing as tm


In [2]:
with open ('DATA/text.txt', 'rb') as filepath:
    text_list = pickle.load(filepath)

In [3]:
len(text_list)

75996170

It's a huge dataset, we should be careful dealing with them.

In [3]:
df = pd.DataFrame(text_list, columns=['violations'])

## Text Preprocessing


In [12]:
pattern = "([a-zA-Z]+[\&[a-zA-Z]+]?)"
p = re.compile(pattern)

In [13]:
df['violations'] = df.violations.map(lambda x: ' '.join(p.findall(str(x).upper())))

In [14]:
# check for duplicates
df.drop_duplicates(inplace = True)

In [15]:
df.shape

(439438, 1)

That's substantially less, so that's great! Let's save them out.

In [16]:
with open("DATA/text_clean.txt", "wb") as filepath:
    pickle.dump(df.violations.values, filepath)

In [17]:
tmp = [x.lower() for x in list(df.violations)]

In [22]:
nlp = en_core_web_lg.load()

In [10]:
def replace_texts(list_of_text, replacement_dict):
    for i, text in enumerate(list_of_text):
        text = text.split()
        for j, te in enumerate(text):
            if te in replacement_dict: 
                text[j] = replacement_dict[te]
        list_of_text[i] = ' '.join(text)
    return list_of_text

In [11]:
def consolidate_words(textlist, 
                      n = 1, 
                      thresh = .8, 
                      nlp = None, 
                      replace = False,
                      remove = False, 
                      verbose = True):
    '''
    Takes a list of texts and change words that occur 
    n times or less to similar word in the text
    Return a new list or a replacement dictionary
    Requires Spacy
    
    Keyword arguments:
    textlist -- an array of texts
    n        -- int, maximum occurrence to replace or remove (default = 1)
    thresh   -- float (0 < thresh < 1), minimum similarity required to replace (default = .8)
    nlp      -- Spacy module, if none uses en_core_web_lg from Spacy (default = None)
    replace  -- if true, it returns a new textlist, or it returns a replacement dictionary
    remove   -- if true, it removes the low-occurrence words if there's no replacement
    verbose  -- if True, returns all word changes (default True))
    '''
    if nlp == None:
        nlp = en_core_web_lg.load()
    
    list_of_text = textlist.copy()
    
    wordcounts = Counter(' '.join(list_of_text).split())    
    low_words = [k for k, v in wordcounts.items() if v <= n]
    other_words = [k for k, v in wordcounts.items() if v > n]
    
    tokens = nlp(' '.join(other_words))
    
    replacement_dict = {}
    
    for word in low_words:

        word_token = nlp(word)
        max_similarity = thresh

        for tk in tokens:
            # find the maximum similarity above threshold
            sim_score = word_token.similarity(tk)

            if 1 > sim_score > max_similarity:
                replacement_dict[word] = tk.text
                max_similarity = sim_score
                            
        try:
            if verbose: 
                print(word, '->', replacement_dict[word])
            else: 
                continue
                
        except KeyError:
            if remove: 
                replacement_dict[word] = ''
            else:
                continue

    if replace:
        list_of_text = replace_texts(list_of_text, replacement_dict)
        return list_of_text
    
    else:
        return replacement_dict

In [None]:
violations = consolidate_words(tmp, n = 50, nlp = nlp, replace = True, remove = True)

In [14]:
with open("DATA/text_clean.txt", "wb") as filepath:
    pickle.dump(violations, filepath)

In [4]:
with open("DATA/text_clean.txt", "rb") as filepath:
    violations = pickle.load(filepath)

### Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer

In [6]:
# lemmatization
wnl = WordNetLemmatizer()

In [7]:
wordcounts = Counter(' '.join(violations).split())

In [8]:
replace_dict = {}
for k in wordcounts.keys():
    rep = wnl.lemmatize(k)
    if rep != k: 
        replace_dict[k] = wnl.lemmatize(k)

In [12]:
violations = replace_texts(violations, replace_dict)

### Unique words
Let's get a set of unique words and make some final edits.

In [13]:
wordcounts = Counter(' '.join(violations).split())

In [14]:
unique_words = list(wordcounts.keys())

### Stopwords Removal
Let's remove some stopwords.

In [15]:
from nltk.corpus import stopwords

In [16]:
sw = stopwords.words('English')

In [17]:
unique_words = [x for x in unique_words if x not in sw]

In [18]:
len(unique_words)

1673

## Clusterings
### Unique Words Clusterings
First I'll cluster words based on their similarity. First, let's get their word vectors.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

In [19]:
def get_vectors(word): 
    token = nlp(word)
    if np.sum(token.vector) == 0: 
        return np.nan
    return token.vector

In [20]:
unique_vecs = pd.DataFrame()

In [23]:
for word in unique_words:
    vecs = get_vectors(word)
    if vecs is not np.nan:
        unique_vecs[word] = vecs

In [24]:
unique_vecs = unique_vecs.transpose()

In [51]:
words = list(unique_vecs.index)
sc_vecs = StandardScaler().fit_transform(unique_vecs)

#### Hierarchical Agglomerative Clustering

In [92]:
n = 20

In [93]:
clusters = AgglomerativeClustering(n_clusters = n)
pred = clusters.fit_predict(unique_vecs)

In [94]:
result = pd.DataFrame(list(zip(words, pred)), columns = ['words', 'cluster'])

In [None]:
for i in range(n): 
    print("\n")
    print(f"Cluster {i}\n{result[result.cluster == i].words.values}")

Let's see how individual clusters are like.

Some clusters are not decipherable, let's correct them. Then we will cluster based on the average of each document.

In [101]:
result_a = result[result.cluster != 14]
result_a = result[result.cluster != 13]

In [105]:
result_a[result.cluster == 17].words.values

  """Entry point for launching an IPython kernel.


array(['susp', 'lic', 'fty', 'reqd', 'viol', 'prohib', 'veh', 'imp',
       'inj', 'stopsign', 'phys', 'ped', 'drv', 'bac', 'emerg', 'disp',
       'traf', 'loc', 'proj', 'laned', 'driv', 'agg', 'diplay',
       'pedestrains', 'regis', 'emer', 'respon', 'misd', 'oper', 'priv',
       'lft', 'vio', 'frt', 'cmv', 'csp', 'nontransparent', 'vin',
       'flagman', 'sched', 'maint', 'arrestable', 'reddi', 'fel', 'contr',
       'bef', 'dri', 'lts', 'opr', 'revok', 'yld', 'alch', 'pip', 'chg',
       'prov', 'fld', 'zon', 'bal', 'beh', 'resi', 'pol', 'pres', 'fac',
       'scen', 'unknow', 'hom', 'cty', 'pers', 'intersec', 'unattend',
       'consum', 'alc', 'obscur', 'rcpt', 'circum', 'owi', 'crd', 'cad',
       'ovr', 'spee', 'curent', 'lpl', 'trl', 'pbt', 'fatt', 'carless',
       'ratt', 'plt', 'ndl', 'prf', 'ckd', 'oc', 'indef', 'ife', 'coa',
       'vlt', 'lgt', 'asr', 'fao', 'rect', 'fst', 'ivc', 'engr',
       'hallucinogenics', 'cmb', 'protec', 'adrs'], dtype=object)

In [109]:
[x for x in result_a.words if x.startswith('stop')]

['stop', 'stopsign', 'stopping', 'stopped']

In [None]:
{'susp': 'suspend', 'viol': 'violation', 'prohib': 'prohibit', 'veh': 'vehicle', 'stopsign': 'stop', 'pedestrains': 'pedestrians', 
''}

### Document based clustering
Now I'll get the average vector of each sentence and cluster documents.

In [None]:
# todo - break it out 
violations_vecs = np.empty((300,1))

for docs in violations:
    vecs = get_vectors(docs)
    try:
        violations_vecs = np.concatenate((violations_vecs, vecs.reshape((300,1))), axis = 1)
    except AttributeError: 
        violations_vecs = np.concatenate((violations_vecs, np.zeros((300,1))), axis = 1)
violations_vecs = violations_vecs[:, 1:]    

In [243]:
violations_vecs.shape

(300, 14)