# Text Clustering
In this document, I will cluster text data for violation from across many cities in the state to see if it creates a reasonable categories to use for analysis.

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

pd.set_option('precision', 4)
pd.options.display.max_seq_items = 100
pd.options.display.max_columns = 50
plt.style.use('fivethirtyeight')

In [37]:
with open ('DATA/text.txt', 'rb') as filepath:
    text_list = pickle.load(filepath)

In [39]:
len(text_list)

75996170

It's a huge dataset, we should be careful dealing with them.

In [30]:
df = pd.DataFrame(text_list, columns=['violations'])

## Text Preprocessing


In [31]:
import nltk
from nltk.corpus import stopwords
import string
import re

In [32]:
pattern = "([a-zA-Z]+[\&[a-zA-Z]+]?)"
p = re.compile(pattern)

In [33]:
df['violations'] = df.violations.map(lambda x: ' '.join(p.findall(str(x).upper())))

In [34]:
# check for duplicates
df.drop_duplicates(inplace = True)

In [35]:
df.shape

(439438, 1)

That's substantially less, so that's great! Let's save them out.

In [66]:
with open("DATA/text_clean.txt", "wb") as filepath:
    pickle.dump(df.violations.values, filepath)

In [16]:
import spacy
import en_core_web_lg
from collections import Counter

In [17]:
tmp = [x.lower() for x in list(df.violations)]

In [18]:
nlp = en_core_web_lg.load()

In [19]:
def consolidate_words(textlist, n = 1, thresh = .8, nlp = None, replace = False, manual_spellcheck = False):
    '''
    Takes a list of texts to run iteration
    Change word that occurs only once to similar word in the text
    Return consolidated list 
    '''
    if nlp == None:
        nlp = en_core_web_lg.load()
    
    list_of_text = textlist.copy()
    
    wordcounts = Counter(' '.join(list_of_text).split())    
    words_once = [k for k, v in wordcounts.items() if v == n]
    other_words = [k for k, v in wordcounts.items() if v > n]
    
    tokens = nlp(' '.join(other_words))
    replacement_dict = {}
    
    for word in words_once:

        word_token = nlp(word)
        max_similarity = thresh

        for tk in tokens:
            # find the maximum similarity above threshold
            sim_score = word_token.text, tk.text, word_token.similarity(tk)
            if 1 > sim_score[2] > max_similarity:
                replacement_dict[word] = sim_score[1]
                max_similarity = sim_score[2]
        try:
            print(word, 'to', replacement_dict[word])
        except KeyError:
            if manual_spellcheck:
                ans = input(f'{word} does not have a replacement. If you have a suggesion, type the word, otherwise press spacebar')
                if ans == ' ':
                    continue
                else:
                    replacement_dict[word] = ans
            else:
                continue
    
    if replace:
        for i, text in enumerate(list_of_text):

            text = text.split()
            for j, te in enumerate(text):
                if te in replacement_dict: 
                    text[j] = replacement_dict[te]
            list_of_text[i] = ' '.join(text)

        return list_of_text
    
    else:
        return replacement_dict

In [None]:
new_tmp = consolidate_words(tmp, nlp = nlp, replace = True)

In [None]:
# running one more
new_tmp2 = consolidate_words(new_tmp, n = 2, thresh = .85, nlp = nlp, replace = True)

In [76]:
df['violations'] = new_tmp2

### Lemmatization

In [60]:
from nltk.stem import WordNetLemmatizer

In [62]:
# lemmatization
wnl = WordNetLemmatizer()

In [70]:
df['violations'] = df.violations.apply(lambda x: [wnl.lemmatize(word) for word in x.split()])

### Unique words
Let's get a set of unique words and make some final edits.

In [72]:
unique = set()

for text in df.violations:
    unique = unique | set(text)

In [None]:
# get word counts
counts = Counter(np.sum(df.violations.values))

### More corrections
It seems like some words don't have proper spacing between them I'll try to add them.

In [96]:
# Credit for below code goes to Generic Human 
#(https://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words) 

from math import log

# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
words = 
wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)

def infer_spaces(s):
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
        return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)

    # Build the cost array.
    cost = [0]
    for i in range(1,len(s)+1):
        c,k = best_match(i)
        cost.append(c)

    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i>0:
        c,k = best_match(i)
        assert c == cost[i]
        out.append(s[i-k:i])
        i -= k

    return " ".join(reversed(out))

FileNotFoundError: [Errno 2] No such file or directory: 'words-by-frequency.txt'

In [93]:
tm = nlp('warningdrivingtooslowinleftlane')

In [94]:
tm.vector

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

## Cleaning
First, I'll loop through all unique words and try to converge some of the similar ones. That will correct some typos.
Then I can look at clustering in two different ways
1. see individual word clustering
2. see the average coordinate clustering (average point of all words in each observation)

## Get Word Embeddings
Now I'll get the word embeddings for each words.