# Text Clustering
In this document, I will cluster text data for violation from across many cities in the state to see if it creates a reasonable categories to use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

import spacy
import en_core_web_lg
from collections import Counter
import nltk
from nltk.corpus import stopwords
import string
import re


pd.set_option('precision', 4)
pd.options.display.max_seq_items = 100
pd.options.display.max_columns = 50
plt.style.use('fivethirtyeight')

  import pandas.util.testing as tm


In [2]:
with open ('DATA/text.txt', 'rb') as filepath:
    text_list = pickle.load(filepath)

In [3]:
len(text_list)

75996170

It's a huge dataset, we should be careful dealing with them.

In [4]:
df = pd.DataFrame(text_list, columns=['violations'])

## Text Preprocessing


In [12]:
pattern = "([a-zA-Z]+[\&[a-zA-Z]+]?)"
p = re.compile(pattern)

In [13]:
df['violations'] = df.violations.map(lambda x: ' '.join(p.findall(str(x).upper())))

In [14]:
# check for duplicates
df.drop_duplicates(inplace = True)

In [15]:
df.shape

(439438, 1)

That's substantially less, so that's great! Let's save them out.

In [16]:
with open("DATA/text_clean.txt", "wb") as filepath:
    pickle.dump(df.violations.values, filepath)

In [17]:
tmp = [x.lower() for x in list(df.violations)]

In [12]:
nlp = en_core_web_lg.load()

In [26]:
def replace_texts(list_of_text, replacement_dict):
    for i, text in enumerate(list_of_text):
        text = text.split()
        for j, te in enumerate(text):
            if te in replacement_dict: 
                text[j] = replacement_dict[te]
        list_of_text[i] = ' '.join(text)
    return list_of_text

In [31]:
def consolidate_words(textlist, 
                      n = 1, 
                      thresh = .8, 
                      nlp = None, 
                      replace = False, 
                      manual_spellcheck = False, 
                      remove = False):
    '''
    Takes a list of texts to run iteration
    Change word that occurs only once to similar word in the text
    Return consolidated list 
    '''
    if nlp == None:
        nlp = en_core_web_lg.load()
    
    list_of_text = textlist.copy()
    
    wordcounts = Counter(' '.join(list_of_text).split())    
    words_once = [k for k, v in wordcounts.items() if v <= n]
    other_words = [k for k, v in wordcounts.items() if v > n]
    
    tokens = nlp(' '.join(other_words))
    replacement_dict = {}
    
    for word in words_once:

        word_token = nlp(word)
        max_similarity = thresh

        for tk in tokens:
            # find the maximum similarity above threshold
            sim_score = word_token.text, tk.text, word_token.similarity(tk)
            if 1 > sim_score[2] > max_similarity:
                replacement_dict[word] = sim_score[1]
                max_similarity = sim_score[2]
        try:
            print(word, 'to', replacement_dict[word])
        except KeyError:
            if manual_spellcheck:
                ans = input(f'{word} does not have a replacement. If you have a suggesion, type the word, otherwise press spacebar')
                if ans == ' ':
                    continue
                else:
                    replacement_dict[word] = ans
            if remove: 
                replacement_dict[word] = ''
            else:
                continue
    
    if replace:
        list_of_text = replace_texts(list_of_text, replacement_dict)
        return list_of_text
    
    else:
        return replacement_dict

In [None]:
violations = consolidate_words(tmp, n = 50, nlp = nlp, replace = True, remove = True)

In [14]:
with open("DATA/text_clean.txt", "wb") as filepath:
    pickle.dump(violations, filepath)

In [3]:
with open("DATA/text_clean.txt", "rb") as filepath:
    violations = pickle.load(filepath)

In [13]:
violations = consolidate_words(violations, n = 50, nlp = nlp, replace = True, remove = True)



### Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
# lemmatization
wnl = WordNetLemmatizer()

In [17]:
wordcounts = Counter(' '.join(violations).split())

In [28]:
replace_dict = {}
for k in wordcounts.keys():
    rep = wnl.lemmatize(k)
    if rep != k: 
        replace_dict[k] = wnl.lemmatize(k)

In [32]:
violations = replace_texts(violations, replace_dict)

### Unique words
Let's get a set of unique words and make some final edits.

In [34]:
wordcounts = Counter(' '.join(violations).split())

In [36]:
unique_words = list(wordcounts.keys())

### Stopwords Removal
Let's remove some stopwords.

In [38]:
from nltk.corpus import stopwords

In [43]:
sw = stopwords.words('English')

In [45]:
unique_words = [x for x in unique_words if x not in sw]

In [47]:
len(unique_words)

1673

## Get Word Embeddings
Now I'll get the word embeddings for each words.