## Imports

In [2]:
import warnings

# def fxn():
#     warnings.warn("deprecated", DeprecationWarning)

# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     fxn()
    
warnings.filterwarnings('ignore')

#warning website: https://queirozf.com/entries/suppressing-ignoring-warnings-in-python-reference-and-examples

In [3]:
import pandas as pd
import numpy as np
import re
import string

from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

import pyLDAvis
import pyLDAvis.gensim as gensimvis
import pyLDAvis.sklearn

In [4]:
# Way around missing packages that for some reason I can't download the normal way

import os
import nltk

# Create NLTK data directory
NLTK_DATA_DIR = './nltk_data'
if not os.path.exists(NLTK_DATA_DIR):
        os.makedirs(NLTK_DATA_DIR)

nltk.data.path.append(NLTK_DATA_DIR)

# Download packages and store in directory above
nltk.download('punkt', download_dir=NLTK_DATA_DIR)
nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
nltk.download('wordnet', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ./nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Import

In [5]:
df = pd.read_csv("data/doj.csv")
df = df.drop(columns=['id','topics','components'], axis=1)

## Data Cleaning

In [6]:
df['contents'] = df['contents'].astype(str)
df['contents_processed'] = \
df['contents'].map(lambda x: re.sub('[,\.!?]','',x))
df['contents_processed'] = \
df['contents_processed'].map(lambda x: x.lower())
df['contents_processed']

df['title'] = df['title'].astype(str)
df['title_processed'] = \
df['title'].map(lambda x: re.sub('[,\.!?]','',x))
df['title_processed'] = \
df['title_processed'].map(lambda x: x.lower())
df['title_processed']

df = df.drop(columns=['title', 'contents'])
df

  df['contents'].map(lambda x: re.sub('[,\.!?]','',x))
  df['title'].map(lambda x: re.sub('[,\.!?]','',x))


Unnamed: 0,date,contents_processed,title_processed
0,2018-07-27,the us district court for the northern distric...,district court enters permanent injunction aga...
1,2018-07-27,the department of justice announced today that...,justice department announces resolution with ...
2,2018-07-27,yesterday a federal judge found ahmed el-sheri...,kansas city area laboratory owner convicted of...
3,2018-07-26,the department of justice announced today that...,3m company agrees to pay $91 million to resolv...
4,2018-07-26,a federal grand jury in new orleans louisiana ...,amite residents charged with civil rights crim...
...,...,...,...
17819,2018-07-30,washington – a jordanian national residing in ...,jordanian national arrested in new york to fac...
17820,2018-07-30,a charlottesville virginia man was sentenced t...,former virginia high school science teacher se...
17821,2018-07-30,a federal jury convicted a sterling virginia w...,former owner of sleep study businesses convict...
17822,2018-07-30,remarks as prepared for delivery\r\nthank you ...,attorney general sessions delivers remarks at ...


## LDA Modelling

In [12]:
#setting max rows to see more throughout process

pd.set_option('display.max_rows', 50)

#looking at vectorizor in order to see what words might need to be added to the stop word list
text = " ".join(review for review in df.title_processed)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words = 'english')
matrix = vectorizer.fit_transform([text])

counts = pd.DataFrame(matrix.toarray(),
                      columns=vectorizer.get_feature_names())
sorted_count = counts.T.sort_values(by=0, ascending=False).head(20)
sorted_count['word'] = sorted_count.index
sorted_list = list(sorted_count['word'])
sorted_list

['justice',
 'department',
 'guilty',
 'sentenced',
 'fraud',
 'pleads',
 'tax',
 'million',
 'prison',
 'scheme',
 'man',
 'attorney',
 'general',
 'federal',
 'charged',
 'false',
 'conspiracy',
 'pay',
 'indicted',
 'new']

In [13]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

  from scipy.linalg.special_matrices import triu
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachaellam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stop_words = stopwords.words('english')
stop_words.extend([sorted_list,'us', 'years', 'one', 'also', 'district'])

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
            if word not in stop_words] for doc in texts]

title_data = df.title_processed.values.tolist()
title_words = list(sent_to_words(title_data))
# remove stop words
title_words = remove_stopwords(title_words)
print(title_words[:1][0][:30])

['court', 'enters', 'permanent', 'injunction', 'chicago', 'companies', 'stop', 'distribution', 'adulterated', 'misbranded', 'dietary', 'supplements', 'unapproved', 'misbranded', 'drugs']


In [15]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(title_words)
# Create Corpus
texts = title_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1)]


In [16]:
from pprint import pprint
# number of topics
num_topics = 5
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.021*"sentenced" + 0.021*"general" + 0.020*"attorney" + 0.015*"prison" + '
  '0.012*"department" + 0.011*"guilty" + 0.010*"justice" + 0.009*"pleads" + '
  '0.009*"united" + 0.008*"states"'),
 (1,
  '0.025*"guilty" + 0.021*"justice" + 0.021*"department" + 0.016*"fraud" + '
  '0.016*"pleads" + 0.014*"man" + 0.013*"sentenced" + 0.012*"scheme" + '
  '0.011*"federal" + 0.011*"million"'),
 (2,
  '0.030*"justice" + 0.027*"department" + 0.019*"tax" + 0.016*"sentenced" + '
  '0.015*"guilty" + 0.013*"fraud" + 0.013*"million" + 0.012*"scheme" + '
  '0.011*"prison" + 0.010*"pleads"'),
 (3,
  '0.021*"tax" + 0.020*"former" + 0.017*"sentenced" + 0.014*"false" + '
  '0.014*"guilty" + 0.014*"pleads" + 0.013*"prison" + 0.013*"million" + '
  '0.011*"attorney" + 0.010*"general"'),
 (4,
  '0.022*"justice" + 0.022*"department" + 0.018*"fraud" + 0.014*"guilty" + '
  '0.013*"sentenced" + 0.013*"million" + 0.012*"scheme" + 0.012*"pleads" + '
  '0.011*"man" + 0.009*"charges"')]


In [26]:
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.display(LDAvis_prepared)

In [31]:
#LDA coherence

cm = gensim.models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
print(cm.get_coherence())

0.28258746833472426


In [37]:
#LDA perplexity

print(lda_model.log_perplexity(corpus))

-7.357623397874899


## LDA Modelling with Tokenized Nouns

In [41]:
# tokenizing nouns for LDA
def nouns(text):
    is_noun = lambda pos : pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    all_nouns = [wordnet_lemmatizer.lemmatize(word) \
                for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

data_nouns = pd.DataFrame(df.title_processed.apply(nouns))

data_nouns.head()

Unnamed: 0,title_processed
0,district court enters injunction chicago compa...
1,justice department announces resolution asset ...
2,kansa city area laboratory owner waste
3,company allegation state combat arm
4,resident right crime family member disability


In [43]:
doj_nouns_data = data_nouns.title_processed.values.tolist()
doj_nouns_words = list(sent_to_words(doj_nouns_data))
# remove stop words
doj_noun_words = remove_stopwords(doj_nouns_words)
print(doj_nouns_words[:1][0][:30])

['district', 'court', 'enters', 'injunction', 'chicago', 'company', 'distribution', 'supplement', 'drug']


In [97]:
doj_nouns_data

['district court enters injunction chicago company distribution supplement drug',
 'justice department announces resolution asset management firm sa',
 'kansa city area laboratory owner waste',
 'company allegation state combat arm',
 'resident right crime family member disability',
 'city jacksonville agrees employment discrimination lawsuit',
 'police officer court conspiracy right deprivation right',
 'justice department citizenship felon drug organization florida',
 'circuit midco tax shelter',
 'construction co vice president claim aqaba school project',
 'tire retailer prison excise tax conspiracy',
 'deputy assistant attorney matthew s miner remark conference institute forum anti-corruption compliance risk market',
 'massachusetts man woman addiction',
 'circuit validity regulation',
 'owner equipment company medicaid',
 'deutsche bank trader trading practice commodity market',
 'member venezuelan money scheme',
 'virginia man role odometer title fraud scheme',
 'department just

In [44]:
# Create Dictionary with doj_nouns
id2word_nouns = corpora.Dictionary(doj_nouns_words)
# Create Corpus
texts_nouns = doj_nouns_words
# Term Document Frequency
corpus_nouns = [id2word_nouns.doc2bow(text) for text in texts_nouns]
# View
print(corpus_nouns[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


In [79]:
texts_nouns

[['district',
  'court',
  'enters',
  'injunction',
  'chicago',
  'company',
  'distribution',
  'supplement',
  'drug'],
 ['justice',
  'department',
  'announces',
  'resolution',
  'asset',
  'management',
  'firm',
  'sa'],
 ['kansa', 'city', 'area', 'laboratory', 'owner', 'waste'],
 ['company', 'allegation', 'state', 'combat', 'arm'],
 ['resident', 'right', 'crime', 'family', 'member', 'disability'],
 ['city', 'jacksonville', 'agrees', 'employment', 'discrimination', 'lawsuit'],
 ['police', 'officer', 'court', 'conspiracy', 'right', 'deprivation', 'right'],
 ['justice',
  'department',
  'citizenship',
  'felon',
  'drug',
  'organization',
  'florida'],
 ['circuit', 'midco', 'tax', 'shelter'],
 ['construction',
  'co',
  'vice',
  'president',
  'claim',
  'aqaba',
  'school',
  'project'],
 ['tire', 'retailer', 'prison', 'excise', 'tax', 'conspiracy'],
 ['deputy',
  'assistant',
  'attorney',
  'matthew',
  'miner',
  'remark',
  'conference',
  'institute',
  'forum',
  'anti

In [45]:
# Build LDA model with nouns
num_topics = 10
nouns_lda_model = gensim.models.LdaMulticore(corpus=corpus_nouns,
                                       id2word=id2word_nouns,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(nouns_lda_model.print_topics())
nouns_doc_lda = nouns_lda_model[corpus]

[(0,
  '0.023*"scheme" + 0.021*"company" + 0.021*"justice" + 0.021*"department" + '
  '0.020*"man" + 0.015*"charge" + 0.013*"prison" + 0.010*"state" + '
  '0.010*"owner" + 0.009*"bribery"'),
 (1,
  '0.058*"department" + 0.057*"justice" + 0.044*"tax" + 0.024*"return" + '
  '0.021*"prison" + 0.020*"fraud" + 0.016*"man" + 0.015*"scheme" + '
  '0.011*"attorney" + 0.011*"company"'),
 (2,
  '0.033*"prison" + 0.028*"attorney" + 0.025*"man" + 0.020*"scheme" + '
  '0.020*"fraud" + 0.015*"tax" + 0.014*"claim" + 0.014*"year" + 0.012*"remark" '
  '+ 0.011*"delivers"'),
 (3,
  '0.044*"justice" + 0.042*"department" + 0.017*"fraud" + 0.016*"attorney" + '
  '0.012*"settlement" + 0.012*"conspiracy" + 0.011*"statement" + '
  '0.010*"scheme" + 0.010*"violation" + 0.010*"company"'),
 (4,
  '0.023*"attorney" + 0.022*"justice" + 0.020*"department" + 0.016*"prison" + '
  '0.012*"pleads" + 0.011*"drug" + 0.010*"state" + 0.009*"claim" + '
  '0.009*"fraud" + 0.009*"remark"'),
 (5,
  '0.035*"man" + 0.022*"scheme

In [47]:
#LDA coherence

cm = gensim.models.coherencemodel.CoherenceModel(model=nouns_lda_model, texts=texts_nouns, dictionary=id2word_nouns, coherence='c_v')
print(cm.get_coherence())

0.26288370631316516


In [49]:
#LDA perplexity

print(lda_model.log_perplexity(corpus_nouns))

-9.229741165400675


In [51]:
LDAvis_prepared_nouns = pyLDAvis.gensim.prepare(nouns_lda_model, corpus_nouns, id2word_nouns)
pyLDAvis.display(LDAvis_prepared_nouns)

## NMF Modelling 

In [65]:
#removing extra stop words
#stop_noun = ['america', 'today', 'thing', 'wwwjusticegov']
#stop_words_noun_agg = text.ENGLISH_STOP_WORDS.union(stop_noun)

tv_noun = TfidfVectorizer(stop_words="english", ngram_range = (1,1), max_df=0.9, min_df=0.01)

data_tv_noun = tv_noun.fit_transform(data_nouns.title_processed)

data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns = tv_noun.get_feature_names())

data_dtm_noun.index = df.index

data_dtm_noun



Unnamed: 0,act,agency,agreement,air,alabama,allegation,announces,assault,assistant,attorney,...,tax,texas,theft,trafficking,victim,violation,virginia,woman,year,york
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.795845,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.640413,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17819,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.587422
17820,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.440941,0.0,0.364328,0.000000
17821,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
17822,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.338477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000


In [66]:
#iterate through topics
def display_topics(model, feature_names, num_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic", ix)
        else:
            print("\nTopic: '", topic_names[ix], "'")
        print(",".join([feature_names[i] \
            for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [68]:
nmf_model = NMF(10)

doc_topic = nmf_model.fit_transform(data_dtm_noun)

display_topics(nmf_model, tv_noun.get_feature_names(), 10)




Topic 0
department,justice,discrimination,lawsuit,settlement,agreement,announces,disability,right,county

Topic 1
fraud,scheme,role,medicare,health,care,owner,identity,connection,home

Topic 2
tax,return,preparer,court,evasion,business,refund,fraud,owner,identity

Topic 3
man,crime,support,child,pornography,charge,california,virginia,texas,york

Topic 4
attorney,statement,remark,holder,delivers,deputy,assistant,session,announces,division

Topic 5
claim,allegation,discrimination,health,service,kickback,care,government,contractor,drug

Topic 6
prison,month,year,child,member,pornography,role,officer,woman,virginia

Topic 7
conspiracy,pleads,charge,member,officer,racketeering,executive,bribery,right,drug

Topic 8
state,settlement,violation,act,lawsuit,charge,air,employee,agreement,drug

Topic 9
company,owner,crime,executive,act,business,violation,president,penalty,service




In [94]:
kmin, kmax = 2, 10

topic_models = []
# try each value of k
for k in range(kmin,kmax+1):
    print("Applying NMF for k=%d ..." % k )
    # run NMF
    model = NMF(n_components=k) 
    W = model.fit_transform( data_dtm_noun )
    H = model.components_    
    # store for later
    topic_models.append( (k,W,H) )

class TokenGenerator:
    def __init__( self, documents, stopwords ):
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

    def __iter__( self ):
        print("Building Word2Vec model ...")
        for doc in self.documents:
            tokens = []
            for tok in self.tokenizer.findall( doc ):
                if tok.lower() in self.stopwords:
                    tokens.append( "<stopword>" )
                elif len(tok) >= 2:
                    tokens.append( tok.lower() )
            yield tokens

docgen = TokenGenerator(docs_raw, stop_words)
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)

def calculate_coherence( w2v_model, term_rankings ):
    overall_coherence = 0.0
    for topic_index in range(len(term_rankings)):
        # check each pair of terms
        pair_scores = []
        for pair in combinations( term_rankings[topic_index], 2 ):
            #print(str(pair[0]) + " " + str(pair[1]))
            pair_scores.append( w2v_model.similarity(pair[0], pair[1]))
        # get the mean for all pairs in this topic
        topic_score = sum(pair_scores) / len(pair_scores)
        overall_coherence += topic_score
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)

def get_descriptor( all_terms, H, topic_index, top ):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    # now get the terms corresponding to the top-ranked indices
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append( all_terms[term_index] )
    return top_terms

k_values = []
coherences = []
for (k,W,H) in topic_models:
    # Get all of the topic descriptors - the term_rankings, based on top 10 terms
    term_rankings = []
    for topic_index in range(k):
        term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
    # Now calculate the coherence based on our Word2vec model
    k_values.append( k )
    coherences.append( calculate_coherence( w2v_model, term_rankings ) )
    print("K=%02d: Coherence=%.4f" % ( k, coherences[-1] ) )

%matplotlib inline
plt.style.use("ggplot")
matplotlib.rcParams.update({"font.size": 14})

fig = plt.figure(figsize=(13,7))
# create the line plot
ax = plt.plot( k_values, coherences )
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Mean Coherence")
# add the points
plt.scatter( k_values, coherences, s=120)
# find and annotate the maximum point on the plot
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(best_k, ymax), textcoords="offset points", fontsize=16)
# show the plot
plt.show()


Applying NMF for k=2 ...




Applying NMF for k=3 ...




Applying NMF for k=4 ...




Applying NMF for k=5 ...




Applying NMF for k=6 ...




Applying NMF for k=7 ...
Applying NMF for k=8 ...




Applying NMF for k=9 ...




Applying NMF for k=10 ...




Applying NMF for k=11 ...




Applying NMF for k=12 ...




Applying NMF for k=13 ...




Applying NMF for k=14 ...




Applying NMF for k=15 ...




Applying NMF for k=16 ...




Applying NMF for k=17 ...




Applying NMF for k=18 ...




Applying NMF for k=19 ...




Applying NMF for k=20 ...




Applying NMF for k=21 ...




Applying NMF for k=22 ...




Applying NMF for k=23 ...




Applying NMF for k=24 ...




Applying NMF for k=25 ...




Applying NMF for k=26 ...




Applying NMF for k=27 ...




Applying NMF for k=28 ...




Applying NMF for k=29 ...




Applying NMF for k=30 ...




NameError: name 'docs_raw' is not defined

## BTM

In [23]:
import bitermplus as btm
import tmplot as tmp

ModuleNotFoundError: No module named 'tmplot'

In [18]:
texts = df['title_processed'].str.strip().tolist()

In [20]:
# PREPROCESSING
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

In [21]:
# INITIALIZING AND RUNNING MODEL
model = btm.BTM(
    X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

100%|███████████████████████████████████████████| 20/20 [00:02<00:00,  9.50it/s]
100%|█████████████████████████████████| 17824/17824 [00:00<00:00, 257290.22it/s]


In [22]:
# METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)

array([-535.86793267, -435.11704641, -524.95231637, -463.4982609 ,
       -508.71228636, -436.74421574, -440.03327189, -466.88283029])