In [1]:
import pandas as pd
import os
import re
import pickle
import spacy
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.utils.extmath import randomized_svd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('../data/text_df.csv')

In [3]:
df.head()

Unnamed: 0,folder,extension,filename,absolute_path,size_mb,created_on,last_modified_on,unix_permission,raw_text
0,../data/t5/,.text,000081.text,../data/t5/000081.text,0.01315,2011-02-08 15:37:52,2011-02-08 15:37:52,-rw-r--r--,
1,../data/t5/,.text,000083.text,../data/t5/000083.text,0.007671,2011-02-08 15:37:52,2011-02-08 15:37:52,-rw-r--r--,PUBLIC NOTICE\n FEDERAL COMMUNICATIONS COMMISS...
2,../data/t5/,.text,000086.text,../data/t5/000086.text,0.02627,2011-02-08 15:37:52,2011-02-08 15:37:52,-rw-r--r--,"Time,F-Scale,Location,County,State,Lat,Lon,Com..."
3,../data/t5/,.text,000087.text,../data/t5/000087.text,0.023494,2011-02-08 15:37:52,2011-02-08 15:37:52,-rw-r--r--,**********************************************...
4,../data/t5/,.text,000088.text,../data/t5/000088.text,0.004139,2011-02-08 15:37:52,2011-02-08 15:37:52,-rw-r--r--,76 APPENDIX.\n able...


In [4]:
# filenames without text
files_w_text = df[df['raw_text'].isna()]
pickle.dump(list(files_w_text['filename'].values), open('../data/file_w_text.pkl', 'wb'))
df.drop(index=files_w_text.index, inplace=True)

In [84]:
nlp = spacy.load('en_core_web_lg')

def lemmatizing(text, nlp):
    doc = nlp(text.replace('\n', ' '))
    sent = []

    for token in doc:
        if token.pos_ not in ('PROPN', 'PUNCT', 'NUM', 'SYM'):
            if token.lemma_ != '-PRON-':
                sent.append(token.lemma_)
    
    sent = " ".join(sent).split()
    
    return " ".join(sent)

In [54]:
def stemming(text):
    stemmer = nltk.stem.SnowballStemmer('english', ignore_stopwords=True)
    stemmer.stopwords = stopwords.words('english')
    
    stemmed_words = []
    
    for w in text.split():
        stemmed_words.append(stemmer.stem(w))
        
    text = ' '.join(stemmed_words)
        
    return text

In [85]:
df['lemmatized_text'] = df['raw_text'].apply(lambda row: lemmatizing(row, nlp))

In [87]:
df['stemmed_text'] = df['lemmatized_text'].apply(lambda row: stemming(row))

In [92]:
df.to_csv('../data/lem_stem_text.csv', index=False)

In [88]:
params = {
    'vectorizer': {
        'analyzer': 'word',
        'stop_words': stopwords.words('english'),
        'ngram_range': (1, 1),
        'token_pattern': '[a-z]{3,}',
        'min_df': 0.01,
        'lowercase': True
    },
    'raw_documents': df['stemmed_text'],
    'components': 4,
}

In [89]:
tfidf = TfidfVectorizer(**params['vectorizer'])
X_tfidf = tfidf.fit_transform(raw_documents=params['raw_documents'])

tfidf_doc_word = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names())
tfidf_doc_word

Unnamed: 0,abandon,abat,abbrevi,abil,abl,abnorm,aboard,abroad,abrupt,absenc,...,yes,yesterday,yet,yield,young,youth,zero,zinc,zip,zone
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.033480
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.009871,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.012723,0.000000,0.0,0.0,0.0,0.000000,0.0,0.028449
3,0.0,0.0,0.0,0.0,0.032074,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.041315,0.0,0.0,0.0,0.000000,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3244,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.017202,...,0.0,0.0,0.000000,0.008728,0.0,0.0,0.0,0.000000,0.0,0.016507
3245,0.0,0.0,0.0,0.0,0.003452,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
3246,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.007353,0.0,0.0,0.0,0.023367,0.0,0.000000
3247,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.003863,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000


In [93]:
from sklearn.decomposition import NMF

# Use NMF to look for 15 topics
n_topics = 15
model = NMF(n_components=n_topics)
model.fit(X_tfidf)

# Print the top 10 words
n_words = 10
feature_names = tfidf.get_feature_names()

topic_list = []
for topic_idx, topic in enumerate(model.components_):
    top_n = [feature_names[i]
             for i in topic.argsort()
             [-n_words:]][::-1]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}") 

    print(f"Topic {topic_idx}: {top_features}")

Topic 0: program provid servic child develop plan fund train includ health
Topic 1: magnet beam use model corrector measur system temperatur test energi
Topic 2: font color decor text famili size none weight background bold
Topic 3: por con del que est una persona son sin vez
Topic 4: generic drug brand product prescript prefer patent medic name use
Topic 5: court appeal claim defend case petition district motion trial attorney
Topic 6: water speci plant area habitat soil fish beach site veget
Topic 7: file datum inform name use date line number com metadata
Topic 8: say get year would work peopl time make like come
Topic 9: site displac offset atmospher email load solid alon coordin accord
Topic 10: link protein gene genom nucleotid var name cell search cite
Topic 11: imag src image gif new button earth mission stereo view
Topic 12: shall section requir dwell amend applic cost build person properti
Topic 13: locat scan nomin floor zone unit rate hear seri data
Topic 14: document funct

In [104]:
%%time

from sklearn.decomposition import LatentDirichletAllocation

# Beware it will try *all* of the combinations, so it'll take ages
search_params = {
  'n_components': [5, 10, 15, 20, 25, 30, 40, 50],
  'learning_decay': [.2, .5, .7]
}

# Set up LDA with the options we'll keep static
model = LatentDirichletAllocation(learning_method='online')

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(X_tfidf)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)
lda_params = gridsearch.best_params_.copy()

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 13.0min finished


Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -50907.760986031586
CPU times: user 9.69 s, sys: 195 ms, total: 9.88 s
Wall time: 13min 12s


In [105]:
lda_params = gridsearch.best_params_.copy()

In [108]:
%%time

# Use LDA to look for 5 topics
learning_decay, n_topics = lda_params.values()
model = LatentDirichletAllocation(learning_method='online', n_components=n_topics, learning_decay=learning_decay)
model.fit(X_tfidf)

# Print the top 10 words per topic
n_words = 10
feature_names = tfidf.get_feature_names()

topic_list = []
for topic_idx, topic in enumerate(model.components_):
    top_n = [feature_names[i]
             for i in topic.argsort()
             [-n_words:]][::-1]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}") 

    print(f"Topic {topic_idx}: {top_features}")

Topic 0: use water speci datum temperatur surfac measur sampl high magnet
Topic 1: con por del que est una persona son diabet sin
Topic 2: font imag function link var document els color com url
Topic 3: altitud unnecessari hatch pore receptor ment airborn ozon extric hinder
Topic 4: use provid year cost inform requir drug program includ may
CPU times: user 16.4 s, sys: 118 ms, total: 16.5 s
Wall time: 18.8 s


In [109]:
# Convert our counts into numbers
amounts = model.transform(X_tfidf) * 100

# Set it up as a dataframe
topics = pd.DataFrame(amounts, columns=topic_list)
topics.head(2)

Unnamed: 0,topic_use_water_speci,topic_con_por_del,topic_font_imag_function,topic_altitud_unnecessari_hatch,topic_use_provid_year
0,2.143923,2.016244,2.021396,2.016737,91.8017
1,63.816697,2.887513,2.742815,2.738853,27.814121


In [147]:
label = df['stemmed_text'].index
doc_topic = pd.DataFrame(model.fit_transform(tfidf.fit_transform(df['stemmed_text'])).round(5),
                         index = label,
                         columns=['topic{}'.format(i + 1) for i in range(n_topics)])

df['top_topic'] = doc_topic.idxmax(axis=1)

In [148]:
doc_topic

Unnamed: 0,topic1,topic2,topic3,topic4,topic5
1,0.02015,0.02016,0.91940,0.02015,0.02015
2,0.02739,0.02742,0.88952,0.02829,0.02739
3,0.01943,0.01953,0.92218,0.01943,0.01943
4,0.01781,0.01781,0.92729,0.01928,0.01781
5,0.04549,0.04547,0.81808,0.04548,0.04549
...,...,...,...,...,...
3655,0.02436,0.02438,0.90254,0.02436,0.02436
3656,0.01895,0.01898,0.92417,0.01896,0.01895
3657,0.02370,0.02370,0.90519,0.02370,0.02370
3658,0.01567,0.01568,0.93729,0.01570,0.01567


In [146]:
[e for e in doc_topic.iloc[0].values if (e >= doc_topic.iloc[0].values.max() / 4)]

[0.91835]

In [138]:
doc_topic.iloc[0].where(doc_topic.iloc[0] > 10)

AttributeError: module 'pandas' has no attribute 'where'

In [114]:
df[['top_topic', 'absolute_path']].groupby(by='top_topic').count()

Unnamed: 0_level_0,absolute_path
top_topic,Unnamed: 1_level_1
topic1,106
topic2,1025
topic4,62
topic5,2056


In [115]:
df[['top_topic', 'absolute_path']]

Unnamed: 0,top_topic,absolute_path
1,topic5,../data/t5/000083.text
2,topic2,../data/t5/000086.text
3,topic5,../data/t5/000087.text
4,topic5,../data/t5/000088.text
5,topic5,../data/t5/000089.text
...,...,...
3655,topic2,../data/t5/004995.pdf
3656,topic2,../data/t5/004996.pdf
3657,topic2,../data/t5/004997.pdf
3658,topic2,../data/t5/004998.pdf


In [144]:
import scipy
from sklearn.metrics import pairwise

In [148]:
scipy.spatial.distance.

In [147]:
pairwise.PAIRWISE_DISTANCE_FUNCTIONS.keys()

dict_keys(['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', 'manhattan', 'precomputed', 'nan_euclidean'])

In [157]:
df.columns

Index(['folder', 'extension', 'filename', 'absolute_path', 'size_mb',
       'created_on', 'last_modified_on', 'unix_permission', 'raw_text',
       'lemmatized_text', 'stemmed_text', 'top_topic'],
      dtype='object')