In [1]:
import pandas as pd
import spacy
import sklearn.feature_extraction.text as sk_text
import sklearn.decomposition           as sk_decomp
import sklearn.model_selection         as sk_cv
import string
import pyLDAvis
import pyLDAvis.sklearn
import gc
import datetime

In [2]:
pd.set_option('display.max_columns', 100)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
%matplotlib inline

In [5]:
from utilities import *

# Load the data

In [6]:
quora = pd.read_csv('quora_questions.csv')
quora.rename(lambda x : x.lower(), axis = 'columns', inplace = True)
quora.head()

Unnamed: 0,question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing
Simplest option first: Count Vectorizer to create term-document matrix

In [7]:
# Tokenizer: uses spaCy tokenizer, then removes punctuation and stop words, then finally casts

spacy_tokenizer = spacy.load("en_core_web_sm", disable=['tagger', 'parser', 'ner'])

def token_filter(token):
    if token.is_stop:
        return False
    elif token.text in string.punctuation:
        return False
    else:
        return True

    
def tokenizer(string_):
    doc = spacy_tokenizer(string_)
    f   = filter(token_filter, doc)
    m   = [t.lower_ for t in f]
    return list(m)

# Disabling most spaCy features leads to decent tokenizer performance 
#  For comparison you can try un-disabling tagging
# 55.4 µs ± 4.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# %timeit tokenizer(quora.question[0])

In [8]:
# Sample the Data
# Use sk-learn's KFold (n=10) for convenience
splitter = sk_cv.KFold(n_splits = 10, shuffle = True, random_state = 42)

samples = []
for _, split_indices in splitter.split(quora.question):
    samples.append(quora.question.iloc[split_indices])
    
data = samples[0]

In [9]:
count_vectorizor = sk_text.CountVectorizer(
                       strip_accents= 'unicode', 
                       lowercase    = True,
                       analyzer     = tokenizer,
                       max_df       = 0.9,
                       min_df       = 2
)

X, count_vectorizor = persist('count-vectorizer',
                                  count_vectorizor,
                                 'fit_transform',
                                  data,
                                  task = 'both',
                                  force_fit = False
                                )
print(f'{X.shape[1]} word identified with the given frequency cutoffs')

Loading count-vectorizer from disk
4.111 seconds elapsed in original job
12108 word identified with the given frequency cutoffs


In [10]:
X_test = persist('count-vectorizer-test', count_vectorizor, 'transform', samples[1], task = 'data')

Loading count-vectorizer-test from disk
3.612 seconds elapsed in original job


## Simple LDA Model

### Fit

In [11]:
model = sk_decomp.LatentDirichletAllocation(n_jobs = -1, verbose = 1, random_state = 42,
                                              max_iter = 30)

topic_data, model = persist('simple-lda', model, 'fit_transform', X, task = 'both')

Loading simple-lda from disk
101.292 seconds elapsed in original job


### Evaluate

In [12]:
train_score      = persist('simple-lda-train-score', model, 'score', X, task = 'data', force_fit = False)
test_score       = persist('simple-lda-test-score', model, 'score', X_test, task = 'data', force_fit = False)
train_perplexity = persist('simple-lda-train-perplexity', model, 'perplexity', X, task = 'data', force_fit = False)
test_perplexity  = persist('simple-lda-test-perplexity', model, 'perplexity', X_test, task = 'data', force_fit = False)

print(f'Log likelihood - train {train_score : .3}\n'
      f'Log likelihood - test  {test_score : .3}\n'
      f'Perplexity     - train {train_perplexity : .3}\n'
      f'Perplexity     - test  {test_perplexity : .3}')

Loading simple-lda-train-score from disk
5.897 seconds elapsed in original job
Loading simple-lda-test-score from disk
6.297 seconds elapsed in original job
Loading simple-lda-train-perplexity from disk
8.880 seconds elapsed in original job
Loading simple-lda-test-perplexity from disk
9.953 seconds elapsed in original job
Log likelihood - train -1.62e+06
Log likelihood - test  -1.58e+06
Perplexity     - train  4.58e+03
Perplexity     - test   5.22e+03


### Inspect

In [13]:
# Most common words in each topic
words = count_vectorizor.get_feature_names()
num_words = 10

def get_top_words(array):
    return lmap(lambda n : words[n], array.argsort()[-num_words:])

top_words = pd.DataFrame(np.apply_along_axis(get_top_words, 1, model.components_).T)
top_words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,president,country,change,sex,age,learn,know,earth,rs,energy
1,clinton,google,people,friend,service,free,programmi,science,governmen,year
2,people,india,account,answer,web,book,improve,meaning,new,girls
3,war,like,number,people,youtube,buy,things,time,iphone,water
4,good,better,thing,instagram,live,good,ways,computer,india,men
5,donald,china,feel,girl,online,online,english,examples,black,mean
6,think,lose,like,ask,increase,engineeri,language,possible,1000,differenc
7,trump,job,find,question,card,india,start,facebook,500,women
8,world,weight,time,questions,earn,way,learn,stop,indian,like
9,india,work,life,quora,money,best,best,people,notes,love


In [14]:
pyLDAvis.enable_notebook()
panel = persist('simple-lda-viz', pyLDAvis.sklearn, 'prepare', model, X, count_vectorizor, mds='tsne', task = 'data')
panel

Loading simple-lda-viz from disk
394.286 seconds elapsed in original job


## Grid Search

In [15]:
learner = sk_decomp.LatentDirichletAllocation(n_jobs = -1, verbose = 1, random_state = 42)
param_grid = {'n_components': [15, 20, 25],'learning_decay': [.5, .7]}
splitter = sk_cv.KFold(n_splits = 5, shuffle = True, random_state = 42)
model = sk_cv.GridSearchCV(learner, param_grid, verbose = 2, cv = splitter)

model = persist('grid-lda', model, 'fit', X, force_fit = False)

Loading grid-lda from disk
1104.442 seconds elapsed in original job


In [16]:
model.best_params_

{'learning_decay': 0.5, 'n_components': 15}

In [17]:
model = model.best_estimator_

In [18]:
train_score      = persist('grid-lda-train-score', model, 'score', X, task = 'data', force_fit = False)
test_score       = persist('grid-lda-test-score', model, 'score', X_test, task = 'data', force_fit = False)
train_perplexity = persist('grid-lda-train-perplexity', model, 'perplexity', X, task = 'data', force_fit = False)
test_perplexity  = persist('grid-lda-test-perplexity', model, 'perplexity', X_test, task = 'data', force_fit = False)

print(f'Log likelihood - train {train_score : .3}\n'
      f'Log likelihood - test  {test_score : .3}\n'
      f'Perplexity     - train {train_perplexity : .3}\n'
      f'Perplexity     - test  {test_perplexity : .3}')

Loading grid-lda-train-score from disk
6.939 seconds elapsed in original job
Loading grid-lda-test-score from disk
7.709 seconds elapsed in original job
Loading grid-lda-train-perplexity from disk
10.636 seconds elapsed in original job
Loading grid-lda-test-perplexity from disk
9.466 seconds elapsed in original job
Log likelihood - train -1.64e+06
Log likelihood - test  -1.61e+06
Perplexity     - train  4.97e+03
Perplexity     - test   6.05e+03


In [19]:
# Most common words in each topic
words = count_vectorizor.get_feature_names()
num_words = 10

def get_top_words(array):
    return lmap(lambda n : words[n], array.argsort()[-num_words:])

top_words = pd.DataFrame(np.apply_along_axis(get_top_words, 1, model.components_).T)
top_words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,world,canada,eat,times,video,way,facts,states,modi,like,prepare,interview,college,people,find
1,good,engineer,purpose,important,card,tv,code,english,think,state,business,tell,energy,improve,password
2,hillary,like,time,test,today,10,bank,favorite,governmen,looking,programmi,tips,average,google,instagram
3,win,company,food,india,happen,laptop,market,believe,rs,big,language,process,going,ask,people
4,president,software,culture,days,age,windows,write,possible,people,differenc,free,good,day,question,phone
5,clinton,country,like,relations,5,india,interesti,travel,india,men,india,social,engineeri,lose,account
6,india,china,thing,3,youtube,read,history,earth,1000,earn,start,water,new,weight,black
7,war,india,think,system,increase,best,2016,people,500,women,learn,examples,year,world,number
8,donald,job,mean,2,person,school,ways,stop,indian,online,way,work,things,questions,like
9,trump,better,life,sex,years,buy,study,time,notes,money,best,love,know,quora,facebook


In [20]:
pyLDAvis.enable_notebook()
panel = persist('grid-lda-viz', pyLDAvis.sklearn, 'prepare', model, X, count_vectorizor, mds='tsne', task = 'data')
panel

Loading grid-lda-viz from disk
562.804 seconds elapsed in original job


In [21]:
topic_df = persist('grid-lda-predictions', model, 'transform', X, task = 'data')

num_topics = topic_df.shape[1]
topic_df = pd.DataFrame(topic_df, index = data.index).rename(lambda n : f'Topic {n+1}', axis = 'columns')

topic_df = pd.concat([
    data,
    topic_df
], axis = 'columns').set_index('question')

topic_df = topic_df.assign(
    class_         = topic_df.apply(lambda row : row.idxmax(), axis = 'columns'),
    class_strength = topic_df.apply(lambda row : row.max(),    axis = 'columns')
)

for i in range(num_topics):
    topic_name = f'Topic {i+1}'
    topic_qs   = topic_df[topic_df.class_ == topic_name].sort_values('class_strength', ascending = False)
    print(topic_name)
    for q in topic_qs.index.tolist()[:5]:
        print(q)
    print('-------------\n')

Loading grid-lda-predictions from disk
4.175 seconds elapsed in original job
Topic 1
What hotel in Nallamala Hills Hill-station would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Nallamala Hills Hill-station would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Nallamala Hills Hill-station would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What do you think of the supreme court order that cinema halls must play the national anthem with exits closed and all present required to stand up?
I do not have any savings. I only have an apartment worth 30 lakhs and I depend on a loan. Can I pursue a master's in the U.S with any bank accounts and white money?
-------------

Topic 2
I am working with a software company and they have asked me to work on a tool provided by PEGA. What information can you give to me on PEGA an

## word2vec

In [22]:
def is_bad_question(q):
    for char in q:
        if char in string.ascii_letters + string.digits:
            return False
    return True

In [23]:
data = data.loc[~data.apply(is_bad_question)]

In [24]:
quora = quora.loc[~quora.question.apply(is_bad_question)]

In [25]:
# Tokenizer: uses spaCy tokenizer, then removes punctuation and stop words, then finally casts

spacy_tokenizer_large = spacy.load("en_core_web_lg", disable=['tagger', 'parser', 'ner'])

In [26]:
def token_filter(token):
    if token.is_stop:
        return False
    elif token.text in string.punctuation:
        return False
    else:
        return True

    
def vector_tokenizer(string_):
    doc = spacy_tokenizer_large(string_)
    f   = filter(token_filter, doc)
    m   = [t.vector for t in f]
    return m

def doc2PCVec(string_):
    vectors = vector_tokenizer(string_)
    if not vectors:
        return pd.Series(np.zeros(300))
    return pd.Series(sk_decomp.PCA(n_components=1).fit(vector_tokenizer(string_)).components_[0])

In [27]:
# 841 µs ± 46 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# %timeit doc2PCVec(quora.question[0])

In [28]:
# 18.5 ms ± 825 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# %timeit pcaed_data = quora.question.iloc[:10].transform(doc2PCVec)

In [29]:
pcaed_data = persist('doc2pcVec', data, 'transform', doc2PCVec, task = 'data')

Loading doc2pcVec from disk
34.830 seconds elapsed in original job


In [30]:
class CarefulTransform:
    '''
        Runs a loop but periodically makes explicit calls to the garbage collector.
        Empirically causes infinite speedup in the following call.
    '''
    def __init__(self):
        pass
    
    def run(self, fn, data, job_len = 10000, verbose = True):
        self.dfs = []
        running_index = 0
        n_jobs = len(data) // job_len + 1
        print(f'{n_jobs} jobs to complete')
        start_time = datetime.datetime.now()
        for i in range(n_jobs):
            slice_ = data.iloc[running_index : running_index + job_len]
            self.dfs.append(slice_.transform(fn))
            running_index += job_len
            gc.collect()
            if (i+1) % (n_jobs // 100 + 1) == 0 and verbose:
                elapsed = (datetime.datetime.now()-start_time).seconds
                print(f'{i+1} jobs completed. {elapsed} seconds elapsed')
        return pd.concat(self.dfs, axis = 'rows')

In [31]:
pcaed_data = persist('doc2pcVec-all', CarefulTransform(), 'run', doc2PCVec, quora.question, task = 'data')

Loading doc2pcVec-all from disk
320.875 seconds elapsed in original job


In [32]:
import sklearn.cluster as sk_clust

In [33]:
# no parallel processing to conserve memory
model = sk_clust.KMeans(random_state = 42, n_jobs=1, verbose = 1, tol = 1)
model = persist('kmeans-doc2PCVec-all', model, 'fit', pcaed_data, task = 'model')

Loading kmeans-doc2PCVec-all from disk
502.873 seconds elapsed in original job


In [34]:
labels = pd.Series(model.predict(pcaed_data), index = quora.question.index)
quora = quora.assign(labels = labels)

In [35]:
quora.labels.value_counts()

2    82080
7    67331
1    63573
0    60226
4    54710
5    38755
6    30358
3     7241
Name: labels, dtype: int64