In [2]:
import pandas as pd
import spacy
import sklearn.feature_extraction.text as sk_text
import sklearn.decomposition           as sk_decomp
import sklearn.model_selection         as sk_cv
import string

# Load the data

In [3]:
from utilities import *

In [4]:
quora = pd.read_csv('quora_questions.csv')
quora.rename(lambda x : x.lower(), axis = 'columns', inplace = True)
quora.head()

Unnamed: 0,question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing
Simplest option first: Count Vectorizer to create term-document matrix

In [5]:
# Tokenizer: uses spaCy tokenizer, then removes punctuation and stop words, then finally casts
bad_predicates = [
    lambda token : token.is_stop,
    lambda token : token.text in string.punctuation
]

filter_tokenizer = lmap_filter(
    lambda t : t.lower_,
    no_predicates(*bad_predicates),
)

spacy_tokenizer = spacy.load("en_core_web_sm", disable=['tagger', 'parser', 'ner'])

tokenizer       = compose(filter_tokenizer, spacy_tokenizer)

# Disabling most spaCy features leads to decent tokenizer performance 
#  For comparison you can try un-disabling tagging
%timeit tokenizer(quora.question[0])

67.8 µs ± 389 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [6]:
count_vectorizor = sk_text.CountVectorizer(
                       strip_accents= 'unicode', 
                       lowercase    = True,
                       analyzer     = tokenizer,
                       max_df       = 0.9,
                       min_df       = 10
)

Timer.start()
X = count_vectorizor.fit_transform(quora.question)
Timer.end()
print(f'{X.shape[1]} word identified with the given frequency cutoffs')

41.724 seconds elapsed
14744 word identified with the given frequency cutoffs


# LDA Modelling

#### TASK: Using Scikit-Learn create an instance of LDA. 

- You can manually run and tune your model, then evaluate the resulting clusters. 
- Or you can use gridsearch to try and identify the best number of topics to use. 


In [24]:
learner = sk_decomp.LatentDirichletAllocation(n_jobs = -1, verbose = 1, random_state = 42)
model, model_data = fit_or_load_model(learner, X, 'simple-lda', force_fit = False)

Loading model from disk
642.725 seconds elapsed in original fitting


In [None]:
learner = sk_decomp.LatentDirichletAllocation(n_jobs = -1, verbose = 1, random_state = 42)
param_grid = {'n_components': [15, 20, 25],'learning_decay': [.5, .7]}
splitter = sk_cv.KFold(n_splits = 5, shuffle = True, random_state = 42)
model = sk_cv.GridSearchCV(learner, param_grid, verbose = 2, cv = splitter)
model, model_data = fit_or_load_model(model, X, 'simple-lda-grid', force_fit = False)

#### Task: Evaluate the different models you have run and determine which model you think determines the best clusters.  


The evaluation part could invlove:
- Printing out the top 15 most common words for each of the topics and seeing if they make sense.
- Using the perplexity and log-likelihoood scores.
- Using the pyLDAvis tool to investigate the different clusters. 

#### TASK: Add a new column to the original quora dataframe that labels each question into one of the topic categories.

# Great job!