In [1]:
import pandas as pd

In [2]:
quora = pd.read_csv('quora_questions.csv')

In [3]:
quora.count()

Question    404289
dtype: int64

In [5]:
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [6]:
quora ['str_count'] = quora['Question'].str.len()

In [7]:
quora.head()

Unnamed: 0,Question,str_count
0,What is the step by step guide to invest in sh...,66
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,51
2,How can I increase the speed of my internet co...,73
3,Why am I mentally very lonely? How can I solve...,50
4,"Which one dissolve in water quikly sugar, salt...",76


# Preprocessing

Use TF-IDF Vectorization to create a vectorized document term matrix (DTM). 
Ymax_df to exclude words shows up in > 90% doc and min_df is set for words less than 2 times show up in whole documents

In [133]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

In [140]:
quora_stopwords = text.ENGLISH_STOP_WORDS.union(['good', 'best', 'did', 'does', 'use', 'using'])

In [160]:
cv = CountVectorizer(max_df = 0.9, min_df = 2, stop_words = quora_stopwords )

In [161]:
dtm = cv.fit_transform(quora['Question'])

In [162]:
dtm

<404289x38663 sparse matrix of type '<class 'numpy.int64'>'
	with 1911337 stored elements in Compressed Sparse Row format>

# LDA

In [144]:
from sklearn.decomposition import LatentDirichletAllocation

In [145]:
LDA = LatentDirichletAllocation(n_components = 7, random_state = 42) # 7 general topics will be returned

In [146]:
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [159]:
LDA.batch_size

128

# Grab the vocabulary of words

In [163]:
len(cv.get_feature_names()) #amount of stored words

38663

In [164]:
import random

In [165]:
for i in range(10):
    random_word_id = random.randint(0,38669)
    print(cv.get_feature_names()[random_word_id])

loco
aberdeen
fractal
giza
eie
delhi
institute
azeri
desired
slice


In [166]:
for i in range(10):
    random_word_id = random.randint(0,38972)
    print(cv.get_feature_names()[random_word_id])

ipsum
male
laminate
resource
wrapped
knocks
nights
ef
walking
truant


In [167]:
len(LDA.components_)

7

In [168]:
LDA.components_  

array([[1.43063287e-01, 4.02367150e-01, 1.43760404e-01, ...,
        1.42857260e-01, 1.42857426e-01, 1.42857260e-01],
       [1.42934128e-01, 1.42915011e-01, 1.42857429e-01, ...,
        2.14285637e+00, 1.42857462e-01, 2.14285637e+00],
       [2.06157239e+01, 1.42980951e-01, 1.42857392e-01, ...,
        1.42857262e-01, 2.14285523e+00, 1.42857262e-01],
       ...,
       [1.43422425e-01, 1.42980751e-01, 1.42857440e-01, ...,
        1.42857288e-01, 1.42857495e-01, 1.42857288e-01],
       [2.39389070e+01, 8.11316842e+02, 1.42857451e-01, ...,
        1.42857295e-01, 1.42857510e-01, 1.42857295e-01],
       [1.43025367e-01, 1.43315775e-01, 1.42857390e-01, ...,
        1.42857264e-01, 1.42857437e-01, 1.42857264e-01]])

In [169]:
len(LDA.components_[0])

38663

In [170]:
single_topic = LDA.components_[0]

In [171]:
single_topic.argsort()  #return the indices of sort desc of the array 

array([ 6364, 34206,  1333, ..., 23683, 37509, 20019])

In [172]:
single_topic[34210] #word that least representative of this topic

0.1428572935388423

In [173]:
single_topic[6365]

0.1428789974950558

In [174]:
single_topic[4632] #word that most representative of this topic

2.141757741496415

In [175]:
top_words_indices = single_topic.argsort()[-20:]  #Top 10 words for this topic
#array([34210,  6365, 23296, ..., 26057, 17507,  4632])

In [176]:
for i in top_words_indices:
    print(cv.get_feature_names()[i])

card
know
read
free
books
learning
app
android
instagram
programming
account
language
facebook
improve
number
phone
english
new
way
learn


1st iteration: Below is the first result of top 20 words on the 1st iteration.
2nd iteration: I added good, best, did, does, use, using on the stopwords and iterate.

    
buy
used
career
examples
difference
free
company
using
mobile
software
google
app
android
engineering
does
good
use
phone
india
best

# Grab the words for each of  7 Topics

In [177]:
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['learning', 'app', 'android', 'instagram', 'programming', 'account', 'language', 'facebook', 'improve', 'number', 'phone', 'english', 'new', 'way', 'learn']


THE TOP 15 WORDS FOR TOPIC #1
['data', 'mechanical', 'marketing', 'engineer', 'time', 'career', 'software', 'math', 'movies', 'india', 'job', 'movie', 'engineering', 'love', 'difference']


THE TOP 15 WORDS FOR TOPIC #2
['answer', 'person', 'old', 'like', 'ask', 'long', 'don', 'question', 'girl', 'questions', 'mean', 'know', 'time', 'people', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['day', 'india', 'live', 'men', 'buy', 'stop', 'iphone', 'women', 'think', 'feel', 'thing', 'sex', 'people', 'like', 'life']


THE TOP 15 WORDS FOR TOPIC #4
['places', 'way', 'word', 'hair', 'water', 'meaning', 'win', 'hillary', 'increase', 'clinton', 'president', 'lose', 'donald', 'weight', 'trump']


THE TOP 15 WORDS FOR TOPIC #5
['energy', 'government', 'earn', 'rs', 'war', 'online', 'india', 'black', 'indian', '1000'

# Attaching Discovered Topic Labels to Original Articles

In [178]:
dtm

<404289x38663 sparse matrix of type '<class 'numpy.int64'>'
	with 1911337 stored elements in Compressed Sparse Row format>

In [179]:
dtm.shape

(404289, 38663)

In [180]:
len(quora)

404289

In [181]:
topic_results = LDA.transform(dtm)

In [182]:
topic_results.shape

(404289, 7)

In [183]:
topic_results[0]

array([0.2067921 , 0.01794019, 0.01786344, 0.01789378, 0.01788922,
       0.7037126 , 0.01790866])

In [190]:
topic_results[0].round(2)

array([0.21, 0.02, 0.02, 0.02, 0.02, 0.7 , 0.02])

In [189]:
topic_results[0].argmax()

5

In [None]:
This means that our model thinks that the first quora question belongs to topic #2.

# Combining Original Data

In [186]:
topic_results.argmax(axis = 1)

array([5, 2, 0, ..., 5, 6, 3])

In [187]:
quora['Topic'] = topic_results.argmax(axis = 1)

In [188]:
quora.head(10)

Unnamed: 0,Question,str_count,Topic
0,What is the step by step guide to invest in sh...,66,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,51,2
2,How can I increase the speed of my internet co...,73,0
3,Why am I mentally very lonely? How can I solve...,50,1
4,"Which one dissolve in water quikly sugar, salt...",76,4
5,Astrology: I am a Capricorn Sun Cap moon and c...,86,4
6,Should I buy tiago?,19,3
7,How can I be a good geologist?,30,3
8,When do you use シ instead of し?,31,2
9,Motorola (company): Can I hack my Charter Moto...,60,0


# Alternative 2: Non-negative Matrix Factorization
=======================================================================

In [147]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [199]:
tfidf = TfidfVectorizer(max_df = 0.9, min_df=2, stop_words= 'english')

In [200]:
dtm_nmf = tfidf.fit_transform(quora['Question'])

In [201]:
dtm_nmf

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [202]:
from sklearn.decomposition import NMF

In [237]:
nmf_model = NMF(n_components = 20, random_state = 42)

Error when fitting the model:

/home/erika/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:1077: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  " improve convergence." % max_iter, ConvergenceWarning)

I added filterwarning below and repeat fitting the model

In [238]:
import warnings
warnings.filterwarnings('ignore', 'Solver terminated early.*')

In [239]:
nmf_model.fit(dtm_nmf)



NMF(n_components=20, random_state=42)

In [205]:
#Displaying Topics

In [240]:
len(tfidf.get_feature_names())

38669

In [241]:
#Displaying random words
for i in range(10):
    random_word_id = random.randint(0,38669)
    print(tfidf.get_feature_names()[random_word_id])

marshal
ganga
zealot
agility
rendertostring
bloodiest
whiteness
sedans
saf
obispo


In [242]:
for i in range(10):
    random_word_id = random.randint(0,38669)
    print(tfidf.get_feature_names()[random_word_id])

animal
fivesquid
fined
recognition
s4
emraan
space
alka
techstars
pinapansin


In [243]:
len(nmf_model.components_)

20

In [244]:
nmf_model.components_

array([[0.00000000e+00, 5.63036920e-02, 5.40156715e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.23892290e-03, 0.00000000e+00, 3.45251649e-05, ...,
        0.00000000e+00, 3.65013292e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.07891142e-04, 4.92671304e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.84654148e-05, 4.54449173e-04, 6.05797981e-05, ...,
        1.70479939e-03, 0.00000000e+00, 1.70479939e-03],
       [3.45021413e-04, 0.00000000e+00, 4.81932600e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [252]:
single_topic_nmf=nmf_model.components_[0]

In [253]:
top_words_indices_nmf = single_topic_nmf.argsort()[-20:]

In [257]:
for i in top_words_indices_nmf:
    print (tfidf.get_feature_names()[i])

app
engineering
friend
website
site
thing
read
place
visit
places
phone
buy
laptop
movie
ways
2016
books
book
movies
best


# Display the top 15 most common words for each of the 7  & 20 topics.

In [225]:
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPICS # {index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPICS # 0
['phone', 'india', 'lose', 'buy', 'laptop', 'time', 'movie', 'ways', '2016', 'weight', 'books', 'book', 'movies', 'way', 'best']
THE TOP 15 WORDS FOR TOPICS # 1
['new', 'compare', 'look', 'cost', 'really', 'girl', 'love', 'long', 'sex', 'time', 'work', 'feel', 'like', 'mean', 'does']
THE TOP 15 WORDS FOR TOPICS # 2
['post', 'answered', 'use', 'improvement', 'delete', 'easily', 'asked', 'google', 'answer', 'answers', 'ask', 'question', 'questions', 'people', 'quora']
THE TOP 15 WORDS FOR TOPICS # 3
['easiest', 'rupee', 'home', 'easy', 'notes', '1000', '500', 'black', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']
THE TOP 15 WORDS FOR TOPICS # 4
['moment', 'live', 'employees', 'like', 'want', 'real', 'love', 'things', 'day', 'important', 'thing', 'know', 'meaning', 'purpose', 'life']
THE TOP 15 WORDS FOR TOPICS # 5
['election', 'war', '1000', 'people', 'notes', '500', 'win', 'think', 'did', 'hillary', 'clinton', 'president', 'donald', 'trump'

In [247]:
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPICS # {index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPICS # 0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']
THE TOP 15 WORDS FOR TOPICS # 1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']
THE TOP 15 WORDS FOR TOPICS # 2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']
THE TOP 15 WORDS FOR TOPICS # 3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']
THE TOP 15 WORDS FOR TOPICS # 4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']
THE TOP 15 WORDS FOR TOPICS # 5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country'

Attaching Discovered Topic Labels to Original Articles

In [260]:
dtm_nmf

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [261]:
dtm_nmf.shape

(404289, 38669)

In [263]:
len(quora)

404289

In [267]:
topic_result_nmf = nmf_model.transform(dtm_nmf)

In [269]:
topic_result_nmf[0]

array([2.75937605e-04, 5.91249293e-05, 6.17687040e-06, 4.95880678e-04,
       3.94126495e-05, 2.62022533e-02, 3.92318931e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.34257472e-04, 1.15869110e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.97456870e-04,
       0.00000000e+00, 6.97269969e-04, 2.13527728e-04, 0.00000000e+00])

In [271]:
topic_result_nmf[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [273]:
topic_result_nmf[0].argmax()

5

#### TASK: Add a new column to the original quora dataframe that labels each question into one of the 20 topic categories.

In [54]:
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [296]:
quora['Topic_nmf_20'] = topic_result_nmf.argmax(axis=1)

In [300]:
quora_topics = quora[['Question','Topic_nmf_20']]

In [301]:
quora_topics.head(10)

Unnamed: 0,Question,Topic_nmf_20
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,19
9,Motorola (company): Can I hack my Charter Moto...,17


Takeaway: On this Quora case with 7 components/topics, NMF (over TF-IDF matrix) gives better result than with LDA (with TF matrix). 
The top keywords of the topics NFM finds are more related and meaningful to the context of Quora corpus. To achieve a similar result between  NMF and LDA, I had to iterate LDA by adding stopwords manually.

# Great job!