In [9]:
import numpy as np
import spacy
import pandas as pd
import sklearn


In [3]:
# Load the questions dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('quora_questions.csv')

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [5]:
# Inspect the raw question text column (the only input the topic model uses).
df['Question']

0         What is the step by step guide to invest in sh...
1         What is the story of Kohinoor (Koh-i-Noor) Dia...
2         How can I increase the speed of my internet co...
3         Why am I mentally very lonely? How can I solve...
4         Which one dissolve in water quikly sugar, salt...
                                ...                        
404284    How many keywords are there in the Racket prog...
404285            Do you believe there is life after death?
404286                                    What is one coin?
404287    What is the approx annual cost of living while...
404288                What is like to have sex with cousin?
Name: Question, Length: 404289, dtype: object

In [6]:
# Summarise columns, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  404289 non-null  object
dtypes: object(1)
memory usage: 3.1+ MB


In [8]:
# Count missing values per column; the vectorizer below cannot handle NaN text,
# so this should report zero for 'Question'.
df.isna().sum()

Question    0
dtype: int64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vocabulary pruning:
#   stop_words='english' -> remove scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms that appear in fewer than 2 questions
#   max_df=0.95          -> ignore terms that appear in more than 95% of questions
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [17]:
# Learn the vocabulary and IDF weights from the questions, then encode every
# question as a TF-IDF weighted row of a sparse document-term matrix.
dtm = tfidf.fit_transform(df['Question'])

In [22]:
dtm    # document-term matrix: one row per question, one column per vocabulary term

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.decomposition import NMF

In [23]:
# Non-negative matrix factorisation of the TF-IDF matrix into 20 topics.
# The legacy `alpha` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (replaced by `alpha_W` / `alpha_H`), so
# supplying it raises TypeError on current releases. Its value here was 0.0,
# which is also the default regularisation strength, so the fitted model is
# unchanged on older versions.
nmf_model = NMF(
    n_components=20,        # number of topics to extract
    init=None,              # let scikit-learn pick its default initialisation
    solver='cd',            # coordinate descent
    beta_loss='frobenius',
    tol=0.0001,
    max_iter=200,
    random_state=42,        # fixed seed so the topics are reproducible
    l1_ratio=0.0,
    verbose=0,
    shuffle=False,
)

In [25]:
# Fit the topic model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)



NMF(n_components=20, random_state=42)

In [28]:
# Topic-term weights: one row per topic, one column per vocabulary term.
nmf_model.components_

array([[0.00000000e+00, 5.63036920e-02, 5.40156715e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.23892290e-03, 0.00000000e+00, 3.45251649e-05, ...,
        0.00000000e+00, 3.65013292e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.07891142e-04, 4.92671304e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.84654148e-05, 4.54449173e-04, 6.05797981e-05, ...,
        1.70479939e-03, 0.00000000e+00, 1.70479939e-03],
       [3.45021413e-04, 0.00000000e+00, 4.81932600e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [30]:
# Resolve the vocabulary once instead of rebuilding it for every printed word.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# `get_feature_names_out()` and fall back only on releases that predate it.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Print the 15 highest-weighted terms of every topic.
for topic in nmf_model.components_:
    # argsort is ascending, so the last 15 indices are the strongest terms
    # (the single most characteristic term is printed last).
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 'olympics', 'available', 'job', 'spotify', 'war', 'pakistan', 'india']


['beginners', 'online', 'english', 'book', 'did', 'hacking', 'want', 'python', 'languages', 'java', 'learni

In [31]:
# Project every question onto the learned topics: one row per question,
# one column per topic, values are the topic weights.
topic_results = nmf_model.transform(dtm)

In [32]:
# Index of the highest-weighted topic for each question.
topic_results.argmax(axis=1)

array([ 5, 16, 17, ..., 11, 11,  9], dtype=int64)

In [33]:
# Label each question with its dominant topic id (adds a 'topic' column to df in place).
df['topic'] = topic_results.argmax(axis=1)

In [34]:
# Confirm the new 'topic' column sits alongside the question text.
df.head()

Unnamed: 0,Question,topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
