In [1]:
import pandas as pd 

#importing dataset
quora = pd.read_csv('quora_questions.csv')
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [2]:
# data preprocessing

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df = 0.95, min_df=2, stop_words='english')

dtm = tfidf.fit_transform(quora['Question'])

In [3]:
# non-negative matrix factorization

from sklearn.decomposition import NMF

nmf_model = NMF(n_components=20)

nmf_model.fit(dtm)



NMF(n_components=20)

In [8]:
# printing the top 20 most common words for each of the 20 topics
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['app', 'engineering', 'friend', 'website', 'site', 'thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 20 WORDS FOR TOPIC #1
['come', 'relationship', 'says', 'universities', 'grads', 'majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 20 WORDS FOR TOPIC #2
['users', 'writer', 'marked', 'search', 'use', 'add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 20 WORDS FOR TOPIC #3
['com', 'facebook', 'job', 'easiest', 'making', 'using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 20 WORDS FOR TOPIC #4
['embarrassing', 'decision', 'biggest', 'work', 'did', 'balance', 'earth',

In [17]:
# adding a label to each topic
topic_label = {0:'Technology', 1:'Relationships', 2:'Q&A', 3:'Investing', 4:'Discussions', 5:'Politics', 6:'Programming', 7:'US Politics', 8:'Politics', 9:'Work Culture', 10:'Engineering', 11:'India', 12:'Social Media', 13:'Communication', 14:'Health', 15:'Movies', 16:'Relationships', 17:'Technology', 18:'Software Engineering', 19:'Foreign'}

In [18]:
# adding a new column to the dataframe
topic_results = nmf_model.transform(dtm)

topic_results.argmax(axis=1)

quora['Topic'] = topic_results.argmax(axis=1)
quora['Label'] = quora['Topic'].map(topic_label)

quora.head()

Unnamed: 0,Question,Topic,Label
0,What is the step by step guide to invest in sh...,5,Politics
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16,Relationships
2,How can I increase the speed of my internet co...,17,Technology
3,Why am I mentally very lonely? How can I solve...,11,India
4,"Which one dissolve in water quikly sugar, salt...",14,Health
