For this project I'll be working with a dataset of over 400,000 Quora questions that have no labeled category.

In [2]:
# import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [4]:
# Load the raw questions from the CSV file in the working directory,
# then preview the first five rows to check that the file parsed as expected.
quora_df = pd.read_csv(
    filepath_or_buffer='quora_questions.csv',
)
quora_df.head(n=5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [6]:
# Preprocessing: turn the question text into a TF-IDF document-term matrix.
#   max_df=0.95          -> drop terms that appear in more than 95% of the questions
#   min_df=2             -> drop terms that appear in fewer than 2 questions
#   stop_words='english' -> drop scikit-learn's built-in English stop words
idf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
# One row per question, one column per retained vocabulary term.
dtm = idf.fit_transform(raw_documents=quora_df['Question'])

In [10]:
# Non-negative matrix factorization: decompose the document-term matrix
# into 20 topics. random_state is pinned so repeated runs use the same seed.
nmf_model = NMF(
    n_components=20,
    random_state=42,
)
# Left as the last expression so the fitted estimator's repr is displayed.
nmf_model.fit(X=dtm)



NMF(n_components=20, random_state=42)

In [12]:
# Print the 15 highest-weighted words for each of the 20 topics.
#
# The vocabulary is fetched once, up front:
#   * TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0
#     and removed in 1.2; get_feature_names_out() is its replacement.
#   * Calling it inside the comprehension rebuilt the whole vocabulary list
#     for every single printed word.
if hasattr(idf, "get_feature_names_out"):
  feature_names = idf.get_feature_names_out()
else:  # scikit-learn < 1.0
  feature_names = idf.get_feature_names()

for i, topic in enumerate(nmf_model.components_):
  # Headings are 1-based (i + 1) for readability.
  print(f"The top 15 words for # {i + 1} topic")
  # argsort() is ascending, so the last 15 indices are the terms with the
  # largest weights in this topic; the strongest term is printed last.
  print([feature_names[index] for index in topic.argsort()[-15:]])
  print("\n\n")

The top 15 words for # 1 topic




['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']



The top 15 words for # 2 topic
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']



The top 15 words for # 3 topic
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']



The top 15 words for # 4 topic
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']



The top 15 words for # 5 topic
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']



The top 15 words for # 6 topic
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 'olympics', 'availab

In [14]:
# Label every question with its dominant topic and store it in a new column.
# transform() yields one row of topic weights per question; argmax over axis 1
# picks the column index of the largest weight in each row.
# NOTE: these ids are 0-based (0..19), whereas the headings printed by the
# top-words cell are 1-based ("# 1 topic" corresponds to id 0 here).
topic_res = (
    nmf_model
    .transform(dtm)
    .argmax(axis=1)
)
quora_df['topics'] = topic_res
quora_df.head(n=15)

Unnamed: 0,Question,topics
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,19
9,Motorola (company): Can I hack my Charter Moto...,17
