In [1]:
import pandas as pd

# Read the people wiki data

In [2]:
# Load the people-wiki dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='people_wiki.csv')

In [3]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...


# Select the text column

In [4]:
# Pull out the 'text' column; `desc` is the input for both vectorizers below.
desc = df['text']

In [5]:
# Drop missing texts. Rebinding the result (rather than using inplace=True)
# avoids mutating a Series that was pulled out of `df` and keeps this cell
# safe to re-run.
desc = desc.dropna()

# Perform tf-idf vectorization on text column

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Build the tf-idf vectorizer: English stop words are removed, the vocabulary
# is bounded by document frequency (min_df / max_df) and capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

In [7]:
# Learn the vocabulary from `desc` and produce the tf-idf document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(desc)

# Perform NMF fitting over tf-idf vectors

In [8]:
from sklearn.decomposition import NMF
# Fit a 10-component NMF model on the tf-idf matrix (seeded for repeatability).
# NOTE(review): newer scikit-learn releases replace the `alpha` keyword with
# `alpha_W` / `alpha_H`, which are scaled differently — confirm against the
# installed version before upgrading; a mechanical rename would change results.
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

In [9]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-term weights and
        must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic, ordered from largest weight down.

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[pos] for pos in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [10]:
# Look up the vocabulary terms and show the top 20 words of each NMF topic.
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2, so prefer the new accessor and
# fall back only on releases that lack it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 20)

Topic #0: book published books new novel magazine writing writer radio author poetry york editor news series times press fiction written stories
Topic #1: league season played football games team club baseball coach game player career seasons playing signed major goals professional cup draft
Topic #2: album band released song records albums songs rock music singer single solo recorded guitar bands record label recording guitarist release
Topic #3: party minister election elected member parliament government politician candidate assembly leader seat liberal council prime democratic political general elections cabinet
Topic #4: film films television series actor directed role theatre award festival best actress appeared feature tv drama director production comedy roles
Topic #5: world won championships team championship olympics tour cup champion finished medal race olympic racing event competed european title place time
Topic #6: art museum gallery work artist new arts york design exhib

## Check which categories of people you can infer from the keywords above, for example:

## topic 0: accomplished literary person
## topic 1: professional league player
## topic 2: accomplished musician
## etc.

# Perform bag of words vectorization on text column

In [11]:
# Build the bag-of-words (raw count) vectorizer: English stop words are
# removed, the vocabulary is bounded by document frequency (min_df / max_df)
# and capped at 1000 features.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

In [12]:
# Learn the vocabulary from `desc` and produce the term-count document-term matrix.
tf = tf_vectorizer.fit_transform(desc)

# Perform LDA fitting over bag of words vectors

In [13]:
from sklearn.decomposition import LatentDirichletAllocation
# Configure a 10-topic LDA model using the online learning method;
# random_state is pinned for repeatability.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    learning_offset=50.0,
    max_iter=5,
    random_state=0,
)

In [14]:
# Fit the LDA model on the bag-of-words counts; the cell displays the fitted estimator.
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
# Look up the vocabulary terms and show the top 20 words of each LDA topic.
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2, so prefer the new accessor and
# fall back only on releases that lack it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

Topic #0: music album band released records song songs rock singer albums record recorded born known solo single recording guitar race group
Topic #1: art new radio work museum television news design york artist arts media including australian australia gallery london worked zealand works
Topic #2: university professor research college institute science studies school member received international american society national phd fellow director degree award education
Topic #3: international world health human rights years people south women india medical national social time indian africa african family known chinese
Topic #4: film television series theatre award films best role festival appeared directed actor director tv production born won actress comedy drama
Topic #5: party member election minister elected born state served government council president house committee politician political general democratic national appointed leader
Topic #6: music new born school york years orchest

## Compare the categories above with those found earlier by NMF

# Perform LDA fitting over tf-idf vectors

In [16]:
# Re-fit the same `lda` estimator object, this time on the tf-idf matrix.
# After this cell `lda` no longer holds the model fitted on the `tf` counts.
lda.fit(tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [17]:
# Look up the tf-idf vocabulary terms and show the top 20 words of each topic
# from the LDA model fitted on tf-idf vectors.
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was removed in 1.2, so prefer the new accessor and
# fall back only on releases that lack it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tfidf_feature_names, 20)

Topic #0: league baseball games major season played runs game minor career era home seasons signed hit player draft professional record red
Topic #1: film television series films radio theatre tv actor role appeared award best directed actress comedy new drama production producer festival
Topic #2: university research professor president science institute international board director served member law development school business national society technology college american
Topic #3: party election minister served member elected state president government general district court law politician committee council appointed house democratic political
Topic #4: world won championships championship team tour olympics racing champion race olympic finished competed event title record professional medal time win
Topic #5: football season league played team coach club cup player rugby games basketball playing game hockey career goals seasons signed footballer
Topic #6: art music orchestra opera m