In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF # alternative to Latent Dirichlet approach
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Data source settings: the SQLite file to open and the logical table name
# (the notebook reads the table called "token_<table>").
base = 'stage.db'
table = 'Startups100'

In [None]:
# Open the database named in `base` (previously the path was hard-coded here,
# so changing `base` had no effect) and load the tokenised table into a frame.
# The table name is a notebook-level constant, not user input, so it is
# concatenated into the query; SQL placeholders cannot bind identifiers.
# NOTE: `conn` is left open for any later queries; call conn.close() when done.
conn = sqlite3.connect(base)
data = pd.read_sql("SELECT * FROM token_" + table, conn)

In [None]:
# Vocabulary pruning thresholds passed to CountVectorizer. As floats they are
# interpreted as proportions of documents.
min_df = 0.05  # ignore terms whose document frequency is below 5%
max_df = 0.90  # ignore terms whose document frequency is above 90%

In [None]:
# Build the bag-of-words representation of the 'Text' column.
vec = CountVectorizer(max_df=max_df, min_df=min_df)

# Fit and transform once (the original ran fit_transform twice and discarded
# the first result), then densify to a documents x terms count matrix.
tf = vec.fit_transform(data['Text']).toarray()

# Vocabulary terms, in the same column order as `tf`.
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# get_feature_names_out() when available and fall back on older releases.
# Converted to a plain list so downstream code sees the same type as before.
if hasattr(vec, 'get_feature_names_out'):
    features = vec.get_feature_names_out().tolist()
else:
    features = vec.get_feature_names()

In [None]:
# Sweep candidate topic counts 1..24 and record, for each, the perplexity the
# fitted LDA model reports on the training matrix (its `bound_` attribute).
scores = []
for n_topics in range(1, 25):
    # Fixed seed so the sweep is reproducible across runs.
    model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    scores.append(model.fit(tf).bound_)  # fit() returns the fitted estimator

In [None]:
import matplotlib.pyplot as plt

# Plot training perplexity against the number of topics.
# `topics` was never defined anywhere in the notebook, which made this cell fail
# with a NameError on a fresh kernel. Derive the x-axis from `scores` instead:
# the sweep starts at 1 topic and appends one score per candidate count.
topics = list(range(1, len(scores) + 1))

plt.plot(topics, scores)
# Dashed vertical marker at 4 topics, spanning the observed score range.
plt.plot([4, 4], [min(scores), max(scores)], '--')
plt.xticks([i for i in topics if i % 2 == 0])  # label even topic counts only
plt.xlabel('Number of topics')
plt.ylabel('Training perplexity (bound_)')
plt.show()

In [None]:
# Settings for the final model and its summary table.
no_topics = 4   # how many topics the final LDA model is fitted with
no_terms = 10   # how many terms are meant to describe each topic

In [None]:
# Final LDA model at the chosen topic count; the fixed seed keeps the topics
# reproducible between runs.
model = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=42,
)
model.fit(tf)  # fits in place on the term-count matrix

In [None]:
# Summarise each topic by its highest-weighted terms: one "word" column and one
# "weight" column per topic, with rows ordered from strongest to weakest term.
topic_dict = {}
for topic_id, topic in enumerate(model.components_):
    # Indices of the `no_terms` largest weights, in descending order. The
    # original hard-coded 10 here, so `no_terms` had no effect, and it ran the
    # argsort twice per topic.
    top_idx = topic.argsort()[::-1][:no_terms]
    topic_dict['Topic %d word' % (topic_id)] = ['{}'.format(features[i]) for i in top_idx]
    # Weights are the raw component values, stored as strings as before.
    topic_dict['Topic %d weight' % (topic_id)] = ['{}'.format(topic[i]) for i in top_idx]
results = pd.DataFrame(topic_dict)

In [None]:
# Show the per-topic term/weight table as this cell's output.
results