In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Path of the SQLite database file.
base = 'stage.db'
# Table of interest; its name is used to build the name of the table that is read.
table = 'Startups100'

In [None]:
# Open the database named by the `base` setting (previously the path was
# hard-coded here a second time, so editing `base` had no effect).
# NOTE(review): the connection is left open in case later cells still use
# `conn`; call conn.close() once no further queries are needed.
conn = sqlite3.connect(base)
# Load the whole "feat_<table>" table into a DataFrame.
data = pd.read_sql("SELECT * FROM feat_" + table, conn)

In [None]:
# One entry per distinct site (in order of first appearance): the list of
# TFIDF values taken from that site's rows, in row order.
documents = [
    list(data.loc[data.Site == site].TFIDF)
    for site in data.Site.unique()
]

In [None]:
# Elbow-method sweep: fit k-means for every cluster count from 1 up to half the
# number of documents and record each fitted model's inertia.
distortions = []  # distortions[j] holds the inertia obtained with j + 1 clusters
for k in range(1, len(documents) // 2 + 1):
    # Fixed seed (the same value the final model uses) so the curve is
    # reproducible from one run to the next.
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(documents)
    distortions.append(model.inertia_)

In [None]:
import matplotlib.pyplot as plt

# distortions[j] was computed with j + 1 clusters, so the x-axis has to start
# at 1; plotting against 0..len-1 shifts every point (and the marker below)
# one cluster to the left.
cluster_counts = range(1, len(distortions) + 1)

fig, ax = plt.subplots()
ax.plot(cluster_counts, distortions)
# Dashed vertical marker at 8 clusters, spanning the full range of scores.
ax.plot([8, 8], [min(distortions), max(distortions)], '--')
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Distortion score")
plt.show()

In [None]:
# Number of clusters the final k-means model is fitted with.
no_clusters = 8

In [None]:
# Final clustering: k-means with the chosen cluster count and a fixed seed.
kmeans_params = {'n_clusters': no_clusters, 'random_state': 42}
model = KMeans(**kmeans_params)
# Fit on the per-site feature lists; kept as the cell's trailing expression.
model.fit(documents)

In [None]:
# Results table: one row per distinct term (index named 'Term') and one
# 'Cluster k' column per cluster holding that cluster centre's coordinates.
# NOTE(review): this assumes the order of data.Term.unique() matches the
# feature order inside each entry of `documents` - confirm against the data.
term_index = pd.Index(data.Term.unique(), name='Term')
centre_columns = {
    'Cluster ' + str(k): model.cluster_centers_[k]
    for k in range(no_clusters)
}
results = pd.DataFrame(centre_columns, index=term_index)

In [None]:
# Trailing expression: show the term-by-cluster results table as the cell output.
results