In [1]:
import pandas as pd
from em_utilities import *
import sframe as sf
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
import scipy

In [2]:
# Load the articles table from CSV. The loader output below reports four inferred
# columns ([int, str, str, dict]); the displayed frame shows category, text and tf_idf.
dataset= sf.SFrame('KO_articles_tfidf.csv')
# Drop the 'X1' column (presumably a leftover row index — TODO confirm).
# NOTE(review): the result is not reassigned, so this appears to rely on
# remove_column mutating `dataset` in place; re-running this cell on its own may
# then fail because 'X1' is already gone — confirm.
dataset.remove_column('X1')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1503498767.log


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str,dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


category,text,tf_idf
business,Policy for Growth and InnovationI get asked ...,"{'stock': 3.248074979560463, 'r ..."
business,Sam AltmanThe most important story of 2014 ...,"{'petroyuan': 7.237059026124737, 'a ..."
business,Bubble talkI m tired of reading about investors ...,"{'talki': 7.237059026124737, ..."
business,A new team at redditLast week Yishan Wong ...,"{'ohanian': 7.237059026124737, ..."
business,Why Ops Is Taking Over Startup LandA little ...,"{'operations': 26.401277084533294, ..."
business,10 Data Acquisition Strategies for Startups ...,"{'exclusive': 3.28581530754331, ..."
business,One of the Greatest Entrepreneurial Stories ...,"{'all': 0.3912947246598535, ..."
business,2017 YC Annual LetterDear YC Community In ...,"{'represent': 2.583098675967214, 'a ..."
business,How to Build a Startup Ecosystem in Your ...,"{'all': 0.3912947246598535, ..."
business,Growth vs Profitability and Venture ReturnsThere ...,"{'all': 0.3912947246598535, ..."


In [3]:
# Vectorize the raw article text into a TF-IDF document-term matrix,
# ignoring scikit-learn's built-in English stop-word list.
tfidfvec= TfidfVectorizer(stop_words='english')
tf_idf_matrix= tfidfvec.fit_transform(dataset['text'])
# NOTE(review): the disabled lines below look like an earlier approach that read
# precomputed TF-IDF values and a word map from CSV files; delete them if that
# path is no longer needed.
# mapping = sf.SFrame('Map_index_words.csv')
# tfidf_df = pd.read_csv('KO_tfidf.csv')
# tfidf_df.set_index('0', inplace=True)

In [4]:
# Build a raw term-count document-term matrix over the same text, again
# dropping English stop words.
# NOTE(review): no later cell visible here reads `wordvec_matrix` except the
# normalization cell — confirm it is still needed.
countvec = CountVectorizer(stop_words='english')
wordvec_matrix= countvec.fit_transform(dataset['text'])

In [5]:
# Normalize the count matrix, overwriting the un-normalized version.
# NOTE(review): `normalize` is not imported explicitly; presumably it arrives via
# `from em_utilities import *` — confirm which implementation and norm are used.
wordvec_matrix = normalize(wordvec_matrix)



In [None]:
# tfidf = scipy.sparse.csr_matrix(tfidf_df.values)

In [6]:
# Normalize the TF-IDF matrix, overwriting the un-normalized version; every
# clustering cell below consumes this normalized matrix.
# NOTE(review): `normalize` is not imported explicitly; presumably it arrives via
# `from em_utilities import *` — confirm which implementation and norm are used.
tf_idf_matrix = normalize(tf_idf_matrix)

## Smart initialization of model parameters

In [8]:
#Smart Initialization for means
def initialize_means(num_clusters,features_matrix):
    """Seed the EM means with k-means centroids.

    Fits scikit-learn's KMeans (n_init=5, max_iter=400, random_state=1) on
    ``features_matrix`` and uses the fitted centroids as initial means.

    Parameters
    ----------
    num_clusters : int
        Number of clusters (passed to KMeans as ``n_clusters``).
    features_matrix : matrix accepted by ``KMeans.fit``
        One row per document.

    Returns
    -------
    list
        ``[means, cluster_assignment]`` where ``means`` is a list holding one
        centroid vector per cluster and ``cluster_assignment`` is the fitted
        model's ``labels_`` array (one cluster id per row).

    Notes
    -----
    Side effect: reseeds NumPy's global random generator with 5.
    """
    from sklearn.cluster import KMeans
    np.random.seed(5)
    # `n_jobs` is intentionally not passed: it was removed from KMeans in
    # scikit-learn 1.0 (passing it raises TypeError there), and on older
    # releases leaving it unset already means a single job.
    kmeans_model = KMeans(n_clusters=num_clusters, n_init=5, max_iter=400, random_state=1)
    kmeans_model.fit(features_matrix)
    # Split the 2-D centroid array into a list of per-cluster vectors.
    means = list(kmeans_model.cluster_centers_)
    return [means, kmeans_model.labels_]

In [9]:
#Smart initialization for weights
def initialize_weights(num_clusters,features_matrix,cluster_assignment=None):
    """Initial mixture weights: the fraction of documents assigned to each cluster.

    Parameters
    ----------
    num_clusters : int
        Number of clusters; weights are produced for ids 0..num_clusters-1.
    features_matrix : object with ``.shape``
        Only ``shape[0]`` (the number of documents) is used.
    cluster_assignment : numpy array of cluster ids, optional
        One cluster id per document. When omitted, the module-level variable
        named ``cluster_assignment`` is used, which preserves the behavior of
        existing two-argument calls.

    Returns
    -------
    list of float
        ``weights[i]`` is (number of documents labelled ``i``) / (number of documents).

    Raises
    ------
    NameError
        If ``cluster_assignment`` is neither passed nor defined at module level.
    """
    if cluster_assignment is None:
        # Backward-compatible fallback to the notebook-global labels; passing
        # the labels explicitly avoids depending on cell execution order.
        try:
            cluster_assignment = globals()['cluster_assignment']
        except KeyError:
            raise NameError("name 'cluster_assignment' is not defined")
    num_docs = features_matrix.shape[0]
    weights = []
    # `range` works on both Python 2 and Python 3 (the original used `xrange`).
    for i in range(num_clusters):
        # Boolean-mask the labels to count the members of cluster i.
        num_assigned = len(cluster_assignment[cluster_assignment==i])
        weights.append(float(num_assigned) / num_docs)
    return weights

In [10]:
#Smart initialization for covariances
def initialize_covs(num_clusters,features_matrix,means=None,cluster_assignment=None):
    """Initial per-feature (diagonal) variances for each cluster.

    For every cluster, the member rows ``x`` and the cluster mean ``mu`` give
    ``mean(x**2 - 2*x*mu) + mu**2`` per feature, i.e. the mean squared
    deviation from ``mu``. Values below 1e-8 are raised to 1e-8.

    Parameters
    ----------
    num_clusters : int
        Number of clusters; variances are produced for ids 0..num_clusters-1.
    features_matrix : scipy sparse matrix
        One row per document; must support boolean-mask row selection,
        ``.multiply`` and ``.dot``.
    means : sequence of 1-D numpy arrays, optional
        ``means[i]`` is the mean vector of cluster ``i``. When omitted, the
        module-level variable ``means`` is used (legacy behavior).
    cluster_assignment : numpy array of cluster ids, optional
        One cluster id per document. When omitted, the module-level variable
        ``cluster_assignment`` is used (legacy behavior).

    Returns
    -------
    list of 1-D numpy arrays
        One variance vector per cluster.

    Raises
    ------
    NameError
        If an omitted argument is not defined at module level either.
    """
    module_scope = globals()
    # Backward-compatible fallback to the notebook-global values; passing them
    # explicitly avoids depending on cell execution order.
    if means is None:
        if 'means' not in module_scope:
            raise NameError("name 'means' is not defined")
        means = module_scope['means']
    if cluster_assignment is None:
        if 'cluster_assignment' not in module_scope:
            raise NameError("name 'cluster_assignment' is not defined")
        cluster_assignment = module_scope['cluster_assignment']

    covs = []
    # `range` works on both Python 2 and Python 3 (the original used `xrange`).
    for i in range(num_clusters):
        member_rows = features_matrix[cluster_assignment==i]
        # `diag` is expected to be in scope (presumably from em_utilities —
        # TODO confirm) and to turn the mean vector into a diagonal matrix, so
        # the dot product scales each column of member_rows by the mean.
        cov = (member_rows.multiply(member_rows) - 2*member_rows.dot(diag(means[i]))).sum(axis=0).A1 / member_rows.shape[0] \
        + means[i]**2
        # Floor the variances so no feature ends up with a (near-)zero variance.
        cov[cov < 1e-8] = 1e-8
        covs.append(cov)
    return covs

## Visualize words of each cluster

In [11]:
# Fill in the blanks
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word):
    """Print, for every cluster, the ten words with the largest mean value.

    Parameters
    ----------
    tf_idf : any
        Not used by this function; kept so existing calls keep working.
    means : sequence of 1-D numpy arrays
        ``means[c][i]`` is the mean of word ``i`` in cluster ``c``.
    covs : sequence of 1-D arrays
        ``covs[c][i]`` is the variance of word ``i`` in cluster ``c``.
    map_index_to_word : indexable
        Maps a word index to the word itself.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Every print is a single-argument call so the output is identical on
    # Python 2 and Python 3 (the original mixed print() with print statements).
    print('')
    print('==========================================================')

    num_clusters = len(means)
    for c in range(num_clusters):
        print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
        print('\n{0: <12}{1: <12}{2: <12}'.format('Word', 'Mean', 'Variance'))

        # Word indices ordered from the largest to the smallest cluster mean.
        sorted_word_ids = np.argsort(means[c])[::-1]

        for i in sorted_word_ids[:10]:
            print('{0: <12}{1:<10.2e}{2:10.2e}'.format(map_index_to_word[i],
                                                       means[c][i],
                                                       covs[c][i]))
        print('\n==========================================================')

In [32]:
def clusters_report(clusters_idx, categories=None):
    """Print how many product / engineering / business articles each cluster holds.

    Parameters
    ----------
    clusters_idx : iterable of iterables of int
        One entry per cluster, each holding the row indices of its documents.
    categories : indexable of str, optional
        ``categories[i]`` is the category label of document ``i``. When
        omitted, the module-level ``dataset['category']`` column is used, which
        preserves the behavior of existing one-argument calls.

    Returns
    -------
    None
        The report is written to standard output. Documents whose label is not
        one of 'product', 'engineering' or 'business' are not counted.
    """
    if categories is None:
        # Look the column up once instead of once per document.
        categories = dataset['category']
    for cluster_id, cluster_indices in enumerate(clusters_idx):
        countP = 0
        countB = 0
        countE = 0
        for i in cluster_indices:
            label = categories[i]
            if label == 'product':
                countP += 1
            elif label == 'engineering':
                countE += 1
            elif label == 'business':
                countB += 1
        # Joining str() of each part with single spaces reproduces exactly what
        # the former Python 2 `print a, b, c` statements wrote, while also
        # being valid on Python 3.
        print(" ".join(str(part) for part in
                       ("Cluster ", cluster_id, "\n==========================\n")))
        print(" ".join(str(part) for part in
                       ("product count : ", countP,
                        "\nengineering count : ", countE,
                        "\nbusiness count : ", countB, "\n")))
    

In [13]:
# Split the articles by their labelled category and report the size of each
# subset, as a baseline to compare the cluster compositions against.
business_set= dataset[dataset['category']=='business']
engineering_set= dataset[dataset['category']=='engineering']
product_set= dataset[dataset['category']=='product']
# Single-argument print() calls produce the same text as the former Python 2
# print statements (label, one separating space, count) and match the print()
# style already used elsewhere in this notebook.
print("{0} {1}".format("Business data: ", len(business_set)))
print("{0} {1}".format("Engineering data: ", len(engineering_set)))
print("{0} {1}".format("Product data: ", len(product_set)))

Business data:  561
Engineering data:  277
Product data:  552


In [14]:
# Model 1 with 3 clusters (Main categories)
# Order matters: initialize_covs and initialize_weights read the module-level
# `means` / `cluster_assignment` set on the next line, so they must run after it
# and before those names are overwritten by another model's initialization.
(means , cluster_assignment)= initialize_means(3,tf_idf_matrix)
covs= initialize_covs(3,tf_idf_matrix)
weights= initialize_weights(3,tf_idf_matrix)
# NOTE(review): EM_for_high_dimension is not defined in this notebook; presumably
# it comes from `from em_utilities import *` — confirm its contract there.
model_em_3k= EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

In [15]:
# Model 2 with 10 clusters.
# Order matters: initialize_covs and initialize_weights read the module-level
# `means` / `cluster_assignment` set on the next line. This cell also overwrites
# the `means`, `cluster_assignment`, `covs` and `weights` left by the 3-cluster cell.
(means , cluster_assignment)= initialize_means(10,tf_idf_matrix)
covs= initialize_covs(10,tf_idf_matrix)
weights= initialize_weights(10,tf_idf_matrix)
# NOTE(review): EM_for_high_dimension is not defined in this notebook; presumably
# it comes from `from em_utilities import *` — confirm its contract there.
model_em_10k= EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

In [34]:
# Print the ten highest-mean words of each cluster in the 3-cluster model.
# The first argument is not used by visualize_EM_clusters; the vectorizer's
# feature names supply the index -> word mapping.
visualize_EM_clusters(tf_idf_matrix, model_em_3k['means'], model_em_3k['covs'], tfidfvec.get_feature_names())


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
data        3.18e-02    4.91e-03
netflix     2.82e-02    1.68e-02
companies   2.01e-02    1.54e-03
time        1.90e-02    4.41e-04
business    1.86e-02    1.44e-03
code        1.83e-02    2.45e-03
like        1.82e-02    3.42e-04
service     1.80e-02    1.99e-03
company     1.76e-02    1.21e-03
new         1.75e-02    5.12e-04

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.44e-01    1.01e-02
customer    5.52e-02    5.85e-03
team        5.38e-02    4.21e-03
customers   5.38e-02    4.83e-03
manager     4.77e-02    5.18e-03
management  4.11e-02    4.32e-03
managers    3.98e-02    3.60e-03
products    3.61e-02    1.66e-03
users       3.55e-02    3.16e-03
market      3.50e-02    4.41e-03

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
design      7.03e-02    1.19e-02
team        5.86e-02    5.65e-03
people      5.54

In [33]:
# Print the ten highest-mean words of each cluster in the 10-cluster model.
# The first argument is not used by visualize_EM_clusters; the vectorizer's
# feature names supply the index -> word mapping.
visualize_EM_clusters(tf_idf_matrix, model_em_10k['means'], model_em_10k['covs'], tfidfvec.get_feature_names())


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
company     4.83e-02    3.26e-03
startup     4.27e-02    4.71e-03
people      4.07e-02    1.93e-03
business    3.88e-02    2.90e-03
companies   3.66e-02    2.38e-03
investors   2.95e-02    3.70e-03
time        2.83e-02    6.75e-04
founders    2.70e-02    3.10e-03
like        2.63e-02    5.71e-04
market      2.49e-02    1.96e-03

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
data        5.34e-02    8.40e-03
code        3.72e-02    4.48e-03
microservices3.05e-02    1.23e-02
services    2.43e-02    3.18e-03
service     2.29e-02    3.02e-03
serverless  2.12e-02    1.07e-02
use         2.01e-02    4.50e-04
new         1.96e-02    5.97e-04
time        1.93e-02    4.59e-04
application 1.89e-02    1.21e-03

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.28e-01    1.03e-02
customer    5.76e-02    5.95e-03
customers   5.7

## Clustering report for each model

In [35]:
# Cluster sizes for the first model (3 clusters): sum each cluster's
# responsibility column over all documents.
# The 'resp' entry is loaded into an SFrame and unpacked so that every cluster
# becomes its own column.
resps_3k= sf.SFrame(model_em_3k['resp'])
resps_3k= resps_3k.unpack('X1', '')
cluster_id=0
for col in resps_3k.column_names():
    cluster_3k= np.array(resps_3k[col])
    # Single-argument print() with str() on the sum writes exactly what the
    # former Python 2 print statement wrote and is also valid on Python 3.
    print("{0} {1} {2} {3}".format("cluster ", cluster_id, "assignments: ", str(cluster_3k.sum())))
    cluster_id+=1

cluster  0 assignments:  635.0
cluster  1 assignments:  225.0
cluster  2 assignments:  530.0


In [36]:
# Cluster sizes for the second model (10 clusters): sum each cluster's
# responsibility column over all documents.
# The 'resp' entry is loaded into an SFrame and unpacked so that every cluster
# becomes its own column.
resps_10k= sf.SFrame(model_em_10k['resp'])
resps_10k= resps_10k.unpack('X1', '')
cluster_id=0
for col in resps_10k.column_names():
    cluster_10k= np.array(resps_10k[col])
    # Single-argument print() with str() on the sum writes exactly what the
    # former Python 2 print statement wrote and is also valid on Python 3.
    print("{0} {1} {2} {3}".format("cluster ", cluster_id, "assignments: ", str(cluster_10k.sum())))
    cluster_id+=1

cluster  0 assignments:  358.0
cluster  1 assignments:  301.0
cluster  2 assignments:  257.0
cluster  3 assignments:  127.0
cluster  4 assignments:  17.0
cluster  5 assignments:  155.0
cluster  6 assignments:  18.0
cluster  7 assignments:  25.0
cluster  8 assignments:  121.0
cluster  9 assignments:  11.0


In [37]:
# Articles' categories in model 1 with 3 clusters
# For every cluster column, collect the row indices of the documents whose
# responsibility for that cluster is non-zero, then tabulate their categories.
# NOTE(review): a document with non-zero responsibility for several clusters
# would be counted in each of them — confirm the responsibilities are hard (0/1).
clusters_3k_idx=[]
for col in resps_3k.column_names():
    cluster_3k= np.array(resps_3k[col])
    # nonzero() returns a tuple with one index array per dimension; [0] takes the row indices.
    cluster_3k= cluster_3k.nonzero()[0]
    clusters_3k_idx.append(cluster_3k)
clusters_report(clusters_3k_idx)

Cluster  0 

product count :  92 
engineering count :  263 
business count :  280 

Cluster  1 

product count :  179 
engineering count :  0 
business count :  46 

Cluster  2 

product count :  281 
engineering count :  14 
business count :  235 



In [38]:
# Articles' categories in model 2 with 10 clusters
# For every cluster column, collect the row indices of the documents whose
# responsibility for that cluster is non-zero, then tabulate their categories.
# NOTE(review): a document with non-zero responsibility for several clusters
# would be counted in each of them — confirm the responsibilities are hard (0/1).
clusters_10k_idx=[]
for col in resps_10k.column_names():
    cluster_10k= np.array(resps_10k[col])
    # nonzero() returns a tuple with one index array per dimension; [0] takes the row indices.
    cluster_10k= cluster_10k.nonzero()[0]
    clusters_10k_idx.append(cluster_10k)
clusters_report(clusters_10k_idx)

Cluster  0 

product count :  49 
engineering count :  6 
business count :  303 

Cluster  1 

product count :  32 
engineering count :  235 
business count :  34 

Cluster  2 

product count :  207 
engineering count :  0 
business count :  50 

Cluster  3 

product count :  88 
engineering count :  3 
business count :  36 

Cluster  4 

product count :  7 
engineering count :  0 
business count :  10 

Cluster  5 

product count :  45 
engineering count :  8 
business count :  102 

Cluster  6 

product count :  15 
engineering count :  2 
business count :  1 

Cluster  7 

product count :  1 
engineering count :  23 
business count :  1 

Cluster  8 

product count :  108 
engineering count :  0 
business count :  13 

Cluster  9 

product count :  0 
engineering count :  0 
business count :  11 

