In [1]:
from em_utilities import *
import sframe as sf
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.neighbors import NearestNeighbors
import scipy
import time

# Section 0:
## Dataset definition and feature extraction (tf-idf)

In [2]:
# Load the article table from CSV. Later cells read its 'text', 'category' and 'X1' columns.
dataset= sf.SFrame('KO_articles_tfidf.csv')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1504103005.log


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str,dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
# Fit a TF-IDF vocabulary (English stop words removed) on the article text.
# `tfidfvec` is kept because later cells reuse it to vectorize unseen articles.
tfidfvec= TfidfVectorizer(stop_words='english')
tf_idf_matrix= tfidfvec.fit_transform(dataset['text'])
# `normalize` is not imported by name in this notebook; presumably it arrives via
# `from em_utilities import *` -- TODO confirm which normalization it applies.
tf_idf_matrix = normalize(tf_idf_matrix)

# Section 1: 
## Model Parameters smart initialization

A K-means model with k-means++ seeding is used to initialize the parameters of the EM model.
- The K-means centroids are used to initialize the means (the cluster centroids).

In [4]:
# Smart initialization of the cluster means from a k-means++ seeded KMeans fit.
def initialize_means(num_clusters,features_matrix):
    """Fit KMeans on ``features_matrix`` and return its centroids and labels.

    Parameters
    ----------
    num_clusters : int
        Number of clusters passed to KMeans as ``n_clusters``.
    features_matrix : matrix-like
        Data handed unchanged to ``KMeans.fit``.

    Returns
    -------
    list
        ``[means, cluster_assignment]`` where ``means`` is a Python list with
        one centroid per cluster and ``cluster_assignment`` is the fitted
        model's ``labels_``.

    Notes
    -----
    Calling this function re-seeds NumPy's global random state with 5.
    """
    from sklearn.cluster import KMeans

    # Global NumPy seed is reset on every call (side effect kept on purpose).
    np.random.seed(5)

    kmeans_options = dict(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=5,
        max_iter=400,
        random_state=1,
        n_jobs=1,
    )
    fitted = KMeans(**kmeans_options).fit(features_matrix)

    # One entry per centroid row, plus the per-row cluster labels.
    return [list(fitted.cluster_centers_), fitted.labels_]

In [5]:
# Smart initialization of the mixture weights from hard cluster assignments.
def initialize_weights(num_clusters,features_matrix,cluster_assignment):
    """Return each cluster's share of the documents as its initial weight.

    Parameters
    ----------
    num_clusters : int
        Number of clusters; one weight is produced for each id in
        ``0 .. num_clusters - 1``.
    features_matrix : matrix-like
        Only ``features_matrix.shape[0]`` (the number of documents) is read.
    cluster_assignment : array-like of int
        Cluster id of every document. It is converted with ``np.asarray`` so
        plain Python lists work as well as NumPy arrays.

    Returns
    -------
    list of float
        ``weights[i]`` is the number of documents labelled ``i`` divided by
        the total number of documents.

    Raises
    ------
    ZeroDivisionError
        If ``features_matrix`` has zero rows.
    """
    num_docs = features_matrix.shape[0]
    # Without asarray, `some_list == i` is a single bool and the mask breaks.
    labels = np.asarray(cluster_assignment)
    # `range` instead of `xrange` keeps the cell runnable on Python 2 and 3.
    return [float(np.count_nonzero(labels == i)) / num_docs
            for i in range(num_clusters)]

In [6]:
# Smart initialization of the diagonal covariances (per-feature variances).
def initialize_covs(num_clusters,features_matrix,cluster_assignment,means=None):
    """Return one per-feature variance vector for each cluster.

    For cluster ``i`` with member rows ``x`` and centre ``mu = means[i]`` the
    result is ``mean((x - mu) ** 2)`` per feature, computed in expanded form
    ``(sum(x**2) - 2 * mu * sum(x)) / n + mu**2`` so the sparse rows are never
    densified. Values below ``1e-8`` are raised to ``1e-8``.

    Parameters
    ----------
    num_clusters : int
        Number of clusters; ids ``0 .. num_clusters - 1`` are processed.
    features_matrix : scipy.sparse matrix
        Document-by-feature matrix; must support boolean row selection,
        ``.multiply`` and ``.sum(axis=0)``.
    cluster_assignment : array-like of int
        Cluster id of every row of ``features_matrix``.
    means : sequence of 1-D arrays, optional
        Cluster centres. When omitted, the notebook-level variable ``means``
        is used, which is what this function always did implicitly; existing
        three-argument calls therefore behave as before.

    Returns
    -------
    list of numpy.ndarray
        ``covs[i]`` holds the per-feature variances of cluster ``i``. A cluster
        with no members gets the variance floor for every feature instead of
        the NaN values a division by zero would produce.

    Raises
    ------
    NameError
        If ``means`` is not passed and no notebook-level ``means`` exists.
    """
    if means is None:
        # Legacy path: fall back to the global the original code relied on.
        try:
            means = globals()['means']
        except KeyError:
            raise NameError("initialize_covs needs cluster means: pass `means=` "
                            "explicitly or define a notebook-level `means` first")

    min_variance = 1e-8
    labels = np.asarray(cluster_assignment)
    num_features = features_matrix.shape[1]

    covs = []
    for i in range(num_clusters):
        member_rows = features_matrix[labels == i]
        num_members = member_rows.shape[0]
        if num_members == 0:
            # Nothing to average over; use the floor rather than dividing by 0.
            covs.append(np.full(num_features, min_variance))
            continue

        center = np.asarray(means[i], dtype=float).ravel()
        # Column sums of x**2 and x; asarray/ravel flattens the 1 x d result.
        sum_sq = np.asarray(member_rows.multiply(member_rows).sum(axis=0)).ravel()
        sum_x = np.asarray(member_rows.sum(axis=0)).ravel()

        cov = (sum_sq - 2.0 * center * sum_x) / num_members + center ** 2
        cov[cov < min_variance] = min_variance
        covs.append(cov)
    return covs

# Section 2:
## Training Models with different number of clusters

For each model, the parameters are initialized first, and training then starts using the Expectation-Maximization algorithm.

In [7]:
# Model 1 with 10 clusters.
# NOTE: initialize_covs is called without the means here, so it reads them from the
# notebook-level variable `means`. Keep that exact name and keep this assignment
# before the initialize_covs call.
(means , cluster_assignment_10model)= initialize_means(10,tf_idf_matrix)
covs= initialize_covs(10,tf_idf_matrix, cluster_assignment_10model)
weights= initialize_weights(10,tf_idf_matrix, cluster_assignment_10model)
# EM_for_high_dimension is not defined in this notebook; presumably it comes from
# em_utilities. Later cells read the 'means', 'covs', 'resp' and 'weights' keys of its result.
model_em_10k= EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

In [8]:
# Model 2 with 20 clusters.
# NOTE: this cell overwrites the notebook-level `means`, `covs` and `weights` that the
# 10-cluster cell created. initialize_covs is called without the means, so it reads them
# from the notebook-level `means` assigned on the next line.
(means , cluster_assignment_20model)= initialize_means(20,tf_idf_matrix)
covs= initialize_covs(20,tf_idf_matrix, cluster_assignment_20model)
weights= initialize_weights(20,tf_idf_matrix, cluster_assignment_20model)
# Same training call as for model 1, now with the 20-cluster initial parameters.
model_em_20k= EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

# Section 3:
## Evaluation report for each cluster (Interpreting clusters)

The evaluation report has two parts. The first lists, for each cluster, the words that best represent the cluster and help to interpret it. The second shows the variety of article types within each cluster by counting the articles of each category.

In [9]:
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word, num_words=10):
    """Print, for every cluster, the words with the largest mean values.

    Parameters
    ----------
    tf_idf : any
        Unused; kept so existing calls keep working.
    means : sequence of 1-D arrays
        ``means[c][i]`` is the mean of word ``i`` in cluster ``c``.
    covs : sequence of 1-D arrays
        ``covs[c][i]`` is the variance of word ``i`` in cluster ``c``.
    map_index_to_word : sequence of str
        Maps a feature index to its word.
    num_words : int, optional
        How many top words to list per cluster (default 10, the previously
        hard-coded value).

    Notes
    -----
    The word column is 12 characters wide, as before, and grows only when one
    of the listed words would otherwise touch the mean column (a 13-letter
    word used to print as ``microservices3.05e-02``).
    Every ``print`` call passes a single string, so the output is the same
    whether ``print`` is the Python 2 statement or the Python 3 function.
    """
    print('')
    print('==========================================================')

    num_clusters = len(means)
    for c in range(num_clusters):
        # Indices of the words with the largest means, largest first.
        top_word_ids = np.argsort(means[c])[::-1][:num_words]
        top_words = [map_index_to_word[i] for i in top_word_ids]
        # At least the historical 12 columns, plus one separating space
        # after the longest word when it needs more room.
        word_width = max([12] + [len(word) + 1 for word in top_words])

        print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
        print('\n{0: <{width}}{1: <12}{2: <12}'.format(
            'Word', 'Mean', 'Variance', width=word_width))

        for i, word in zip(top_word_ids, top_words):
            print('{0: <{width}}{1:<10.2e}{2:10.2e}'.format(
                word, means[c][i], covs[c][i], width=word_width))
        print('\n==========================================================')

In [10]:
def clusters_report(clusters_idx):
    """Print how many product / engineering / business articles each cluster has.

    Parameters
    ----------
    clusters_idx : iterable of iterables of int
        One entry per cluster, each holding the row indices (into the
        notebook-level ``dataset``) of the articles in that cluster.
        Clusters are numbered 0, 1, 2, ... in iteration order.

    Notes
    -----
    Reads the notebook-level ``dataset['category']`` column. Articles whose
    category is not one of the three reported ones are not counted.
    The printed text is byte-for-byte what the previous multi-argument
    Python 2 print statements produced, but each ``print`` now receives a
    single string so it also works as the Python 3 function.
    """
    # Read the column once instead of up to three lookups per article.
    categories = list(dataset['category'])

    for cluster_id, cluster_indices in enumerate(clusters_idx):
        counts = {'product': 0, 'engineering': 0, 'business': 0}
        for i in cluster_indices:
            category = categories[i]
            if category in counts:
                counts[category] += 1

        print("Cluster  %s \n==========================\n" % cluster_id)
        print("product count :  %s \nengineering count :  %s \nbusiness count :  %s \n"
              % (counts['product'], counts['engineering'], counts['business']))

In [11]:
# Top words per cluster for the 10-cluster model; the vectorizer's feature names
# supply the column-index -> word mapping.
visualize_EM_clusters(tf_idf_matrix, model_em_10k['means'], model_em_10k['covs'], tfidfvec.get_feature_names())


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
company     4.83e-02    3.26e-03
startup     4.27e-02    4.71e-03
people      4.07e-02    1.93e-03
business    3.88e-02    2.90e-03
companies   3.66e-02    2.38e-03
investors   2.95e-02    3.70e-03
time        2.83e-02    6.75e-04
founders    2.70e-02    3.10e-03
like        2.63e-02    5.71e-04
market      2.49e-02    1.96e-03

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
data        5.34e-02    8.40e-03
code        3.72e-02    4.48e-03
microservices3.05e-02    1.23e-02
services    2.43e-02    3.18e-03
service     2.29e-02    3.02e-03
serverless  2.12e-02    1.07e-02
use         2.01e-02    4.50e-04
new         1.96e-02    5.97e-04
time        1.93e-02    4.59e-04
application 1.89e-02    1.21e-03

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.28e-01    1.03e-02
customer    5.76e-02    5.95e-03
customers   5.7

In [12]:
# Top words per cluster for the 20-cluster model; the vectorizer's feature names
# supply the column-index -> word mapping.
visualize_EM_clusters(tf_idf_matrix, model_em_20k['means'], model_em_20k['covs'], tfidfvec.get_feature_names())


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
pitch       1.20e-01    2.29e-02
story       1.01e-01    2.12e-02
investors   8.74e-02    1.20e-02
investor    6.82e-02    1.29e-02
startup     6.59e-02    3.21e-02
start       5.06e-02    1.60e-02
company     5.06e-02    3.31e-03
founders    4.96e-02    6.69e-03
product     4.72e-02    3.23e-03
habit       4.40e-02    1.15e-02

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
people      7.25e-02    4.49e-03
work        4.99e-02    3.34e-03
time        4.81e-02    1.08e-03
team        4.79e-02    2.55e-03
things      3.91e-02    1.74e-03
company     3.79e-02    2.43e-03
ve          3.70e-02    1.48e-03
like        3.67e-02    9.08e-04
buffer      3.39e-02    9.56e-03
just        3.36e-02    8.72e-04

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
netflix     6.58e-01    4.27e-03
blog        2.51e-01    7.75e-03
technology  1.75

In [13]:
# Number of articles in each cluster for the first model (10 clusters).
# The responsibilities are wrapped in an SFrame and the single column 'X1' is unpacked,
# so the loop below visits one column per unpacked position.
# presumably one column per cluster, in cluster order -- TODO confirm the layout of 'resp'
resps_10k= sf.SFrame(model_em_10k['resp'])
resps_10k= resps_10k.unpack('X1', '')
cluster_id=0
# cluster id -> result of nonzero() on that cluster's column
cluster_hash_10model = {}
for col in resps_10k.column_names():
    cluster_10k= np.array(resps_10k[col])
    # The printed "assignments" figure is the sum of the column's values.
    print "cluster ",cluster_id , "assignments: ", cluster_10k.sum()
    # nonzero() returns a tuple of index arrays (not a bare array): every row with a
    # non-zero value counts as a member. Consumers must index the stored value with [0].
    cluster_hash_10model[cluster_id] =cluster_10k.nonzero() 
    cluster_id+=1

cluster  0 assignments:  358.0
cluster  1 assignments:  301.0
cluster  2 assignments:  257.0
cluster  3 assignments:  127.0
cluster  4 assignments:  17.0
cluster  5 assignments:  155.0
cluster  6 assignments:  18.0
cluster  7 assignments:  25.0
cluster  8 assignments:  121.0
cluster  9 assignments:  11.0


In [14]:
# Number of articles in each cluster for the second model (20 clusters).
# The responsibilities are wrapped in an SFrame and the single column 'X1' is unpacked,
# so the loop below visits one column per unpacked position.
# presumably one column per cluster, in cluster order -- TODO confirm the layout of 'resp'
resps_20k= sf.SFrame(model_em_20k['resp'])
resps_20k= resps_20k.unpack('X1', '')
cluster_id=0
# cluster id -> result of nonzero() on that cluster's column
cluster_hash_20model = {}
for col in resps_20k.column_names():
    cluster_20k= np.array(resps_20k[col])
    # The printed "assignments" figure is the sum of the column's values.
    print "cluster ",cluster_id , "assignments: ", cluster_20k.sum()
    # nonzero() returns a tuple of index arrays (not a bare array): every row with a
    # non-zero value counts as a member. Consumers must index the stored value with [0].
    cluster_hash_20model[cluster_id] =cluster_20k.nonzero() 
    cluster_id+=1

cluster  0 assignments:  21.0
cluster  1 assignments:  165.0
cluster  2 assignments:  25.0
cluster  3 assignments:  45.0
cluster  4 assignments:  112.0
cluster  5 assignments:  90.0
cluster  6 assignments:  160.0
cluster  7 assignments:  97.0
cluster  8 assignments:  38.0
cluster  9 assignments:  51.0
cluster  10 assignments:  26.0
cluster  11 assignments:  89.0
cluster  12 assignments:  32.0
cluster  13 assignments:  134.0
cluster  14 assignments:  47.0
cluster  15 assignments:  153.0
cluster  16 assignments:  47.0
cluster  17 assignments:  21.0
cluster  18 assignments:  26.0
cluster  19 assignments:  11.0


In [15]:
# Article categories per cluster for model 1 (10 clusters).
# Build one array of member row indices per unpacked responsibility column,
# then print the per-category counts for each of them.
clusters_10k_idx=[]
for col in resps_10k.column_names():
    cluster_10k= np.array(resps_10k[col])
    # [0] takes the index array out of the tuple that nonzero() returns.
    cluster_10k= cluster_10k.nonzero()[0]
    clusters_10k_idx.append(cluster_10k)
clusters_report(clusters_10k_idx)

Cluster  0 

product count :  49 
engineering count :  6 
business count :  303 

Cluster  1 

product count :  32 
engineering count :  235 
business count :  34 

Cluster  2 

product count :  207 
engineering count :  0 
business count :  50 

Cluster  3 

product count :  88 
engineering count :  3 
business count :  36 

Cluster  4 

product count :  7 
engineering count :  0 
business count :  10 

Cluster  5 

product count :  45 
engineering count :  8 
business count :  102 

Cluster  6 

product count :  15 
engineering count :  2 
business count :  1 

Cluster  7 

product count :  1 
engineering count :  23 
business count :  1 

Cluster  8 

product count :  108 
engineering count :  0 
business count :  13 

Cluster  9 

product count :  0 
engineering count :  0 
business count :  11 



In [16]:
# Article categories per cluster for model 2 (20 clusters).
# Build one array of member row indices per unpacked responsibility column,
# then print the per-category counts for each of them.
clusters_20k_idx=[]
for col in resps_20k.column_names():
    cluster_20k= np.array(resps_20k[col])
    # [0] takes the index array out of the tuple that nonzero() returns.
    cluster_20k= cluster_20k.nonzero()[0]
    clusters_20k_idx.append(cluster_20k)
clusters_report(clusters_20k_idx)

Cluster  0 

product count :  3 
engineering count :  0 
business count :  18 

Cluster  1 

product count :  41 
engineering count :  11 
business count :  113 

Cluster  2 

product count :  1 
engineering count :  23 
business count :  1 

Cluster  3 

product count :  27 
engineering count :  0 
business count :  18 

Cluster  4 

product count :  2 
engineering count :  0 
business count :  110 

Cluster  5 

product count :  13 
engineering count :  3 
business count :  74 

Cluster  6 

product count :  11 
engineering count :  141 
business count :  8 

Cluster  7 

product count :  91 
engineering count :  0 
business count :  6 

Cluster  8 

product count :  38 
engineering count :  0 
business count :  0 

Cluster  9 

product count :  21 
engineering count :  0 
business count :  30 

Cluster  10 

product count :  1 
engineering count :  24 
business count :  1 

Cluster  11 

product count :  78 
engineering count :  1 
business count :  10 

Cluster  12 

product count 

# Section 4
## Recommendation and predictions for Articles

#### Recommendation method: 
Articles are recommended by retrieving the cluster that the article belongs to, fetching all the articles in that cluster, and passing them to a nearest-neighbour model to find the 10 best articles to recommend for this article.

#### Predicting method:
A set of articles is passed to the trained model, which predicts the cluster each article belongs to.


- The test dataset is used to predict a cluster for each article with the two different models.

In [17]:
def articles_inds(article_id , cluster_hash_model):
    """Find the cluster whose stored indices contain ``article_id``.

    Parameters
    ----------
    article_id : int
        Article identifier to look for.
    cluster_hash_model : dict
        Maps a cluster id to that cluster's stored member indices.

    Returns
    -------
    tuple or None
        ``(cluster_id, members)`` for the first cluster, in the dict's
        iteration order, that contains ``article_id``. ``members`` is the
        stored value converted with ``np.array``. When no cluster contains
        the article the function falls off the end and returns ``None``.
    """
    for cluster_id, stored_indices in cluster_hash_model.items():
        members = np.array(stored_indices)
        if article_id not in members:
            continue
        return cluster_id, members

In [18]:
def recommender(article_id ,cluster_hash_model, no_articles, data_articles):
    """Recommend the nearest articles from the same cluster as ``article_id``.

    The articles of the cluster are re-vectorized with a fresh TF-IDF model
    and a nearest-neighbour search is run among them.

    Parameters
    ----------
    article_id : int
        Value of the ``'X1'`` column identifying the query article.
    cluster_hash_model : dict
        Maps a cluster id to its stored member indices, as consumed by
        ``articles_inds``; the first element of the stored value is used as
        the list of ``'X1'`` values of the cluster.
    no_articles : int
        Number of neighbours requested. It is capped at the number of
        articles in the cluster, because a nearest-neighbour query cannot
        return more neighbours than there are fitted rows.
    data_articles : SFrame
        Article table with at least the ``'X1'`` and ``'text'`` columns.

    Returns
    -------
    list
        ``'X1'`` values of the neighbours, in the order returned by
        ``kneighbors`` (the query article is part of the searched set).

    Raises
    ------
    ValueError
        If ``article_id`` is not found in any cluster. Previously this
        surfaced as a tuple-unpacking ``TypeError`` on ``None``.
    """
    match = articles_inds(article_id, cluster_hash_model)
    if match is None:
        raise ValueError("article_id %r is not assigned to any cluster" % (article_id,))
    _, inds = match

    # add_row_number() adds an 'id' column; as before, its values are used as
    # row positions in the TF-IDF matrix built from the same frame.
    cluster_articles = data_articles.filter_by(inds[0], 'X1').add_row_number()

    tfidf_recommend = normalize(
        TfidfVectorizer(stop_words='english').fit_transform(cluster_articles['text']))

    # Read both key columns once instead of filtering the frame per neighbour.
    row_ids = list(cluster_articles['id'])
    article_keys = list(cluster_articles['X1'])
    key_of_row = dict(zip(row_ids, article_keys))
    row_id = row_ids[article_keys.index(article_id)]

    # A cluster smaller than `no_articles` would otherwise make kneighbors fail.
    n_neighbors = min(no_articles, tfidf_recommend.shape[0])
    NN_model = NearestNeighbors(n_neighbors=n_neighbors).fit(tfidf_recommend)
    distances, indices = NN_model.kneighbors(tfidf_recommend[row_id])

    return [key_of_row[int(i)] for i in indices[0]]

In [19]:
def predict_cluster(articles,em_model):
    """Assign each article to the cluster with the highest score.

    The score of cluster ``k`` for an article is
    ``log(weights[k]) + logpdf_diagonal_gaussian(article, means[k], covs[k])``.

    Parameters
    ----------
    articles : table-like
        Must provide a ``'text'`` column; it is vectorized with the
        notebook-level ``tfidfvec``.
    em_model : dict
        Trained model with the ``'means'``, ``'covs'`` and ``'weights'`` keys.

    Returns
    -------
    list of int
        For every article, the position of the best-scoring cluster
        (first position wins when several scores are equal).
    """
    doc_vectors = tfidfvec.transform(articles['text'])
    # Work on copies of the model parameters, as the original code did.
    cluster_means = deepcopy(em_model['means'])
    cluster_covs = deepcopy(em_model['covs'])

    assignments = []
    for row in range(doc_vectors.shape[0]):
        scores = [
            np.log(em_model['weights'][k])
            + logpdf_diagonal_gaussian(doc_vectors[row], cluster_means[k], cluster_covs[k])
            for k in range(len(em_model['weights']))
        ]
        best_score = np.max(scores)
        assignments.append(scores.index(best_score))
    return assignments

In [20]:
# Compute recommendations for every article using the 20-cluster model; a later cell
# appends them to the SFrame and exports it.
# The loop index is passed as the article id -- assumes the 'X1' values are exactly
# 0 .. len(dataset) - 1; TODO confirm.
# 11 neighbours are requested; presumably the article itself plus 10 recommendations
# (the sample output below lists each article first in its own list).
recommended_inds = []
start_time = time.time()
for i in range(len(dataset)):
    recommended_inds.append(recommender(i,cluster_hash_20model,11,dataset))

# Wall-clock time of the whole loop, in seconds.
print("--- %s seconds (Final time complexity): ---" % (time.time() - start_time))

--- 608.989954948 seconds (Final time complexity): ---


In [21]:
# Attach the recommendation lists to the article table as a new 'recommendations' column.
# NOTE(review): looks like this changes `dataset` itself, so re-running the cell may fail
# because the column already exists -- confirm against the sframe add_column docs.
rec_inds= sf.SArray(recommended_inds)
dataset.add_column(rec_inds,name='recommendations')

X1,category,text,tf_idf,recommendations
0,business,Policy for Growth and InnovationI get asked ...,"{'stock': 3.248074979560463, 'r ...","[0.0, 1.0, 217.0, 7.0, 609.0, 506.0, 210.0, ..."
1,business,Sam AltmanThe most important story of 2014 ...,"{'petroyuan': 7.237059026124737, 'a ...","[1.0, 0.0, 7.0, 506.0, 210.0, 130.0, 1067.0, ..."
2,business,Bubble talkI m tired of reading about investors ...,"{'talki': 7.237059026124737, ...","[2.0, 236.0, 461.0, 7.0, 0.0, 77.0, 441.0, 407.0, ..."
3,business,A new team at redditLast week Yishan Wong ...,"{'ohanian': 7.237059026124737, ...","[3.0, 1038.0, 222.0, 132.0, 550.0, 404.0, ..."
4,business,Why Ops Is Taking Over Startup LandA little ...,"{'operations': 26.401277084533294, ...","[4.0, 680.0, 1079.0, 735.0, 5.0, 138.0, 67 ..."
5,business,10 Data Acquisition Strategies for Startups ...,"{'exclusive': 3.28581530754331, ...","[5.0, 476.0, 580.0, 680.0, 1079.0, 138.0, ..."
6,business,One of the Greatest Entrepreneurial Stories ...,"{'all': 0.3912947246598535, ...","[6.0, 164.0, 984.0, 388.0, 916.0, 67.0, ..."
7,business,2017 YC Annual LetterDear YC Community In ...,"{'represent': 2.583098675967214, 'a ...","[7.0, 77.0, 407.0, 245.0, 613.0, 0.0, 30.0, 217.0, ..."
8,business,How to Build a Startup Ecosystem in Your ...,"{'all': 0.3912947246598535, ...","[8.0, 348.0, 53.0, 480.0, 380.0, 335.0, 510.0, ..."
9,business,Growth vs Profitability and Venture ReturnsThere ...,"{'all': 0.3912947246598535, ...","[9.0, 497.0, 470.0, 162.0, 514.0, 114.0, ..."


In [22]:
# Export the article table, including the new 'recommendations' column, as CSV.
dataset.save('Articles_with_recommendations.csv',format='csv')

### Test data for cluster assignment

In [23]:
# Load the held-out test articles; the prediction cells below read the 'text' column.
testset = sf.SFrame('KO_articles_test.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [24]:
# NOTE: `test_tfidf` is not read by any later visible cell; predict_cluster vectorizes
# the text itself with the same `tfidfvec`.
test_tfidf= tfidfvec.transform(testset['text'])
# Predict using the model with 10 clusters.
test_predictions= predict_cluster(testset,model_em_10k)
test_predictions= np.array(test_predictions)
# Bare last expression so the notebook displays the predicted cluster ids.
test_predictions

array([0, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 5, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 5,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 3, 2, 0, 0, 0, 2, 2, 2, 0, 5, 3,
       1, 2, 0, 2, 0])

In [25]:
# Predict using the model with 20 clusters.
# NOTE: this reuses the name `test_predictions`, so the 10-cluster predictions
# from the previous cell are overwritten.
test_predictions= predict_cluster(testset,model_em_20k)
test_predictions= np.array(test_predictions)
# Bare last expression so the notebook displays the predicted cluster ids.
test_predictions

array([ 4,  1,  1,  1,  5,  1,  4,  4,  6, 13,  1,  4,  5, 15,  4,  5, 15,
        1,  6, 13,  6,  6,  6,  6,  6, 13,  4,  6,  6,  6,  6,  1,  6,  7,
       16,  6,  6,  6,  6,  6,  6, 11,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6, 13,  4,  6,  6,  6,  6,  1,  6,  7, 16,  6,  6,  6,  6,  6,  6,
       11,  6,  6,  6,  6,  6,  5,  6,  4,  1, 15,  1, 13,  1, 13,  4,  1,
        4,  1,  1, 13,  1, 13, 11,  6,  8,  1,  1, 15])