## Clustering

Consider hierarchical methods. Since these should be able to extract concepts better.

In [1]:
print('------------------------------------------------------')
print('Step 4:  K Means ')
from datetime import datetime as dt
t_start = dt.now()
print('Starting at', t_start)
print('------------------------------------------------------')

------------------------------------------------------
Step 4:  K Means 
Starting at 2018-02-22 15:31:20.834306
------------------------------------------------------


In [2]:
# Import dependencies
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime as dt

In [3]:
# Configure
from config import Config as c
# set K parameter here:
n_clusters = c.n_clusters
# which cluster set to use?
# ['tsne', 'raw'] 
cluster_set = 'tsne'



# other global variables:
doi_datapath = c.dois_pkl
word_datapath = c.word_datapath
tfidf_datapath = c.tfidf_datapath
vectorizer_datapath = c.vectorizer_datapath
working_data =  c.working_data

# output paths
data_out = c.working_data
data_out_xl = c.kmeans_out_xl
data_out_csv = c.kmeans_out_csv

# load vector data
tfidf = pickle.load(open(tfidf_datapath,'rb'))
vectorizer = pickle.load(open(vectorizer_datapath,'rb'))

# global variable for ordering
dois = pickle.load(open(doi_datapath,'rb'))
data = pd.read_csv(working_data,index_col = 0)
cites = list(data.Citations)

In [4]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'can', 'from', 'un', 'she', 'must', 'become', 'not', 'yours', 'somehow', 'take', 'well', 'yourself', 'back', 'them', 'bottom', 'besides', 'thin', 'it', 'give', 'am', 'hereupon', 'less', 'use', 'cant', 'therefore', 'than', 'eight', 'please', 'ltd', 'in', 'nobody', 'ever', 'made', 'mill', ...e', 'full', 'both', 'nor', 'former', 'else', 'six', 'though', 'none', 'anything', 'myself', 'those'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

#### Clustering AFTER dimensional reduction
1. get which articles are in which cluster
2. sum the tfidf for the articles in each group
3. argsort the sum for each cluster
4. use the argsort to pick keywords from vectorizer.get_feature_names()

In [5]:
# cluster t-sne data
tsne_data = np.matrix(data[['TSNE1','TSNE2']])
tsne_kmeans = KMeans(n_clusters=n_clusters, ).fit(tsne_data) 
tsne_centers = tsne_kmeans.cluster_centers_ # centers of each cluster
tsne_klabels = tsne_kmeans.labels_ # cluster numbers for each paper

In [6]:
tsne_klabels, np.shape(tfidf)

(array([19, 20,  8, ..., 22, 11, 19]), (18355, 5000))

Work out the most significant words in each cluster and add them as cluster keywords so that we know what each cluster is about.

In [7]:
np.shape(tsne_data),np.shape(tsne_centers), tsne_kmeans

((18355, 2),
 (32, 2),
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=32, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=None, tol=0.0001, verbose=0))

In [8]:
dct = {}
mat = tfidf.todense() # this step kills it if dataset too large
i = 0
for label in tsne_klabels: # labels are cluster numbers
    if label in dct:
        # add together all rows with the same cluster number
        dct[label] = dct[label] + mat[i,:]
    else:
        # initialise if there's nothing to add to
        dct[label] = mat[i,:]
    i+=1
    
dct

{0: matrix([[ 0.        , 10.76698098, 10.76698098, ...,  0.36999283,
           0.        ,  0.        ]]),
 1: matrix([[0.        , 6.02377684, 6.02377684, ..., 0.        , 0.        ,
          0.        ]]),
 2: matrix([[ 0.27143376, 11.2702177 , 11.2369532 , ...,  0.48892425,
           0.        ,  0.        ]]),
 3: matrix([[ 0.13694364, 10.73677654, 10.73677654, ...,  0.        ,
           0.        ,  0.        ]]),
 4: matrix([[0.        , 7.43247377, 7.38395958, ..., 0.12079401, 0.        ,
          0.        ]]),
 5: matrix([[ 0.13791127, 20.01039614, 19.99343808, ...,  0.12816996,
           0.        ,  0.        ]]),
 6: matrix([[0.        , 9.24366064, 9.22675938, ..., 0.        , 0.        ,
          0.        ]]),
 7: matrix([[0.        , 8.6865698 , 8.65316755, ..., 0.        , 0.        ,
          0.        ]]),
 8: matrix([[ 0.        , 10.88704835, 10.87118197, ...,  0.11991409,
           0.        ,  0.        ]]),
 9: matrix([[0.        , 5.11034699, 5.0782

In [38]:
vectorizer.get_feature_names()

['aaa',
 'abdomin',
 'abdomin aortic',
 'aberr',
 'abil',
 'ablat',
 'ablat atrial',
 'ablat background',
 'ablat cathet',
 'ablat patient',
 'ablat procedur',
 'ablat treatment',
 'abnorm',
 'absenc',
 'absolut',
 'absorb',
 'ac',
 'academ',
 'acc',
 'acc surgic',
 'acceler',
 'access',
 'access site',
 'accessori',
 'accord',
 'accumul',
 'accur',
 'accuraci',
 'ace',
 'ace gene',
 'achiev',
 'acid',
 'acidosi',
 'acquir',
 'act',
 'action',
 'action background',
 'action potenti',
 'activ',
 'activ associ',
 'activ background',
 'activ human',
 'activ patient',
 'activ protein',
 'activ receptor',
 'acut',
 'acut aortic',
 'acut cardiovascular',
 'acut chest',
 'acut chronic',
 'acut coronari',
 'acut decompens',
 'acut effect',
 'acut elbow',
 'acut elev',
 'acut heart',
 'acut ischaem',
 'acut ischem',
 'acut kidney',
 'acut limb',
 'acut myocardi',
 'acut pulmonari',
 'acut respiratori',
 'acut segment',
 'acut stroke',
 'acut type',
 'ad',
 'adamt',
 'adapt',
 'add',
 'addit',
 

In [9]:
tsne_terms = vectorizer.get_feature_names() # gets the words from the vocabulary
tsne_terms_df =[]
for i in range(n_clusters):
    ls_ = []
    # get the indexes of the top 10 terms in each cluster
    indexes = list(np.array(dct[i].argsort()[:, ::-1][0])[0])[:10] # this conversion is not very neat.
#     print(indexes)
    for ind in indexes:
        ls_.append(tsne_terms[int(ind)]) 
    tsne_terms_df.append(ls_[:3])
    
# show first 3 cluster labels
tsne_terms_df[:3]

[['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas']]

In [10]:
# quick fix
klabels = tsne_klabels

In [11]:
dois_ls = []
for i in range(n_clusters):
    # indices of papers that fall within ith cluster
    indices = [j for j, x in enumerate(klabels) if x ==i]
    # print(indices)
    doi_ls = [dois[j][:7] + '/' + dois[j][8:] 
             for j in indices]
    dois_ls.append(doi_ls)

# len(links_ls)

### Choose which clusters to add to your dataframe
This is not the neatest solution to this.  Do something better.

This part is a placeholder for later.  You potentially get better results by using higher dimensionality data, so KMeans on that data might be a worthwhile option.  Note that it is very slow and makes the final visualisation look scruffy.

In [12]:
if cluster_set == 'tsne':
    terms_df = tsne_terms_df
    centers =  tsne_centers
    klabels = tsne_klabels
    kmeans = tsne_kmeans

## Find the best cited clusters

In [13]:
## try dropping the top x percentile of papers 

nz_cites_ls = []
mean_cites_ls =[]
cites_ls_ls = []
mean_outlier_ls = []
for j in range(n_clusters):
    # get the index for each paper in jth cluster
    cluster_indices = [i for i, x in enumerate(klabels) if x ==j]
    # get the citations for those indices
    cluster_cites = [cites[i] 
                    if np.isnan(cites[i])==False 
                    else 0
                    for i in cluster_indices] # switching NaNs to zeros in order to ensure following calculations work
    
    # count non-zero cites for each cluster
    nz_cites = float(np.count_nonzero(np.array(cluster_cites)))#/len(cluster_cites))
    nz_cites_ls.append(nz_cites) # list of nz cite counts for all clusters 
    
    # exclude outliers
    # ptl80 = np.percentile(cluster_cites, 80)
#     n_outliers = 2
#     cluster_exc_max = [cites[i]
#                     if np.isnan(cites[i])==False 
#                     else 0
#                     for i in cluster_indices] # switching NaNs to zeros in order to ensure following calculations work
#     for i in range(n_outliers):
#         max_cite = max(cluster_exc_max)
#         cluster_exc_max.remove(max_cite) 
#     mean_outlier = np.mean(cluster_exc_max)
#     mean_outlier_ls.append(mean_outlier)
                     
    # work out mean citations of each cluster
    mean_cites = np.mean(np.array(cluster_cites))
    mean_cites_ls.append(mean_cites)
    
    # add the list of citations for each item in the cluster
    cites_ls_ls.append(cluster_cites)


Create a dataframe at a cluster level

In [14]:
df = pd.DataFrame({
    'Cluster': terms_df,
    'nz_cites': nz_cites_ls,
    'mean_cites':mean_cites_ls,
    'cites':cites_ls_ls,
#     'mean_outlier':mean_outlier_ls,  
    'dois_ls':dois_ls
})

In [15]:
df['len_cites'] = [len(i) for i in df['cites']]

In [16]:
# df.sort_values('mean_outlier', ascending=False)

In [17]:
df['Cluster_no'] = df.index

In [18]:
df['nz_pc'] = df['nz_cites']/df['len_cites']

In [19]:
r = df['nz_cites'].hist(alpha=0.3)
r

<matplotlib.axes._subplots.AxesSubplot at 0x16487a48c88>

In [20]:
# %matplotlib inline
s = df['nz_pc'].hist(alpha=0.3)
s

<matplotlib.axes._subplots.AxesSubplot at 0x16487a48c88>

In [21]:

# p = df['mean_outlier'].hist(alpha=0.3)
# q = df['mean_cites'].hist(alpha=0.3)
# p,q

In [22]:
dois[:3]

['10.1016/j.ahj.2017.04.004',
 '10.1016/j.ahj.2017.08.014',
 '10.1016/j.ahj.2017.08.004']

In [23]:
doi_cluster_dict = {}
doi_clusterno_dict = {}
for index,row in df.iterrows():
    doi_ls = row.dois_ls
    for doi in doi_ls:
        doi_cluster_dict[doi] = row.Cluster
        doi_clusterno_dict[doi] = row.Cluster_no

cluster_ls = [str(
    doi_cluster_dict[
        doi[:7]+'/'+doi[8:]
    ]) for doi in dois]

clusterno_ls = [doi_clusterno_dict[doi[:7]+'/'+doi[8:]] for doi in dois]

len(clusterno_ls),len(cluster_ls), len(dois)

(18355, 18355, 18355)

In [24]:
data

Unnamed: 0,DI,PY,TI,AB,WD,AU,EM,AF,SO,SC,...,highly_cited_1,highly_cited_10,highly_cited_5,recent_citations,relative_citation_ratio,times_cited,Citations,Cluster,Cluster_no,Article_kws
0,10.1016/j.ahj.2017.04.004,2017,Late outcome of percutaneous mitral commissuro...,Background Late prognosis after successful per...,Late outcome of percutaneous mitral commissuro...,"Lee, S; Kang, DH; Kim, DH; Song, JM; Song, JK;...",dhkang@amc.seoul.kr; sjpark@amc.seoul.kr,"Lee, Sahmin; Kang, Duk-Hyun; Kim, Dae-Hee; Son...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",15,"['background', 'arteri', 'diseas']"
1,10.1016/j.ahj.2017.08.014,2017,A unique linkage of administrative and clinica...,"Background Large clinical, research, and admin...",A unique linkage of administrative and clinica...,"Godown, J; Thurm, C; Dodd, DA; Soslow, JH; Fei...",justin.godown@vanderbilt.edu,"Godown, Justin; Thurm, Cary; Dodd, Debra A.; S...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,1.0,,1.0,1.0,"['background', 'arteri', 'diseas']",5,"['background', 'arteri', 'diseas']"
2,10.1016/j.ahj.2017.08.004,2017,Contemporary risk model for inhospital major b...,Background Major bleeding is a frequent compli...,Contemporary risk model for inhospital major b...,"Desai, NR; Kennedy, KF; Cohen, DJ; Connolly, T...",robert.mcnamara@yale.edu,"Desai, Nihar R.; Kennedy, Kevin F.; Cohen, Dav...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",30,"['background', 'arteri', 'diseas']"
3,10.1016/j.ahj.2017.08.013,2017,Contemporary rates and correlates of statin us...,Background Statin therapy ishighly efficacious...,Contemporary rates and correlates of statin us...,"Go, AS; Fan, DJ; Sung, SH; Inveiss, AI; Romo-L...",Alan.S.Go@kp.org,"Go, Alan S.; Fan, Dongjie; Sung, Sue Hee; Inve...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",12,"['background', 'arteri', 'diseas']"
4,10.1016/j.ahj.2017.08.006,2017,Durability of quality of life benefits of tran...,Background For patients with severe aortic ste...,Durability of quality of life benefits of tran...,"Baron, SJ; Arnold, SV; Reynolds, MR; Wang, KJ;...",dcohen@saint-lukes.org,"Baron, Suzanne J.; Arnold, Suzanne V.; Reynold...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",7,"['background', 'arteri', 'diseas']"
5,10.1016/j.ahj.2017.08.016,2017,Atrial fibrillation decision support tool: Pop...,Background Appropriate thromboprophylaxis for ...,Atrial fibrillation decision support tool: Pop...,"Eckman, MH; Costea, A; Attari, M; Munjal, J; W...",mark.eckman@uc.edu,"Eckman, Mark H.; Costea, Alexandru; Attari, Me...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,1.0,,1.0,1.0,"['background', 'arteri', 'diseas']",11,"['background', 'arteri', 'diseas']"
6,10.1016/j.ahj.2017.08.009,2017,Outcomes in elderly and young patients with ST...,Background Since older age is a strong predict...,Outcomes in elderly and young patients with ST...,"Qaderdan, K; Vos, GJA; McAndrew, T; Steg, PG; ...",jurtenberg@gmail.com,"Qaderdan, Khalid; Vos, Gerrit-Jan A.; McAndrew...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",0,"['background', 'arteri', 'diseas']"
7,10.1016/j.ahj.2017.08.015,2017,"""Bringing on the light"" in a complex clinical ...",Background Cancer patients with recently place...,"""Bringing on the light"" in a complex clinical ...","Iliescu, CA; Cilingiroglu, M; Giza, DE; Rosale...",ciliescu@mdanderson.org,"Iliescu, Cezar A.; Cilingiroglu, Mehmet; Giza,...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",26,"['background', 'arteri', 'diseas']"
8,10.1016/j.ahj.2017.08.019,2017,Outcomes of cardiac pacing in adult patients a...,Background Cardiac pacing can be challenging a...,Outcomes of cardiac pacing in adult patients a...,"Egbe, AC; Huntley, GD; Connolly, HM; Ammash, N...",egbe.alexander@mayo.edu,"Egbe, Alexander C.; Huntley, Geoffery D.; Conn...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",11,"['background', 'arteri', 'diseas']"
9,10.1016/j.ahj.2017.08.017,2017,Edoxaban for the management of elderly Japanes...,Edoxaban-a non-vitamin K antagonist oral antic...,Edoxaban for the management of elderly Japanes...,"Okumura, K; Lip, GYH; Akao, M; Tanizawa, K; Fu...",okumura@hirosaki-u.ac.jp,"Okumura, Ken; Lip, Gregory Y. H.; Akao, Masaha...",AMERICAN HEART JOURNAL,Cardiovascular System & Cardiology,...,False,False,False,0.0,,0.0,0.0,"['background', 'arteri', 'diseas']",25,"['background', 'arteri', 'diseas']"


In [25]:
article_kws_ls= []

for row in mat:
    indexes = list(np.array(row.argsort()[:, ::-1][0])[0])[:10]
    article_kws_ls.append([tsne_terms[ind] for ind in indexes][:3])

In [26]:
article_kws_ls

[['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'orbit'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'wave'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'main'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['background', 'arteri', 'diseas'],
 ['bac

In [27]:
data['Cluster'] = cluster_ls
data['Cluster_no'] = clusterno_ls
data['Article_kws'] = article_kws_ls

In [28]:
# data.head()

### Write data to file

In [29]:
data.to_csv(data_out)
df.to_excel(data_out_xl)
df.to_csv(data_out_csv)

In [30]:
s = str(dt.now()-t_start)
print('Done in '+s)

Done in 0:00:54.030703
