# Tag sequences clustering
Hierarchically cluster concepts by the tag sequences and see what clusters form. Do they form a natural ontological classification?     

## 1. Getting concept graph and data from disk

In [1]:
filename = 'Fundamental Concepts in Heterogeneous Catalysis'

concepts = {}
import pickle
# Loading extracted concepts from file (see concept_extraction.ipynb)
#concepts = {'sents':sents,'rawtxt':rawtxt,'sent_to_npflat':sent_to_npflat,'sent_to_tags':sent_to_tags,'sent_to_ltags':sent_to_ltags,'np_to_sent':np_to_sent,'Conceptdata':Conceptdata}
with open(filename +'concepts.pickle', 'rb') as f:
    concepts = pickle.load(f)

# Loading idf dictionary (see Build_IDF_dictionary.ipynb)
with open('idf_dict.pickle','rb') as f1:
    idf_dict =pickle.load(f1)

sents = concepts['sents']
rawtxt = concepts['rawtxt']
sent_to_npflat = concepts['sent_to_npflat']
sent_to_tags= concepts['sent_to_tags']
sent_to_ltags = concepts['sent_to_ltags']
np_to_sent = concepts['np_to_sent']
Conceptdata = concepts['Conceptdata']
inv_eqn_dict = concepts['inv_eqn_dict']

with open(filename +'conceptmap.pickle', 'rb') as f:
    concepts = pickle.load(f)

## 2. Extracting list of concepts:

### 2.1. Importing libraries

In [2]:
import nltk
from nltk import word_tokenize
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
import re
import pickle
import math
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))
import itertools
from itertools import chain
import collections
import numpy as num
import pandas as pd
import csv
import statistics
from nltk.corpus import cmudict
cmud = cmudict.dict()
wnl = nltk.WordNetLemmatizer()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
with open('common_words_documents.pickle', 'rb') as f:
       common_words_documents = pickle.load(f)
f.close()
type(common_words_documents)

dict

In [3]:
def get_neighbour(wlist,wnp,wplace):
    npsplit = wnp.split()
    npsplit = [np.lower() for np in npsplit]
    wlist = [w.lower() for w in wlist]
    for tok in npsplit:
        if tok not in wlist:
            return 'not in list'
    
    if wplace>0:
        #print('in positive place')
        indnp = wlist.index(npsplit[-1])
        wind = indnp+wplace
        if wind<len(wlist):
            return wlist[wind]
        else:
            #print('exceeded length')
            return 'indexplus'+str(len(wlist)-wind)
    elif wplace<0:
        indnp = wlist.index(npsplit[0])
        wind = indnp+wplace
        if wind>=0:
            return wlist[wind]
        else:
            #print('dexceeded length')
            return 'indexminus'+str(abs(wind))
    else:
        return wnp

In [4]:
def get_prior_tag_sequence(np,s):
    # get other noun phrases in sentence
    othernp = [other.lower() for other in sent_to_npflat[s] if np!=other.lower()]
    othernplast = [other.split()[-1] for other in othernp]
    tokens = [tag[0].lower() for tag in sent_to_tags[s]]
    lemtags = sent_to_ltags[s]
    lemtokens = [ltags[0].lower() for ltags in lemtags]
    lempos = [ltags[1] for ltags in lemtags]
    flag = 0
    priorseq = []
    count = -1
    while flag==0: 
        neighbour = get_neighbour(lemtokens,np,count)
        if neighbour in othernplast:
            flag = 1
        elif 'indexminus' in neighbour:
            priorseq.append("START")
            flag=1
        else:
            priorseq.append(lempos[lemtokens.index(neighbour)])
            count = count-1
    priorseq.reverse()
    priorseqstring = 'PREV_'+'_'.join(priorseq)
    return priorseqstring

In [5]:
def get_next_tag_sequence(np,s):
    # get other noun phrases in sentence
    othernp = [other.lower() for other in sent_to_npflat[s] if np!=other.lower()]
    othernpfirst = [other.split()[0] for other in othernp]
    tokens = [tag[0].lower() for tag in sent_to_tags[s]]
    lemtags = sent_to_ltags[s]
    lemtokens = [ltags[0].lower() for ltags in lemtags]
    lempos = [ltags[1] for ltags in lemtags]
    flag = 0
    nextseq = []
    count = 1
    while flag==0: 
        neighbour = get_neighbour(lemtokens,np,count)
        if neighbour in othernpfirst:
            flag = 1
        elif 'indexplus' in neighbour:
            nextseq.append("END")
            flag=1
        else:
            nextseq.append(lempos[lemtokens.index(neighbour)])
            count = count+1
    nextseqstring = 'NEXT_'+'_'.join(nextseq)
    return nextseqstring
        

In [6]:
def add_axis(numarray):
    return numarray.reshape(len(numarray),1)

In [210]:
#allnp = list(np_to_sent.keys())
allnp = list(Conceptdata.loc[Conceptdata['Frequency']>10,'Concept'])
nplist = [np for np in list(allnp) for s in np_to_sent[np]]
oclist = [len(np_to_sent[np]) for np in list(allnp) for s in np_to_sent[np]]
slist = [s for np in allnp for s in np_to_sent[np]]
next_tag_seqlist = [get_next_tag_sequence(nplist[i],slist[i]) for i in range(len(slist))]
prev_tag_seqlist = [get_prior_tag_sequence(nplist[i],slist[i]) for i in range(len(slist))]

In [211]:
df = pd.DataFrame()
df['nplist']=nplist
df['oclist']=oclist
df['slist']=slist
df['prev_tag_seqlist']=prev_tag_seqlist
df['next_tag_seqlist']=next_tag_seqlist

In [212]:
df.to_csv('tag_cluster.csv')
df.head()

Unnamed: 0,nplist,oclist,slist,prev_tag_seqlist,next_tag_seqlist
0,process,38,0,PREV_CC,NEXT_VBG
1,process,38,11,PREV_WRB_DT,NEXT_VBP
2,process,38,33,"PREV_,_DT",NEXT_RB_VBZ_DT
3,process,38,294,PREV_START_DT,NEXT_VBG
4,process,38,319,PREV_IN,NEXT_VBG


In [213]:
dfsub= df[(~df['prev_tag_seqlist'].str.contains('NN')) & (~df['next_tag_seqlist'].str.contains('NN'))]
len(dfsub)

5097

In [215]:
df1 = dfsub.groupby('nplist')['prev_tag_seqlist'].value_counts(normalize=True).unstack().fillna(0)
df2 = dfsub.groupby('nplist')['next_tag_seqlist'].value_counts(normalize=True).unstack().fillna(0)
df3 = df1.merge(df2,how='left',left_index=True,right_index=True)
df4 = df3.merge(dfsub.loc[:,['nplist','oclist']].drop_duplicates().set_index('nplist'),how='left',left_index=True,right_index=True)

from sklearn.model_selection import train_test_split
df4train,df4test,y_train,y_test = train_test_split(df4,df4['oclist'],test_size=0.3)
print(df4train.shape,num.mean(y_train))
print(df4test.shape,num.mean(y_test))

(149, 2405) 31.221476510067113
(64, 2405) 24.625


In [193]:
# from scipy.cluster.hierarchy import dendrogram, linkage
# from matplotlib import pyplot as plt
# %matplotlib inline
# #num.set_printoptions(precision=5, suppress=True) # suppress scientific notation
# data = num.array(df3)
# Z = linkage(data,'ward')
# type(Z)
# #dendrogram(Z)

In [234]:
%%time 
from sklearn.cluster import AgglomerativeClustering
n_clust = 20
clustering = AgglomerativeClustering(n_clusters=n_clust).fit(num.array(df4train.iloc[:,0:-1]))

Wall time: 53.2 ms


In [235]:
labels = list(clustering.labels_)
trainlabels = pd.DataFrame()
trainlabels['nplist'] = list(df4train.iloc[:,0:-1].index)
trainlabels['labels'] = labels
testlabels = pd.DataFrame(list(df4test.iloc[:,0:-1].index))
testlabels['nplist'] = list(df4test.index)
testlabels['labels'] = clustering.fit_predict(num.array(df4test.iloc[:,0:-1]))

In [236]:
for i in range(n_clust):
    print('Label',i,'Training set:')
    print(trainlabels.loc[trainlabels['labels']==i,'nplist'])
    print('Test set:')
    print(testlabels.loc[testlabels['labels']==i,'nplist'])

Label 0 Training set:
22            f
57            k
76            c
84            v
99            a
101           i
107           r
125    instance
139           d
140           s
Name: nplist, dtype: object
Test set:
5     kinetic energy
11     reaction step
14           account
26           example
31              fact
57               way
Name: nplist, dtype: object
Label 1 Training set:
3        transition metal
7                    site
9             temperature
20                 oxygen
34                  water
51      chemical reaction
54           intermediate
55                     co
60          metal surface
65                     au
69                 specie
80             reactivity
81                    tst
90                     h2
100               bonding
102             free site
116                  htst
119             catalysis
121                  time
129    catalytic activity
142                    n2
144                    oh
Name: nplist, dtype: object
Test